D35248.id106145.diff

Index: usr.sbin/makefs/Makefile
===================================================================
--- usr.sbin/makefs/Makefile
+++ usr.sbin/makefs/Makefile
@@ -13,7 +13,8 @@
makefs.c \
msdos.c \
mtree.c \
- walk.c
+ walk.c \
+ zfs.c
MAN= makefs.8
NO_WCAST_ALIGN=
@@ -22,6 +23,7 @@
.include "${SRCDIR}/cd9660/Makefile.inc"
.include "${SRCDIR}/ffs/Makefile.inc"
.include "${SRCDIR}/msdos/Makefile.inc"
+.include "${SRCDIR}/zfs/Makefile.inc"
CFLAGS+=-DHAVE_STRUCT_STAT_ST_FLAGS=1
@@ -36,6 +38,9 @@
CFLAGS+= -I${SRCTOP}/lib/libnetbsd
LIBADD= netbsd util sbuf
+CFLAGS.zfs.c+= -I${SRCDIR}/zfs \
+ -I${SRCTOP}/sys/cddl/boot/zfs
+
HAS_TESTS=
SUBDIR.${MK_TESTS}+= tests
Index: usr.sbin/makefs/makefs.h
===================================================================
--- usr.sbin/makefs/makefs.h
+++ usr.sbin/makefs/makefs.h
@@ -78,12 +78,14 @@
FI_SIZED = 1<<0, /* inode sized */
FI_ALLOCATED = 1<<1, /* fsinode->ino allocated */
FI_WRITTEN = 1<<2, /* inode written */
+ FI_ROOT = 1<<3, /* root of a ZFS dataset */
};
typedef struct {
uint32_t ino; /* inode number used on target fs */
uint32_t nlink; /* number of links to this entry */
enum fi_flags flags; /* flags used by fs specific code */
+ void *param; /* for use by individual fs impls */
struct stat st; /* stat entry */
} fsinode;
@@ -186,6 +188,7 @@
DECLARE_FUN(cd9660);
DECLARE_FUN(ffs);
DECLARE_FUN(msdos);
+DECLARE_FUN(zfs);
extern u_int debug;
extern int dupsok;
Index: usr.sbin/makefs/makefs.8
===================================================================
--- usr.sbin/makefs/makefs.8
+++ usr.sbin/makefs/makefs.8
@@ -35,7 +35,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd September 17, 2020
+.Dd May 18, 2022
.Dt MAKEFS 8
.Os
.Sh NAME
@@ -266,6 +266,8 @@
ISO 9660 file system.
.It Sy msdos
FAT12, FAT16, or FAT32 file system.
+.It Sy zfs
+ZFS pool containing one or more file systems.
.El
.It Fl x
Exclude file system nodes not explicitly listed in the specfile.
@@ -494,10 +496,87 @@
.It Cm volume_label
Volume Label.
.El
+.Ss zfs-specific options
+The image created by
+.Nm
+contains a ZFS pool with a single vdev of type
+.Ql disk .
+The root dataset is always created implicitly and contains the entire input
+directory tree unless additional datasets are specified using the options
+described below.
+.Pp
+The arguments consist of a keyword, an equal sign
+.Pq Ql = ,
+and a value.
+The following keywords are supported:
+.Pp
+.Bl -tag -width omit-trailing-period -offset indent -compact
+.It Cm ashift
+The base-2 logarithm of the minimum block size.
+Typical values are 9 (512B blocks) and 12 (4KB blocks).
+The default value is 12.
+.It Cm bootfs
+The name of the bootable dataset for the pool.
+Specifying this option causes the
+.Ql bootfs
+property to be set in the created pool.
+.It Cm poolname
+The name of the ZFS pool.
+This option must be specified.
+.It Cm rootpath
+An implicit path prefix added to dataset mountpoints.
+By default it is
+.Pa /<poolname> .
+When creating bootable pools, the
+.Va rootpath
+should be set to
+.Pa / .
+At least one dataset must have a mountpoint equal to
+.Va rootpath .
+.It Cm fs
+Create an additional dataset.
+This option may be specified multiple times.
+The argument value must be of the form
+.Ar <dataset>[:<prop1=v1>[:<prop2=v2>[:...]]] ,
+where
+.Ar dataset
+is the name of the dataset and must belong to the pool's namespace.
+For example, with a pool name of
+.Ql test
+all dataset names must be prefixed by
+.Ql test/ .
+A dataset must exist at each level of the pool's namespace.
+For example, to create
+.Ql test/foo/bar ,
+.Ql test/foo
+must be created as well.
+.Pp
+The dataset mountpoints determine how the datasets are populated with
+files from the staged directory tree.
+Conceptually, all datasets are mounted before any are populated with files.
+The root of the staged directory tree is mapped to
+.Va rootpath .
+.Pp
+Dataset properties, as described in
+.Xr zfsprops 8 ,
+may be specified following the dataset name.
+The following properties may be set for a dataset:
+.Pp
+.Bl -tag -compact -offset indent
+.It Cm atime
+.It Cm canmount
+.It Cm exec
+.It Cm mountpoint
+.It Cm setuid
+.El
+.El
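+.Pp
+For example, assuming a staged directory tree rooted at
+.Pa /tmp/root ,
+the following command creates an image containing a pool named
+.Ql test ,
+with the root dataset mounted at
+.Pa /
+and a child dataset mounted at
+.Pa /usr :
+.Bd -literal -offset indent
+makefs -t zfs -s 4g -o poolname=test -o rootpath=/ \e
+    -o fs=test:mountpoint=/ -o fs=test/usr \e
+    zfs.img /tmp/root
+.Ed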
.Sh SEE ALSO
.Xr mtree 5 ,
.Xr mtree 8 ,
-.Xr newfs 8
+.Xr newfs 8 ,
+.Xr zfsconcepts 8 ,
+.Xr zfsprops 8 ,
+.Xr zpoolprops 8
.Sh HISTORY
The
.Nm
Index: usr.sbin/makefs/makefs.c
===================================================================
--- usr.sbin/makefs/makefs.c
+++ usr.sbin/makefs/makefs.c
@@ -77,6 +77,7 @@
ENTRY(cd9660),
ENTRY(ffs),
ENTRY(msdos),
+ ENTRY(zfs),
{ .type = NULL },
};
@@ -266,7 +267,7 @@
break;
case 'Z':
- /* Superscedes 'p' for compatibility with NetBSD makefs(8) */
+ /* Supersedes 'p' for compatibility with NetBSD makefs(8) */
fsoptions.sparse = 1;
break;
Index: usr.sbin/makefs/tests/Makefile
===================================================================
--- usr.sbin/makefs/tests/Makefile
+++ usr.sbin/makefs/tests/Makefile
@@ -2,6 +2,7 @@
ATF_TESTS_SH+= makefs_cd9660_tests
ATF_TESTS_SH+= makefs_ffs_tests
+ATF_TESTS_SH+= makefs_zfs_tests
BINDIR= ${TESTSDIR}
@@ -12,7 +13,7 @@
TEST_METADATA.makefs_cd9660_tests+= required_files="/sbin/mount_cd9660"
.for t in ${ATF_TESTS_SH}
-TEST_METADATA.$t+= required_user="root"
+#TEST_METADATA.$t+= required_user="root"
.endfor
.include <bsd.test.mk>
Index: usr.sbin/makefs/tests/makefs_zfs_tests.sh
===================================================================
--- /dev/null
+++ usr.sbin/makefs/tests/makefs_zfs_tests.sh
@@ -0,0 +1,521 @@
+#-
+# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+#
+# Copyright (c) 2022 The FreeBSD Foundation
+#
+# This software was developed by Mark Johnston under sponsorship from
+# the FreeBSD Foundation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+MAKEFS="makefs -t zfs"
+ZFS_POOL_NAME="makefstest$(jot -r 1 100000)"
+TEST_ZFS_POOL_NAME="$TMPDIR/poolname"
+
+. "$(dirname "$0")/makefs_tests_common.sh"
+
+common_cleanup()
+{
+ local pool md
+
+ # Try to force a TXG; this can help catch bugs by triggering a panic.
+ sync
+
+ pool=$(cat $TEST_ZFS_POOL_NAME)
+ if zpool list "$pool" >/dev/null; then
+ zpool destroy "$pool"
+ fi
+
+ md=$(cat $TEST_MD_DEVICE_FILE)
+ if [ -c /dev/"$md" ]; then
+ mdconfig -d -u "$md"
+ fi
+}
+
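+# Attach the image to an md(4) device and import the pool, with dataset
+# mountpoints re-rooted under $TEST_MOUNT_DIR.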
+import_image()
+{
+ atf_check -e empty -o save:$TEST_MD_DEVICE_FILE -s exit:0 \
+ mdconfig -a -f $TEST_IMAGE
+ atf_check -e empty -o empty -s exit:0 \
+ zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME
+ echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME
+}
+
+#
+# Test with some default layout defined by the common code.
+#
+atf_test_case basic cleanup
+basic_body()
+{
+ create_test_inputs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+basic_cleanup()
+{
+ common_cleanup
+}
+
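+#
+# Verify that a dataset created by makefs can be destroyed after import.
+#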
+atf_test_case dataset_removal cleanup
+dataset_removal_body()
+{
+ create_test_dirs
+
+ cd $TEST_INPUTS_DIR
+ mkdir dir
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}/dir \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ atf_check -o empty -e empty -s exit:0 zfs destroy ${ZFS_POOL_NAME}/dir
+}
+dataset_removal_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Make sure that we can create and remove an empty directory.
+#
+atf_test_case empty_dir cleanup
+empty_dir_body()
+{
+ create_test_dirs
+
+ cd $TEST_INPUTS_DIR
+ mkdir dir
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ atf_check -s exit:0 rmdir ${TEST_MOUNT_DIR}/dir
+}
+empty_dir_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case empty_fs cleanup
+empty_fs_body()
+{
+ create_test_dirs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+empty_fs_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case file_sizes cleanup
+file_sizes_body()
+{
+ local i
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
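+ # Create files whose sizes straddle power-of-2 boundaries, up to 1MB,
+ # to exercise a variety of block and indirect block configurations.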
+ i=1
+ while [ $i -lt $((1 << 20)) ]; do
+ truncate -s $i ${i}.1
+ truncate -s $(($i - 1)) ${i}.2
+ truncate -s $(($i + 1)) ${i}.3
+ i=$(($i << 1))
+ done
+
+ cd -
+
+ # XXXMJ this creates sparse files, make sure makefs doesn't
+ # preserve the sparseness.
+ # XXXMJ need to test with larger files (at least 128MB for L2 indirs)
+ # XXXMJ try with different ashifts
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+file_sizes_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case hard_links cleanup
+hard_links_body()
+{
+ local f
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir
+ echo "hello" > 1
+ ln 1 2
+ ln 1 dir/1
+
+ echo "goodbye" > dir/a
+ ln dir/a dir/b
+ ln dir/a a
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
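+ # All links to the same file must agree on the inode number, link
+ # count and file contents.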
+ stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino
+ stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink
+ for f in 1 2 dir/1; do
+ atf_check -o file:./nlink -e empty -s exit:0 \
+ stat -f '%l' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o file:./ino -e empty -s exit:0 \
+ stat -f '%i' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o empty -e empty -s exit:0 \
+ cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${f}
+ done
+
+ stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino
+ stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink
+ for f in dir/a dir/b a; do
+ atf_check -o file:./nlink -e empty -s exit:0 \
+ stat -f '%l' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o file:./ino -e empty -s exit:0 \
+ stat -f '%i' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o empty -e empty -s exit:0 \
+ cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${f}
+ done
+}
+hard_links_cleanup()
+{
+ common_cleanup
+}
+
+# Allocate enough dnodes from an object set that the meta dnode needs to use
+# indirect blocks.
+atf_test_case indirect_dnode_array cleanup
+indirect_dnode_array_body()
+{
+ local i
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+ # 512 bytes per dnode, 3*128KB of direct blocks => limit of 768 files.
+ # XXXMJ actual threshold is much lower
+ for i in $(seq 1 1000); do
+ touch $i
+ done
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+indirect_dnode_array_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Create some files with long names, so as to test fat ZAP handling.
+#
+atf_test_case long_file_name cleanup
+long_file_name_body()
+{
+ local dir i
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ # micro ZAP keys can be at most 50 bytes.
+ for i in $(seq 1 60); do
+ touch $(jot -s '' $i 1 1)
+ done
+ dir=$(jot -s '' 61 1 1)
+ mkdir $dir
+ for i in $(seq 1 60); do
+ touch ${dir}/$(jot -s '' $i 1 1)
+ done
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ # Add a directory entry in the hope that OpenZFS might catch a bug
+ # in makefs' fat ZAP encoding.
+ touch ${TEST_MOUNT_DIR}/foo
+}
+long_file_name_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Exercise handling of multiple datasets.
+#
+atf_test_case multi_dataset_1 cleanup
+multi_dataset_1_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir1
+ echo a > dir1/a
+ mkdir dir2
+ echo b > dir2/b
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ # Make sure that we have three datasets with the expected mount points.
+ atf_check -o inline:${ZFS_POOL_NAME}\\n -e empty -s exit:0 \
+ zfs list -H -o name ${ZFS_POOL_NAME}
+ atf_check -o inline:${TEST_MOUNT_DIR}\\n -e empty -s exit:0 \
+ zfs list -H -o mountpoint ${ZFS_POOL_NAME}
+
+ atf_check -o inline:${ZFS_POOL_NAME}/dir1\\n -e empty -s exit:0 \
+ zfs list -H -o name ${ZFS_POOL_NAME}/dir1
+ atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
+ zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
+
+ atf_check -o inline:${ZFS_POOL_NAME}/dir2\\n -e empty -s exit:0 \
+ zfs list -H -o name ${ZFS_POOL_NAME}/dir2
+ atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \
+ zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2
+}
+multi_dataset_1_cleanup()
+{
+ common_cleanup
+}
+
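+#
+# Create a pool in which the root dataset and a child dataset have swapped
+# their default mountpoints.
+#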
+atf_test_case multi_dataset_2 cleanup
+multi_dataset_2_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir1
+ echo a > dir1/a
+ mkdir dir2
+ echo b > dir2/b
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}/dir1:mountpoint=/ \
+ -o fs=${ZFS_POOL_NAME}:mountpoint=/dir1 \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+multi_dataset_2_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Rudimentary test to verify that two ZFS images created using the same
+# parameters and input hierarchy are byte-identical. In particular, makefs(1)
+# does not preserve file access times.
+#
+atf_test_case reproducible cleanup
+reproducible_body()
+{
+ create_test_inputs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ ${TEST_IMAGE}.1 $TEST_INPUTS_DIR
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ ${TEST_IMAGE}.2 $TEST_INPUTS_DIR
+
+ # XXX-MJ cmp(1) is really slow
+ atf_check -o empty -e empty -s exit:0 \
+ cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2
+}
+reproducible_cleanup()
+{
+}
+
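+#
+# Verify that a snapshot can be taken of a makefs-created dataset once the
+# pool is imported.
+#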
+atf_test_case snapshot cleanup
+snapshot_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir
+ echo "hello" > dir/hello
+ echo "goodbye" > goodbye
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ atf_check -o empty -e empty -s exit:0 zfs snapshot ${ZFS_POOL_NAME}@1
+}
+snapshot_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case soft_links cleanup
+soft_links_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir
+ ln -s a a
+ ln -s dir/../a a
+ ln -s dir/b b
+ echo 'c' > dir
+ ln -s dir/c c
+ # XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+soft_links_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Verify that we can set properties on the root dataset.
+#
+atf_test_case root_props cleanup
+root_props_body()
+{
+ create_test_inputs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}:atime=off:setuid=off \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ atf_check -o inline:off\\n -e empty -s exit:0 \
+ zfs get -H -o value atime $ZFS_POOL_NAME
+ atf_check -o inline:local\\n -e empty -s exit:0 \
+ zfs get -H -o source atime $ZFS_POOL_NAME
+ atf_check -o inline:off\\n -e empty -s exit:0 \
+ zfs get -H -o value setuid $ZFS_POOL_NAME
+ atf_check -o inline:local\\n -e empty -s exit:0 \
+ zfs get -H -o source setuid $ZFS_POOL_NAME
+}
+root_props_cleanup()
+{
+ common_cleanup
+}
+
+atf_init_test_cases()
+{
+ atf_add_test_case basic
+ atf_add_test_case dataset_removal
+ atf_add_test_case empty_dir
+ atf_add_test_case empty_fs
+ atf_add_test_case file_sizes
+ atf_add_test_case hard_links
+ atf_add_test_case indirect_dnode_array
+ atf_add_test_case long_file_name
+ atf_add_test_case multi_dataset_1
+ atf_add_test_case multi_dataset_2
+ # XXX-MJ one to check handling of non-existent mountpoints
+ # one to check mountpoint "none"
+ atf_add_test_case reproducible
+ atf_add_test_case snapshot
+ atf_add_test_case soft_links
+ atf_add_test_case root_props
+
+ # XXXMJ tests:
+ # - test with different ashifts (at least, 9 and 12), different image sizes
+ # - create datasets in imported pool
+ # - bootenvs
+}
Index: usr.sbin/makefs/zfs.c
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs.c
@@ -0,0 +1,3322 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/dirent.h>
+#include <sys/endian.h>
+#include <sys/errno.h>
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <bitstring.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <util.h>
+
+#include "makefs.h"
+#include "zfs/nvlist.h"
+#include "zfs/zfsimpl.h"
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-function"
+#include "fletcher.c"
+#include "sha256.c"
+#pragma clang diagnostic pop
+
+/*
+ * XXX-MJ
+ * - documentation
+ * - split into multiple files?
+ * - review checksum algorithm selection (most should likely be "inherit"?)
+ * - review vdev_space_alloc()
+ * - review type usage (off_t vs. size_t vs. uint64_t)
+ * - inconsistency in variable/field naming (how to name a dnode vs dnode id)
+ * - bootfs property, bootenvs
+ * - ZFS_SHARES_DIR
+ */
+
+#define MAXBLOCKSHIFT 17 /* 128KB */
+#define MAXBLOCKSIZE ((off_t)(1 << MAXBLOCKSHIFT))
+_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");
+#define MINBLOCKSHIFT 9 /* 512B */
+#define MINBLOCKSIZE ((off_t)(1 << MINBLOCKSHIFT))
+_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");
+#define MINDEVSIZE ((off_t)SPA_MINDEVSIZE)
+
+#define INDIR_LEVELS 6
+#define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t))
+
+#define VDEV_LABEL_SPACE \
+ ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
+_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
+
+typedef struct {
+ const char *name;
+ unsigned int id;
+ uint16_t size;
+ sa_bswap_type_t bs;
+} zfs_sattr_t;
+
+typedef struct zfs_objset {
+ objset_phys_t *phys;
+ off_t osloc;
+ off_t osblksz;
+ blkptr_t osbp; /* set in objset_write() */
+
+ off_t space; /* bytes allocated to this objset */
+
+ dnode_phys_t *dnodes; /* dnode array */
+ uint64_t dnodenextfree; /* dnode ID bump allocator */
+ uint64_t dnodecount; /* total number of dnodes */
+ off_t dnodeloc; /* preallocated vdev space */
+} zfs_objset_t;
+
+typedef struct zfs_zap_entry {
+ char *name; /* entry key, private copy */
+ uint64_t hash; /* key hash */
+ union {
+ uint8_t *valp;
+ uint16_t *val16p;
+ uint32_t *val32p;
+ uint64_t *val64p;
+ }; /* entry value, an integer array */
+ uint64_t val64; /* embedded value for a common case */
+ size_t intsz; /* array element size; 1, 2, 4 or 8 */
+ size_t intcnt; /* array size */
+ STAILQ_ENTRY(zfs_zap_entry) next;
+} zfs_zap_entry_t;
+
+typedef struct zfs_zap {
+ STAILQ_HEAD(, zfs_zap_entry) kvps;
+ uint64_t hashsalt; /* key hash input */
+ unsigned long kvpcnt; /* number of key-value pairs */
+ unsigned long chunks; /* count of chunks needed for fat ZAP */
+ bool micro; /* can this be a micro ZAP? */
+
+ dnode_phys_t *dnode; /* backpointer */
+ zfs_objset_t *os; /* backpointer */
+} zfs_zap_t;
+
+struct zfs_dsl_dir;
+
+typedef struct zfs_dsl_dataset {
+ zfs_objset_t *os; /* referenced objset, may be null */
+ dsl_dataset_phys_t *phys; /* on-disk representation */
+ uint64_t dsid; /* DSL dataset dnode */
+
+ struct zfs_dsl_dir *dir; /* containing parent */
+} zfs_dsl_dataset_t;
+
+typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t;
+
+typedef struct zfs_dsl_dir {
+ char *fullname; /* full dataset name */
+ char *name; /* basename(fullname) */
+ dsl_dir_phys_t *phys; /* on-disk representation */
+ nvlist_t *propsnv; /* properties saved in propszap */
+
+ zfs_dsl_dataset_t *headds; /* principal dataset, may be null */
+
+ uint64_t dirid; /* DSL directory dnode */
+ zfs_zap_t propszap; /* dataset properties */
+ zfs_zap_t childzap; /* child directories */
+
+ /* DSL directory tree linkage. */
+ struct zfs_dsl_dir *parent;
+ zfs_dsl_dir_list_t children;
+ STAILQ_ENTRY(zfs_dsl_dir) next;
+} zfs_dsl_dir_t;
+
+typedef struct zfs_fs {
+ zfs_objset_t *os;
+
+ /* Offset table for system attributes, indexed by a zpl_attr_t. */
+ uint16_t *saoffs;
+ size_t sacnt;
+ const zfs_sattr_t *satab;
+} zfs_fs_t;
+
+struct dataset_desc {
+ char *params;
+ STAILQ_ENTRY(dataset_desc) next;
+};
+
+typedef struct {
+ /* I/O buffer, just for convenience. */
+ char filebuf[MAXBLOCKSIZE];
+
+ /* Pool parameters. */
+ const char *poolname;
+ char *rootpath; /* implicit mount point prefix */
+ char *bootfs; /* bootable dataset, pool property */
+ int ashift; /* vdev block size */
+ STAILQ_HEAD(, dataset_desc) datasets; /* non-root dataset descrs */
+
+ /* Pool state. */
+ uint64_t guid; /* pool and vdev GUID */
+ zfs_zap_t poolprops;
+
+ /* MOS state. */
+ zfs_objset_t mos; /* meta object set */
+ uint64_t objarrid; /* space map object array */
+
+ /* DSL state. */
+ zfs_dsl_dir_t rootdsldir; /* root DSL directory */
+ zfs_dsl_dataset_t rootds;
+ zfs_dsl_dir_t origindsldir; /* $ORIGIN */
+ zfs_dsl_dataset_t originds;
+ zfs_dsl_dataset_t snapds;
+ zfs_zap_t cloneszap;
+ zfs_dsl_dir_t freedsldir; /* $FREE */
+ zfs_dsl_dir_t mosdsldir; /* $MOS */
+
+ /* vdev state. */
+ int fd; /* vdev disk fd */
+ off_t vdevsize; /* vdev size, including labels */
+ off_t asize; /* vdev size, excluding labels */
+ bitstr_t *spacemap; /* space allocation tracking */
+ int spacemapbits; /* one bit per ashift-sized block */
+ uint64_t msshift; /* log2(metaslab size) */
+ uint64_t mscount; /* number of metaslabs for this vdev */
+} zfs_opt_t;
+
+static void zap_init(zfs_zap_t *, zfs_objset_t *, dnode_phys_t *);
+static void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
+static void zap_add_string(zfs_zap_t *, const char *, const char *);
+static void zap_write(zfs_opt_t *, zfs_zap_t *);
+
+static dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
+static dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
+static dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
+ uint16_t, uint64_t *);
+static off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
+
+static void dsl_dir_init(zfs_opt_t *, const char *, zfs_dsl_dir_t *);
+static void dsl_dataset_init(zfs_opt_t *, zfs_dsl_dir_t *, zfs_dsl_dataset_t *);
+
+static void spacemap_init(zfs_opt_t *);
+
+struct dnode_cursor {
+ char inddir[INDIR_LEVELS][MAXBLOCKSIZE];
+ off_t indloc;
+ off_t indspace;
+ dnode_phys_t *dnode;
+ off_t dataoff;
+ off_t datablksz;
+};
+
+static struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
+ dnode_phys_t *, off_t, off_t);
+static blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *,
+ off_t);
+static void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);
+
+static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
+
+/*
+ * The order of the attributes doesn't matter; this is simply the ordering
+ * hard-coded by OpenZFS, based on a zdb dump of the SA_REGISTRY table.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_DXATTR,
+ ZPL_PROJID,
+} zpl_attr_t;
+
+/*
+ * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
+ */
+static const zfs_sattr_t zpl_attrs[] = {
+#define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b }
+ _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
+ _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
+ _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
+ _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
+ _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
+#undef _ZPL_ATTR
+};
+
+/*
+ * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
+ * It need not match in general, but FreeBSD's loader doesn't bother parsing the
+ * layout and just hard-codes attribute offsets.
+ */
+static const sa_attr_type_t zpl_attr_layout[] = {
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_GEN,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PARENT,
+ ZPL_FLAGS,
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_LINKS,
+ ZPL_DACL_COUNT,
+ ZPL_DACL_ACES,
+ ZPL_SYMLINK,
+};
+
+/*
+ * Keys for the ZPL attribute tables in the SA layout ZAP. The first two
+ * indices are reserved for legacy attribute encoding.
+ */
+#define SA_LAYOUT_INDEX_DEFAULT 2
+#define SA_LAYOUT_INDEX_SYMLINK 3
+
+void
+zfs_prep_opts(fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs));
+
+ const option_t zfs_options[] = {
+ { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
+ 0, 0, "Bootable dataset" },
+ { '\0', "poolname", &zfs->poolname, OPT_STRPTR,
+ 0, 0, "ZFS pool name" },
+ { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
+ 0, 0, "Prefix for all dataset mount points" },
+ { '\0', "ashift", &zfs->ashift, OPT_INT32,
+ MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
+ { .name = NULL }
+ };
+
+ /* Set some default values. */
+ zfs->ashift = 12;
+
+ STAILQ_INIT(&zfs->datasets);
+
+ fsopts->fs_specific = zfs;
+ fsopts->fs_options = copy_opts(zfs_options);
+}
+
+int
+zfs_parse_opts(const char *option, fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs;
+ struct dataset_desc *dsdesc;
+ char buf[BUFSIZ], *opt, *val;
+ int rv;
+
+ zfs = fsopts->fs_specific;
+
+ opt = val = estrdup(option);
+ opt = strsep(&val, "=");
+ if (strcmp(opt, "fs") == 0) {
+ if (val == NULL)
+ errx(1, "invalid filesystem parameters `%s'", option);
+
+ /*
+ * Dataset descriptions will be parsed later, in dsl_init().
+ * Just stash them away for now.
+ */
+ dsdesc = ecalloc(1, sizeof(*dsdesc));
+ dsdesc->params = estrdup(val);
+ free(opt);
+ STAILQ_INSERT_TAIL(&zfs->datasets, dsdesc, next);
+ return (1);
+ }
+ free(opt);
+
+ rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
+ return (rv == -1 ? 0 : 1);
+}
+
+static void
+zfs_check_opts(fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs;
+
+ zfs = fsopts->fs_specific;
+
+ if (fsopts->offset != 0)
+ errx(1, "unhandled offset option");
+ if (zfs->poolname == NULL)
+ errx(1, "a pool name must be specified");
+ if (zfs->rootpath == NULL)
+ easprintf(&zfs->rootpath, "/%s", zfs->poolname);
+ if (zfs->rootpath[0] != '/')
+ errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
+}
+
+void
+zfs_cleanup_opts(fsinfo_t *fsopts)
+{
+ struct dataset_desc *d, *tmp;
+ zfs_opt_t *zfs;
+
+ zfs = fsopts->fs_specific;
+ free(zfs->rootpath);
+ free(zfs->bootfs);
+ free(__DECONST(void *, zfs->poolname));
+ STAILQ_FOREACH_SAFE(d, &zfs->datasets, next, tmp) {
+ free(d->params);
+ free(d);
+ }
+ free(zfs);
+ free(fsopts->fs_options);
+}
+
+static int
+nvlist_find_string(nvlist_t *nvl, const char *key, char **retp)
+{
+ char *str;
+ int error, len;
+
+ error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &str, &len);
+ if (error == 0) {
+ *retp = ecalloc(1, len + 1);
+ memcpy(*retp, str, len);
+ }
+ return (error);
+}
+
+static int
+nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp)
+{
+ return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL));
+}
+
+static size_t
+nvlist_size(const nvlist_t *nvl)
+{
+ return (sizeof(nvl->nv_header) + nvl->nv_size);
+}
+
+static void
+nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
+{
+ assert(sz >= nvlist_size(nvl));
+
+ memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
+ memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
+}
+
+static void
+blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
+ uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
+{
+ dva_t *dva;
+
+ assert(powerof2(size));
+
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_CHECKSUM(bp, cksumt);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LEVEL(bp, level);
+ BP_SET_FILL(bp, fill);
+ BP_SET_TYPE(bp, dntype);
+
+ dva = BP_IDENTITY(bp);
+ DVA_SET_VDEV(dva, 0);
+ DVA_SET_OFFSET(dva, off);
+ DVA_SET_ASIZE(dva, size);
+ memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
+}
+
+static void
+vdev_init(zfs_opt_t *zfs, size_t size, const char *image)
+{
+ assert(zfs->ashift >= MINBLOCKSHIFT);
+
+ zfs->vdevsize = rounddown2(size, 1 << zfs->ashift);
+ if (zfs->vdevsize < MINDEVSIZE) {
+ errx(1, "Maximum image size %ju is too small",
+ (uintmax_t)zfs->vdevsize);
+ }
+ zfs->asize = zfs->vdevsize - VDEV_LABEL_SPACE;
+
+ zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
+ if (zfs->fd == -1)
+ err(1, "Can't open `%s' for writing", image);
+ if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
+ err(1, "Failed to extend image file `%s'", image);
+
+ spacemap_init(zfs);
+}
+
+static void
+vdev_fini(zfs_opt_t *zfs)
+{
+ assert(zfs->spacemap == NULL);
+
+ if (zfs->fd != -1) {
+ if (close(zfs->fd) != 0)
+ err(1, "close");
+ zfs->fd = -1;
+ }
+}
+
+/*
+ * Write a block of data to the vdev. The offset is always relative to the end
+ * of the second leading vdev label.
+ *
+ * Consumers should generally use the helpers below, which provide block
+ * pointers and update dnode accounting, rather than calling this function
+ * directly.
+ */
+static void
+vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
+{
+ ssize_t n;
+
+ assert(off >= 0 && off < zfs->asize);
+ assert(powerof2(len));
+ assert((off_t)len > 0 && off + (off_t)len > off &&
+ off + (off_t)len < zfs->asize);
+ if (zfs->spacemap != NULL) {
+ /*
+ * Verify that the blocks being written were in fact allocated.
+ *
+ * The space map isn't available once the on-disk space map is
+ * finalized, so this check doesn't quite catch everything.
+ */
+ assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
+ (off + len - 1) >> zfs->ashift, 1));
+ }
+
+ off += VDEV_LABEL_START_SIZE;
+ for (size_t sofar = 0; sofar < len; sofar += n) {
+ n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
+ off + sofar);
+ if (n < 0)
+ err(1, "pwrite");
+ assert(n > 0);
+ }
+}
+
+static void
+vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
+ uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
+ blkptr_t *bp)
+{
+ zio_cksum_t cksum;
+
+ assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);
+
+ fletcher_4_native(data, sz, NULL, &cksum);
+ blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
+ vdev_pwrite(zfs, data, sz, loc);
+}
+
+static void
+vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
+ uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
+{
+ vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
+ data, sz, loc, bp);
+
+ assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
+ dnode->dn_used += sz;
+}
+
+static void
+vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
+ off_t sz, off_t loc)
+{
+ vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
+ &dnode->dn_blkptr[0]);
+}
+
+static void
+vdev_label_set_checksum(void *buf, off_t off, off_t size)
+{
+ zio_cksum_t cksum;
+ zio_eck_t *eck;
+
+ assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));
+
+ eck = (zio_eck_t *)((char *)buf + size) - 1;
+ eck->zec_magic = ZEC_MAGIC;
+ ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
+ zio_checksum_SHA256(buf, size, NULL, &cksum);
+ eck->zec_cksum = cksum;
+}
+
+/*
+ * Set embedded checksums and write the label at the specified index.
+ */
+static void
+vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
+{
+ vdev_label_t *label;
+ ssize_t n;
+ off_t blksz, loff;
+
+ assert(ind >= 0 && ind < VDEV_LABELS);
+
+ /*
+ * Make a copy since we have to modify the label to set checksums.
+ */
+ label = ecalloc(1, sizeof(*label));
+ memcpy(label, labelp, sizeof(*label));
+
+ if (ind < 2)
+ loff = ind * sizeof(*label);
+ else
+ loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);
+
+ /*
+ * Set the verifier checksum for the boot block. We don't use it, but
+ * the FreeBSD loader reads it and will complain if the checksum isn't
+ * valid.
+ */
+ vdev_label_set_checksum(&label->vl_be,
+ loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));
+
+ /*
+ * Set the verifier checksum for the label.
+ */
+ vdev_label_set_checksum(&label->vl_vdev_phys,
+ loff + __offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof(label->vl_vdev_phys));
+
+ /*
+ * Set the verifier checksum for the uberblocks. There is one uberblock
+ * per sector; for example, with an ashift of 12 we end up with
+ * 128KB/4KB=32 copies of the uberblock in the ring.
+ */
+ blksz = 1 << zfs->ashift;
+ assert(sizeof(label->vl_uberblock) % blksz == 0);
+ for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
+ roff += blksz) {
+ vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
+ loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
+ blksz);
+ }
+
+ n = pwrite(zfs->fd, label, sizeof(*label), loff);
+ if (n < 0)
+ err(1, "writing vdev label");
+ assert(n == sizeof(*label));
+
+ free(label);
+}
+
+/*
+ * Find a chunk of contiguous free space of length *lenp, according to the
+ * following rules:
+ * 1. If the length is less than or equal to 128KB, the returned run's length
+ * will be the smallest power of 2 equal to or larger than the length.
+ * 2. If the length is larger than 128KB, the returned run's length will be
+ * the smallest multiple of 128KB that is larger than the length.
+ * 3. The returned run's length will be size-aligned up to 128KB.
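+ *    For example, with an ashift of 12, a 20KB request returns a 32KB run
+ *    (rule 1), while a 200KB request returns a 256KB run (rule 2).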
+ *
+ * XXX-MJ the third rule isn't actually required, so this can just be a dumb
+ * bump allocator. Maybe there's some benefit to keeping large blocks aligned,
+ * so let's keep it for now and hope we don't get too much fragmentation.
+ * Alternately we could try to allocate all blocks of a certain size from the
+ * same metaslab.
+ */
+static off_t
+vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
+{
+ off_t len;
+ int align, loc, minblksz, nbits;
+
+ minblksz = 1 << zfs->ashift;
+ len = roundup2(*lenp, minblksz);
+
+ assert(len != 0);
+ assert(len / minblksz <= INT_MAX);
+
+ if (len < MAXBLOCKSIZE) {
+ if ((len & (len - 1)) != 0)
+ len = (off_t)1 << flsll(len);
+ align = len / minblksz;
+ } else {
+ len = roundup2(len, MAXBLOCKSIZE);
+ align = MAXBLOCKSIZE / minblksz;
+ }
+
+ for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
+ bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
+ &loc);
+ if (loc == -1) {
+ errx(1, "failed to find %ju bytes of space",
+ (uintmax_t)len);
+ }
+ if ((loc & (align - 1)) == 0)
+ break;
+ }
+ assert(loc + nbits > loc);
+ bit_nset(zfs->spacemap, loc, loc + nbits - 1);
+ *lenp = len;
+
+ return ((off_t)loc << zfs->ashift);
+}
+
+static void
+spacemap_init(zfs_opt_t *zfs)
+{
+ uint64_t msshift, nbits, slabs;
+
+ nbits = zfs->asize >> zfs->ashift;
+ if (nbits > INT_MAX) {
+ /*
+ * With the smallest block size of 512B, the limit on the image
+ * size is 1TB. That should be enough for anyone.
+ */
+ errx(1, "image size is too large");
+ }
+ zfs->spacemapbits = (int)nbits;
+ zfs->spacemap = bit_alloc(zfs->spacemapbits);
+ if (zfs->spacemap == NULL)
+ err(1, "bitstring allocation failed");
+
+ /*
+ * Try to choose a metaslab size that gives us a "reasonable" number of
+ * metaslabs. OpenZFS seems to expect at least 2.
+ *
+ * This is simplistic since we expect the pool to be autoexpanded upon
+ * first use, so OpenZFS will have to reorganize things anyway.
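+ *
+ * For example, a 10GB vdev ends up with 64MB metaslabs (msshift of 26),
+ * roughly 160 of them.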
+ */
+ for (msshift = 24 /* 16MB */; msshift < 34 /* 16GB */; msshift++) {
+ slabs = zfs->asize / ((uint64_t)1 << msshift);
+ if (slabs >= 4 && slabs <= 200)
+ break;
+ }
+ if (msshift == 34) {
+ errx(1,
+ "failed to find a metaslab size, image size is too large");
+ }
+
+ zfs->msshift = msshift;
+ zfs->mscount = slabs;
+}
+
+static void
+spacemap_write(zfs_opt_t *zfs)
+{
+ dnode_phys_t *objarr;
+ zfs_objset_t *mos;
+ bitstr_t *spacemap;
+ uint64_t *objarrblk;
+ off_t smblksz, objarrblksz, objarrloc;
+
+ struct {
+ dnode_phys_t *dnode;
+ uint64_t dnid;
+ off_t loc;
+ } *sma;
+
+ mos = &zfs->mos;
+
+ objarrblksz = sizeof(uint64_t) * zfs->mscount;
+ assert(objarrblksz <= MAXBLOCKSIZE);
+ objarrloc = objset_space_alloc(zfs, mos, &objarrblksz);
+ objarrblk = ecalloc(1, objarrblksz);
+
+ objarr = objset_dnode_lookup(mos, zfs->objarrid);
+ objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;
+
+ /*
+ * Use the smallest block size for space maps. The space allocation
+ * algorithm should aim to minimize the number of holes.
+ */
+ smblksz = 1 << zfs->ashift;
+
+ /*
+ * First allocate dnodes and space for all of our space maps. No more
+ * space can be allocated from the vdev after this point.
+ */
+ sma = ecalloc(zfs->mscount, sizeof(*sma));
+ for (uint64_t i = 0; i < zfs->mscount; i++) {
+ sma[i].dnode = objset_dnode_bonus_alloc(mos, DMU_OT_SPACE_MAP,
+ DMU_OT_SPACE_MAP_HEADER, SPACE_MAP_SIZE_V0, &sma[i].dnid);
+ sma[i].loc = objset_space_alloc(zfs, mos, &smblksz);
+ }
+ spacemap = zfs->spacemap;
+ zfs->spacemap = NULL;
+
+ /*
+ * Now that the set of allocated space is finalized, populate each space
+ * map and write it to the vdev.
+ */
+ for (uint64_t i = 0; i < zfs->mscount; i++) {
+ space_map_phys_t *sm;
+ uint64_t alloc, length, *smblk;
+ int shift, startb, endb, srunb, erunb;
+
+ /*
+ * We only allocate a single block for this space map, but OpenZFS
+ * assumes that a space map object with sufficient bonus space supports
+ * histograms.
+ */
+ sma[i].dnode->dn_nblkptr = 3;
+ sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;
+
+ smblk = ecalloc(1, smblksz);
+
+ alloc = length = 0;
+ shift = zfs->msshift - zfs->ashift;
+ for (srunb = startb = i * (1 << shift),
+ endb = (i + 1) * (1 << shift);
+ srunb < endb; srunb = erunb) {
+ uint64_t runlen, runoff;
+
+ /* Find a run of allocated space. */
+ bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
+ if (srunb == -1 || srunb >= endb)
+ break;
+
+ bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
+ if (erunb == -1 || erunb > endb)
+ erunb = endb;
+
+ /*
+ * The space represented by [srunb, erunb) has been
+ * allocated. Add a record to the space map to indicate
+ * this. Run offsets are relative to the beginning of
+ * the metaslab.
+ */
+ runlen = erunb - srunb;
+ runoff = srunb - startb;
+
+ assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
+ smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
+ SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
+ smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
+ SM2_OFFSET_ENCODE(runoff);
+
+ alloc += runlen << zfs->ashift;
+ length += 2;
+ }
+
+ sm = DN_BONUS(sma[i].dnode);
+ sm->smp_length = length * sizeof(uint64_t);
+ sm->smp_alloc = alloc;
+
+ vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
+ sma[i].loc);
+ free(smblk);
+
+ /* Record this space map in the space map object array. */
+ objarrblk[i] = sma[i].dnid;
+ }
+
+ /*
+ * All of the space maps are written, now write the object array.
+ */
+ vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
+ free(objarrblk);
+
+ assert(zfs->spacemap == NULL);
+ free(spacemap);
+ free(sma);
+}
+
+static void
+objset_init(zfs_opt_t *zfs, zfs_objset_t *os, uint64_t type,
+ uint64_t dnodecount)
+{
+ dnode_phys_t *mdnode;
+ off_t blksz;
+
+ /*
+ * Allocate space on the vdev for the objset and dnode array. For other
+ * objects we do that only when going to actually write them to the
+ * vdev, but in this case it simplifies space map accounting to do it
+ * now.
+ */
+ os->osblksz = sizeof(objset_phys_t);
+ os->osloc = objset_space_alloc(zfs, os, &os->osblksz);
+
+ /*
+ * Object ID zero is always reserved for the meta dnode, which is
+ * embedded in the objset itself.
+ */
+ dnodecount++;
+
+ os->dnodenextfree = 1;
+ os->dnodecount = dnodecount;
+ blksz = roundup2(dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE);
+ os->dnodeloc = objset_space_alloc(zfs, os, &blksz);
+ assert(blksz % DNODE_BLOCK_SIZE == 0);
+ os->dnodes = ecalloc(1, blksz);
+
+ os->phys = ecalloc(1, os->osblksz);
+ os->phys->os_type = type;
+
+ mdnode = &os->phys->os_meta_dnode;
+ mdnode->dn_indblkshift = MAXBLOCKSHIFT;
+ mdnode->dn_type = DMU_OT_DNODE;
+ mdnode->dn_bonustype = DMU_OT_NONE;
+ mdnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ mdnode->dn_datablkszsec = DNODE_BLOCK_SIZE >> MINBLOCKSHIFT;
+ mdnode->dn_nlevels = 1;
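+ /*
+ * Each 16KB dnode block holds 32 dnodes, and each 128KB indirect
+ * block holds 1024 block pointers, so a single indirect level
+ * covers up to 32768 dnodes.
+ */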
+ for (uint64_t count = dnodecount / DNODES_PER_BLOCK; count > 1;
+ count /= BLKPTR_PER_INDIR)
+ mdnode->dn_nlevels++;
+ mdnode->dn_nblkptr = 1;
+ mdnode->dn_maxblkid = howmany(dnodecount, DNODES_PER_BLOCK) - 1;
+ mdnode->dn_flags = DNODE_FLAG_USED_BYTES;
+}
+
+/*
+ * Write the dnode array and physical object set to disk.
+ */
+static void
+_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c)
+{
+ assert(os->dnodenextfree == os->dnodecount);
+
+ /*
+ * Write out the dnode array, i.e., the meta-dnode. For some reason its
+ * data blocks must be 16KB in size no matter how large the array is.
+ */
+ for (uint64_t i = 0; i < os->dnodecount; i += DNODES_PER_BLOCK) {
+ dnode_phys_t *blk;
+ uint64_t fill;
+ off_t loc;
+
+ blk = os->dnodes + i;
+ loc = os->dnodeloc + i * sizeof(dnode_phys_t);
+ fill = os->dnodecount - i < DNODES_PER_BLOCK ?
+ os->dnodecount - i : 0;
+
+ vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode,
+ 0, fill, blk, DNODE_BLOCK_SIZE, loc,
+ dnode_cursor_next(zfs, c, i * sizeof(dnode_phys_t)));
+ }
+ dnode_cursor_finish(zfs, c);
+ free(os->dnodes);
+ os->dnodes = NULL;
+
+ /*
+ * Write the object set itself. The saved block pointer will be copied
+ * into the referencing DSL dataset or the uberblocks.
+ */
+ vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1,
+ os->phys, os->osblksz, os->osloc, &os->osbp);
+}
+
+static void
+objset_write(zfs_opt_t *zfs, zfs_objset_t *os)
+{
+ struct dnode_cursor *c;
+
+ c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode,
+ os->dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE);
+ _objset_write(zfs, os, c);
+}
+
+static void
+objset_mos_write(zfs_opt_t *zfs)
+{
+ struct dnode_cursor *c;
+ zfs_objset_t *mos;
+
+ mos = &zfs->mos;
+
+ /*
+ * There is a chicken-and-egg problem here: we cannot write space maps
+ * before we're finished allocating space from the vdev, and we can't
+ * write the MOS without having allocated space for indirect dnode
+ * blocks. Thus, rather than lazily allocating indirect blocks for the
+ * meta-dnode (which would be simpler), they are allocated up-front and
+ * before writing space maps.
+ */
+ c = dnode_cursor_init(zfs, mos, &mos->phys->os_meta_dnode,
+ mos->dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE);
+ spacemap_write(zfs);
+
+ /*
+ * We've finished allocating space; account for it in $MOS.
+ */
+ zfs->mosdsldir.phys->dd_used_bytes = mos->space;
+ zfs->mosdsldir.phys->dd_compressed_bytes = mos->space;
+ zfs->mosdsldir.phys->dd_uncompressed_bytes = mos->space;
+
+ _objset_write(zfs, mos, c);
+}
+
+static dnode_phys_t *
+objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype,
+ uint16_t bonuslen, uint64_t *idp)
+{
+ dnode_phys_t *dnode;
+
+ assert(os->dnodenextfree < os->dnodecount);
+ assert(bonuslen <= DN_OLD_MAX_BONUSLEN);
+
+ *idp = os->dnodenextfree;
+ dnode = &os->dnodes[os->dnodenextfree++];
+ dnode->dn_type = type;
+ dnode->dn_indblkshift = MAXBLOCKSHIFT;
+ dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT;
+ dnode->dn_nlevels = 1;
+ dnode->dn_nblkptr = 1;
+ dnode->dn_bonustype = bonustype;
+ dnode->dn_bonuslen = bonuslen;
+ dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ dnode->dn_compress = ZIO_COMPRESS_OFF;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+ return (dnode);
+}
+
+static dnode_phys_t *
+objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp)
+{
+ return (objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp));
+}
+
+static dnode_phys_t *
+objset_dnode_lookup(zfs_objset_t *os, uint64_t id)
+{
+ assert(id > 0 && id < os->dnodecount);
+
+ return (&os->dnodes[id]);
+}
+
+static off_t
+objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp)
+{
+ off_t loc;
+
+ loc = vdev_space_alloc(zfs, lenp);
+ os->space += *lenp;
+ return (loc);
+}
+
+/*
+ * Return an allocated string containing the head dataset's mountpoint,
+ * including the root path prefix.
+ *
+ * If the dataset has a mountpoint property, it is returned. Otherwise we have
+ * to follow ZFS' inheritance rules.
+ */
+static char *
+dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
+{
+ zfs_dsl_dir_t *pdir;
+ char *mountpoint, *origmountpoint;
+
+ if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) {
+ if (strcmp(mountpoint, "none") == 0)
+ return (NULL);
+ } else {
+ /*
+ * If we don't have a mountpoint, it's inherited from one of our
+ * ancestors. Walk up the hierarchy until we find it, building
+ * up our mountpoint along the way. The mountpoint property is
+ * always set for the root dataset.
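+ *
+ * For example, if the pool is named test, and test/a sets
+ * mountpoint=/a while test/a/b does not set one, then b's
+ * mountpoint is /a/b.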
+ */
+ for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) {
+ origmountpoint = mountpoint;
+
+ if (nvlist_find_string(pdir->propsnv, "mountpoint",
+ &mountpoint) == 0) {
+ easprintf(&mountpoint, "%s%s%s", mountpoint,
+ mountpoint[strlen(mountpoint) - 1] == '/' ?
+ "" : "/", origmountpoint);
+ free(origmountpoint);
+ break;
+ }
+
+ easprintf(&mountpoint, "%s/%s", pdir->name,
+ origmountpoint);
+ free(origmountpoint);
+ pdir = pdir->parent;
+ }
+ }
+ assert(mountpoint[0] == '/');
+ assert(strstr(mountpoint, zfs->rootpath) == mountpoint);
+
+ return (mountpoint);
+}
+
+/*
+ * Handle dataset properties that we know about; stash them into an nvlist to be
+ * written later to the properties ZAP object.
+ *
+ * If the set of properties we handle grows too much, we should probably explore
+ * using libzfs to manage them.
+ */
+static void
+dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key,
+ const char *val)
+{
+ nvlist_t *nvl;
+
+ nvl = dir->propsnv;
+ if (val == NULL || val[0] == '\0')
+ errx(1, "missing value for property `%s'", key);
+ if (nvpair_find(nvl, key) != NULL)
+ errx(1, "property `%s' already set", key);
+
+ if (strcmp(key, "mountpoint") == 0) {
+ if (strcmp(val, "none") != 0) {
+ if (val[0] != '/')
+ errx(1, "mountpoint `%s' is not absolute", val);
+ if (strcmp(val, zfs->rootpath) != 0 &&
+ strcmp(zfs->rootpath, "/") != 0 &&
+ (strstr(val, zfs->rootpath) != val ||
+ val[strlen(zfs->rootpath)] != '/')) {
+ errx(1, "mountpoint `%s' is not prefixed by "
+ "the root path `%s'", val, zfs->rootpath);
+ }
+ }
+ nvlist_add_string(nvl, key, val);
+ } else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 ||
+ strcmp(key, "setuid") == 0) {
+ if (strcmp(val, "on") == 0)
+ nvlist_add_uint64(nvl, key, 1);
+ else if (strcmp(val, "off") == 0)
+ nvlist_add_uint64(nvl, key, 0);
+ else
+ errx(1, "invalid value `%s' for %s", val, key);
+ } else if (strcmp(key, "canmount") == 0) {
+ if (strcmp(val, "noauto") == 0)
+ nvlist_add_uint64(nvl, key, 2);
+ else if (strcmp(val, "on") == 0)
+ nvlist_add_uint64(nvl, key, 1);
+ else if (strcmp(val, "off") == 0)
+ nvlist_add_uint64(nvl, key, 0);
+ else
+ errx(1, "invalid value `%s' for %s", val, key);
+ } else {
+ errx(1, "unknown property `%s'", key);
+ }
+}
+
+static void
+dsl_init_metadir(zfs_opt_t *zfs, const char *name, zfs_dsl_dir_t *dir)
+{
+ char *path;
+
+ easprintf(&path, "%s/%s", zfs->poolname, name);
+ dsl_dir_init(zfs, path, dir);
+ free(path);
+}
+
+static void
+dsl_init_origindir(zfs_opt_t *zfs)
+{
+ dnode_phys_t *clones;
+ uint64_t clonesid;
+
+ dsl_init_metadir(zfs, "$ORIGIN", &zfs->origindsldir);
+ dsl_dataset_init(zfs, &zfs->origindsldir, &zfs->originds);
+ dsl_dataset_init(zfs, &zfs->origindsldir, &zfs->snapds);
+
+ clones = objset_dnode_alloc(&zfs->mos, DMU_OT_DSL_CLONES, &clonesid);
+ zap_init(&zfs->cloneszap, &zfs->mos, clones);
+ zfs->origindsldir.phys->dd_clones = clonesid;
+}
+
+static void
+dsl_init(zfs_opt_t *zfs)
+{
+ zfs_dsl_dir_t *dir;
+ struct dataset_desc *d;
+
+ dsl_dir_init(zfs, NULL, &zfs->rootdsldir);
+
+ nvlist_add_uint64(zfs->rootdsldir.propsnv, "compression",
+ ZIO_COMPRESS_OFF);
+
+ dsl_dataset_init(zfs, &zfs->rootdsldir, &zfs->rootds);
+ zfs->rootdsldir.headds = &zfs->rootds;
+
+ dsl_init_metadir(zfs, "$MOS", &zfs->mosdsldir);
+ dsl_init_metadir(zfs, "$FREE", &zfs->freedsldir);
+ dsl_init_origindir(zfs);
+
+ /*
+ * Go through the list of user-specified datasets and create DSL objects
+ * for them.
+ */
+ STAILQ_FOREACH(d, &zfs->datasets, next) {
+ char *dsname, *params, *param, *nextparam;
+
+ params = d->params;
+ dsname = strsep(&params, ":");
+
+ if (strcmp(dsname, zfs->poolname) == 0) {
+ /*
+ * This is the root dataset; it's already created, so
+ * we're just setting options.
+ */
+ dir = &zfs->rootdsldir;
+ } else {
+ dir = ecalloc(1, sizeof(*dir));
+ dsl_dir_init(zfs, dsname, dir);
+ dir->headds = ecalloc(1, sizeof(*dir->headds));
+ dsl_dataset_init(zfs, dir, dir->headds);
+ }
+
+ for (nextparam = param = params; nextparam != NULL;) {
+ char *key, *val;
+
+ param = strsep(&nextparam, ":");
+
+ key = val = param;
+ key = strsep(&val, "=");
+ dsl_dir_set_prop(zfs, dir, key, val);
+ }
+ }
+
+ /*
+ * Set the root dataset's mount point if the user didn't override the
+ * default.
+ */
+ if (nvpair_find(zfs->rootdsldir.propsnv, "mountpoint") == NULL) {
+ nvlist_add_string(zfs->rootdsldir.propsnv, "mountpoint",
+ zfs->rootpath);
+ }
+}
+
+static void
+dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
+ void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
+{
+ zfs_dsl_dir_t *cdsldir;
+
+ STAILQ_FOREACH(cdsldir, &dsldir->children, next) {
+ dsl_dir_foreach_post(zfs, cdsldir, cb, arg);
+ }
+ cb(zfs, dsldir, arg);
+}
+
+/*
+ * Used when the caller doesn't care about the order one way or another.
+ */
+static void
+dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
+ void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
+{
+ dsl_dir_foreach_post(zfs, dsldir, cb, arg);
+}
+
+/*
+ * Create a DSL directory, which is effectively an entry in the ZFS namespace.
+ * We always create a root DSL directory, whose name is the pool's name, and
+ * several metadata directories.
+ *
+ * Each directory has two ZAP objects, one pointing to child directories, and
+ * one for properties (which are inherited by children unless overridden).
+ * Directories typically reference a DSL dataset, the "head dataset", which
+ * points to an object set.
+ */
+static void
+dsl_dir_init(zfs_opt_t *zfs, const char *name, zfs_dsl_dir_t *dsldir)
+{
+ zfs_dsl_dir_list_t l, *lp;
+ zfs_dsl_dir_t *parent;
+ zfs_objset_t *mos;
+ dnode_phys_t *dnode;
+ char *dirname, *nextdir, *origname;
+ uint64_t childid, propsid;
+
+ mos = &zfs->mos;
+
+ dnode = objset_dnode_bonus_alloc(mos, DMU_OT_DSL_DIR, DMU_OT_DSL_DIR,
+ sizeof(dsl_dir_phys_t), &dsldir->dirid);
+ dsldir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode);
+
+ dnode = objset_dnode_alloc(mos, DMU_OT_DSL_PROPS, &propsid);
+ zap_init(&dsldir->propszap, mos, dnode);
+
+ dnode = objset_dnode_alloc(mos, DMU_OT_DSL_DIR_CHILD_MAP, &childid);
+ zap_init(&dsldir->childzap, mos, dnode);
+
+ dsldir->propsnv = nvlist_create(NV_UNIQUE_NAME);
+ STAILQ_INIT(&dsldir->children);
+
+ dsldir->phys->dd_child_dir_zapobj = childid;
+ dsldir->phys->dd_props_zapobj = propsid;
+
+ if (name == NULL) {
+ /*
+ * This is the root DSL directory.
+ */
+ assert(dsldir == &zfs->rootdsldir);
+ dsldir->name = estrdup(zfs->poolname);
+ dsldir->fullname = estrdup(zfs->poolname);
+ dsldir->parent = NULL;
+ dsldir->phys->dd_parent_obj = 0;
+ return;
+ }
+
+ /*
+ * Insert the new directory into the hierarchy. Currently this must be
+ * done in order, e.g., when creating pool/a/b, pool/a must already
+ * exist.
+ */
+ STAILQ_INIT(&l);
+ STAILQ_INSERT_HEAD(&l, &zfs->rootdsldir, next);
+ origname = dirname = nextdir = estrdup(name);
+ for (lp = &l;; lp = &parent->children) {
+ dirname = strsep(&nextdir, "/");
+ if (nextdir == NULL)
+ break;
+
+ STAILQ_FOREACH(parent, lp, next) {
+ if (strcmp(parent->name, dirname) == 0)
+ break;
+ }
+ if (parent == NULL) {
+ errx(1, "no parent at `%s' for filesystem `%s'",
+ dirname, name);
+ }
+ }
+
+ dsldir->fullname = estrdup(name);
+ dsldir->name = estrdup(dirname);
+ free(origname);
+ STAILQ_INSERT_TAIL(lp, dsldir, next);
+ zap_add_uint64(&parent->childzap, dsldir->name, dsldir->dirid);
+
+ dsldir->parent = parent;
+ dsldir->phys->dd_parent_obj = parent->dirid;
+}
+
+/*
+ * Convert dataset properties into entries in the DSL directory's properties
+ * ZAP.
+ */
+static void
+dsl_dir_finalize_props(zfs_dsl_dir_t *dir)
+{
+ for (nvp_header_t *nvh = NULL;
+ (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) {
+ nv_string_t *nvname;
+ nv_pair_data_t *nvdata;
+ const char *name;
+
+ nvname = (nv_string_t *)(nvh + 1);
+ nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] +
+ NV_ALIGN4(nvname->nv_size));
+
+ name = nvstring_get(nvname);
+ switch (nvdata->nv_type) {
+ case DATA_TYPE_UINT64: {
+ uint64_t val;
+
+ memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t));
+ zap_add_uint64(&dir->propszap, name, val);
+ break;
+ }
+ case DATA_TYPE_STRING: {
+ nv_string_t *nvstr;
+
+ nvstr = (nv_string_t *)&nvdata->nv_data[0];
+ zap_add_string(&dir->propszap, name,
+ nvstring_get(nvstr));
+ break;
+ }
+ default:
+ assert(0);
+ }
+ }
+}
+
+static void
+dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused)
+{
+ zfs_dsl_dir_t *cdir;
+ uint64_t bytes;
+
+ dsl_dir_finalize_props(dir);
+ zap_write(zfs, &dir->propszap);
+ zap_write(zfs, &dir->childzap);
+
+ if (dir->headds != NULL && dir->headds->os != NULL) {
+ char key[32];
+ zfs_zap_t snapnameszap;
+ dnode_phys_t *snapnames;
+ zfs_dsl_dataset_t *headds;
+ zfs_objset_t *os;
+ uint64_t snapnamesid;
+
+ headds = dir->headds;
+ os = headds->os;
+
+ snapnames = objset_dnode_alloc(&zfs->mos,
+ DMU_OT_DSL_DS_SNAP_MAP, &snapnamesid);
+ zap_init(&snapnameszap, &zfs->mos, snapnames);
+ zap_write(zfs, &snapnameszap);
+
+ dir->phys->dd_head_dataset_obj = headds->dsid;
+ dir->phys->dd_clone_parent_obj = zfs->snapds.dsid;
+ headds->phys->ds_prev_snap_obj = zfs->snapds.dsid;
+ headds->phys->ds_snapnames_zapobj = snapnamesid;
+ memcpy(&headds->phys->ds_bp, &os->osbp, sizeof(blkptr_t));
+
+ zfs->snapds.phys->ds_num_children++;
+ snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid);
+ zap_add_uint64(&zfs->cloneszap, key, headds->dsid);
+
+ bytes = os->space;
+ headds->phys->ds_used_bytes = bytes;
+ /* XXX-MJ not sure what the difference is here... */
+ headds->phys->ds_uncompressed_bytes = bytes;
+ headds->phys->ds_compressed_bytes = bytes;
+
+ STAILQ_FOREACH(cdir, &dir->children, next) {
+ bytes += cdir->phys->dd_used_bytes;
+ }
+ dir->phys->dd_used_bytes = bytes;
+ dir->phys->dd_compressed_bytes = bytes;
+ dir->phys->dd_uncompressed_bytes = bytes;
+ }
+}
+
+static void
+dsl_write(zfs_opt_t *zfs)
+{
+ zfs_zap_t snapnameszap;
+ zfs_objset_t *mos;
+ dnode_phys_t *snapnames;
+ uint64_t snapmapid;
+
+ mos = &zfs->mos;
+
+ /*
+ * Perform accounting, starting from the leaves of the DSL directory
+ * tree. Accounting for $MOS is done later, once we've finished
+ * allocating space.
+ */
+ dsl_dir_foreach_post(zfs, &zfs->rootdsldir, dsl_dir_finalize, NULL);
+
+ snapnames = objset_dnode_alloc(mos, DMU_OT_DSL_DS_SNAP_MAP, &snapmapid);
+
+ zfs->origindsldir.phys->dd_head_dataset_obj = zfs->originds.dsid;
+ zfs->originds.phys->ds_prev_snap_obj = zfs->snapds.dsid;
+ zfs->originds.phys->ds_snapnames_zapobj = snapmapid;
+ zfs->snapds.phys->ds_next_snap_obj = zfs->originds.dsid;
+ assert(zfs->snapds.phys->ds_num_children > 0);
+ zfs->snapds.phys->ds_num_children++;
+
+ zap_init(&snapnameszap, mos, snapnames);
+ zap_add_uint64(&snapnameszap, "$ORIGIN", zfs->snapds.dsid);
+ zap_write(zfs, &snapnameszap);
+
+ zap_write(zfs, &zfs->cloneszap);
+}
+
+static void
+dsl_dataset_init(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, zfs_dsl_dataset_t *ds)
+{
+ zfs_zap_t deadlistzap;
+ dnode_phys_t *dnode;
+ uint64_t deadlistid;
+
+ dnode = objset_dnode_bonus_alloc(&zfs->mos, DMU_OT_DSL_DATASET,
+ DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid);
+ ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode);
+
+ dnode = objset_dnode_bonus_alloc(&zfs->mos, DMU_OT_DEADLIST,
+ DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid);
+ zap_init(&deadlistzap, &zfs->mos, dnode);
+ zap_write(zfs, &deadlistzap);
+
+ ds->phys->ds_dir_obj = dir->dirid;
+ ds->phys->ds_deadlist_obj = deadlistid;
+ ds->phys->ds_creation_txg = TXG_INITIAL - 1;
+ if (ds != &zfs->snapds)
+ ds->phys->ds_prev_snap_txg = TXG_INITIAL - 1;
+
+ ds->dir = dir;
+}
+
+static uint16_t
+zap_entry_chunks(zfs_zap_entry_t *ent)
+{
+ return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
+ howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
+}
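+
+/*
+ * Editorial illustration: with ZAP_LEAF_ARRAY_BYTES == 21 (its value in
+ * OpenZFS), a directory entry such as "foo" (four name bytes including the
+ * NUL) with a single uint64_t value needs one entry chunk, one name chunk
+ * and one value chunk, so zap_entry_chunks() returns 3.
+ */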
+
+static uint64_t
+zap_hash(uint64_t salt, const char *name)
+{
+ static uint64_t crc64_table[256];
+ const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
+ const uint8_t *cp;
+ uint64_t crc;
+ uint8_t c;
+
+ assert(salt != 0);
+ if (crc64_table[128] == 0) {
+ for (int i = 0; i < 256; i++) {
+ uint64_t *t;
+
+ t = crc64_table + i;
+ *t = i;
+ for (int j = 8; j > 0; j--)
+ *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
+ }
+ }
+ assert(crc64_table[128] == crc64_poly);
+
+ for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
+ crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];
+
+ /*
+ * Only use 28 bits, since we need 4 bits in the cookie for the
+ * collision differentiator. We MUST use the high bits, since
+ * those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+ return (crc);
+}
+
+static void
+zap_init(zfs_zap_t *zap, zfs_objset_t *os, dnode_phys_t *dnode)
+{
+ STAILQ_INIT(&zap->kvps);
+ zap->hashsalt = ((uint64_t)random() << 32) | random();
+ zap->micro = true;
+ zap->kvpcnt = 0;
+ zap->chunks = 0;
+ zap->dnode = dnode;
+ zap->os = os;
+}
+
+static void
+zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
+ const uint8_t *val)
+{
+ zfs_zap_entry_t *ent;
+
+ assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
+ assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
+ assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);
+
+ ent = ecalloc(1, sizeof(*ent));
+ ent->name = estrdup(name);
+ ent->hash = zap_hash(zap->hashsalt, ent->name);
+ ent->intsz = intsz;
+ ent->intcnt = intcnt;
+ if (intsz == sizeof(uint64_t) && intcnt == 1) {
+ /*
+ * Micro-optimization to elide a memory allocation in the most common
+ * case, where this is a directory entry.
+ */
+ ent->val64p = &ent->val64;
+ } else {
+ ent->valp = ecalloc(intcnt, intsz);
+ }
+ memcpy(ent->valp, val, intcnt * intsz);
+ zap->kvpcnt++;
+ zap->chunks += zap_entry_chunks(ent);
+ STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
+
+ if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
+ strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
+ zap->micro = false;
+}
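+
+/*
+ * Editorial note on the micro/fat decision above: a plain directory entry
+ * ("foo" -> dnode ID) is a single uint64_t with a short name, so directory
+ * ZAPs usually stay micro; a single string-valued property, an overlong
+ * name, or more than MZAP_ENT_MAX entries demotes the whole ZAP to the
+ * fat format.
+ */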
+
+static void
+zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
+{
+ zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
+}
+
+static void
+zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
+{
+ zap_add(zap, name, 1, strlen(val) + 1, (const uint8_t *)val);
+}
+
+static bool
+zap_entry_exists(zfs_zap_t *zap, const char *name)
+{
+ zfs_zap_entry_t *ent;
+
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ if (strcmp(ent->name, name) == 0)
+ return (true);
+ }
+ return (false);
+}
+
+static void
+zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+ dnode_phys_t *dnode;
+ zfs_zap_entry_t *ent;
+ mzap_phys_t *mzap;
+ mzap_ent_phys_t *ment;
+ off_t bytes, loc;
+
+ memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+ mzap = (mzap_phys_t *)&zfs->filebuf[0];
+ mzap->mz_block_type = ZBT_MICRO;
+ mzap->mz_salt = zap->hashsalt;
+ mzap->mz_normflags = 0;
+
+ bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
+ assert(bytes <= (off_t)MZAP_MAX_BLKSZ);
+
+ ment = &mzap->mz_chunk[0];
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
+ ment->mze_cd = 0; /* XXX-MJ */
+ strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name));
+ ment++;
+ }
+
+ loc = objset_space_alloc(zfs, zap->os, &bytes);
+
+ dnode = zap->dnode;
+ dnode->dn_maxblkid = 0;
+ dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+ vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
+}
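+
+/*
+ * Editorial note: a micro ZAP is a single block whose header (mzap_phys_t)
+ * embeds the first entry chunk, hence the "kvpcnt - 1" in the size
+ * calculation above; a three-entry directory occupies sizeof(*mzap) plus
+ * two additional mzap_ent_phys_t slots before objset_space_alloc() rounds
+ * the size up.
+ */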
+
+/*
+ * Write some data to the fat ZAP leaf chunk starting at index "li".
+ *
+ * Note that individual integers in the value may be split among consecutive
+ * leaf chunks.
+ */
+static void
+zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
+ const uint8_t *val)
+{
+ struct zap_leaf_array *la;
+
+ assert(sz > 0 && sz <= ZAP_MAXVALUELEN);
+
+ for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
+ n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);
+
+ la = &ZAP_LEAF_CHUNK(l, li).l_array;
+ assert(la->la_type == ZAP_CHUNK_FREE);
+ la->la_type = ZAP_CHUNK_ARRAY;
+ memcpy(la->la_array, val, n);
+ la->la_next = li + 1;
+ }
+ la->la_next = 0xffff;
+}
+
+/*
+ * Find the shortest hash prefix length which lets us distribute keys without
+ * overflowing a leaf block. This is not (space) optimal, but is simple, and
+ * directories large enough to overflow a single 128KB leaf block are uncommon.
+ */
+static unsigned int
+zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
+{
+ zfs_zap_entry_t *ent;
+ unsigned int prefixlen;
+
+ if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
+ /*
+ * All chunks will fit in a single leaf block.
+ */
+ return (0);
+ }
+
+ for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
+ uint32_t *leafchunks;
+
+ leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ uint64_t li;
+ uint16_t chunks;
+
+ li = ZAP_HASH_IDX(ent->hash, prefixlen);
+
+ chunks = zap_entry_chunks(ent);
+ if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
+ /*
+ * Not enough space, grow the prefix and retry.
+ */
+ break;
+ }
+ leafchunks[li] += chunks;
+ }
+ free(leafchunks);
+
+ if (ent == NULL) {
+ /*
+ * Everything fits, we're done.
+ */
+ break;
+ }
+ }
+
+ /*
+ * If this fails, then we need to expand the pointer table. For now
+ * this situation is unhandled since it is hard to trigger.
+ */
+ assert(prefixlen < (unsigned int)l->l_bs);
+
+ return (prefixlen);
+}
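+
+/*
+ * Editorial example: with prefixlen == 2, entries are spread over four leaf
+ * blocks, and ZAP_HASH_IDX(hash, 2) selects the two most significant hash
+ * bits, so an entry whose hash begins with binary 10 lands in leaf 2.
+ */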
+
+/*
+ * Initialize a fat ZAP leaf block.
+ */
+static void
+zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
+{
+ zap_leaf_phys_t *leaf;
+
+ leaf = l->l_phys;
+
+ leaf->l_hdr.lh_block_type = ZBT_LEAF;
+ leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+ leaf->l_hdr.lh_prefix = prefix;
+ leaf->l_hdr.lh_prefix_len = prefixlen;
+
+ /* Initialize the leaf hash table. */
+ assert(leaf->l_hdr.lh_nfree < 0xffff);
+ memset(leaf->l_hash, 0xff,
+ ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));
+
+ /* Initialize the leaf chunks. */
+ for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_free *lf;
+
+ lf = &ZAP_LEAF_CHUNK(l, i).l_free;
+ lf->lf_type = ZAP_CHUNK_FREE;
+ if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
+ lf->lf_next = 0xffff;
+ else
+ lf->lf_next = i + 1;
+ }
+}
+
+static void
+zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+ struct dnode_cursor *c;
+ zap_leaf_t l;
+ zap_phys_t *zaphdr;
+ struct zap_table_phys *zt;
+ zfs_zap_entry_t *ent;
+ dnode_phys_t *dnode;
+ uint8_t *leafblks;
+ uint64_t lblkcnt, *ptrhasht;
+ off_t loc, blksz;
+ size_t blkshift;
+ unsigned int prefixlen;
+ int ptrcnt;
+
+ /*
+ * For simplicity, always use the largest block size. This should be ok
+ * since most directories will be micro ZAPs, but it's space inefficient
+ * for small ZAPs and might need to be revisited.
+ */
+ blkshift = MAXBLOCKSHIFT;
+ blksz = (off_t)1 << blkshift;
+
+ /*
+ * Embedded pointer tables give up to 8192 entries. This ought to be
+ * enough for anything except massive directories.
+ */
+ ptrcnt = (blksz / 2) / sizeof(uint64_t);
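+ /*
+ * Editorial note: with the 128KB block chosen above, the second half of
+ * the header block holds 65536 / 8 == 8192 pointer-table entries, so
+ * zt_shift below is 13.
+ */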
+
+ memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+ zaphdr = (zap_phys_t *)&zfs->filebuf[0];
+ zaphdr->zap_block_type = ZBT_HEADER;
+ zaphdr->zap_magic = ZAP_MAGIC;
+ zaphdr->zap_num_entries = zap->kvpcnt;
+ zaphdr->zap_salt = zap->hashsalt;
+
+ l.l_bs = blkshift;
+ l.l_phys = NULL;
+
+ zt = &zaphdr->zap_ptrtbl;
+ zt->zt_blk = 0;
+ zt->zt_numblks = 0;
+ zt->zt_shift = flsl(ptrcnt) - 1;
+ zt->zt_nextblk = 0;
+ zt->zt_blks_copied = 0;
+
+ /*
+ * How many leaf blocks do we need? Initialize them and update the
+ * header.
+ */
+ prefixlen = zap_fat_write_prefixlen(zap, &l);
+ lblkcnt = 1 << prefixlen;
+ leafblks = ecalloc(lblkcnt, blksz);
+ for (unsigned int li = 0; li < lblkcnt; li++) {
+ l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+ zap_fat_write_leaf_init(&l, li, prefixlen);
+ }
+ zaphdr->zap_num_leafs = lblkcnt;
+ zaphdr->zap_freeblk = lblkcnt + 1;
+
+ /*
+ * For each entry, figure out which leaf block it belongs to based on
+ * the upper bits of its hash, allocate chunks from that leaf, and fill
+ * them out.
+ */
+ ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ struct zap_leaf_entry *le;
+ uint16_t *lptr;
+ uint64_t hi, li;
+ uint16_t namelen, nchunks, nnamechunks, nvalchunks;
+
+ hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
+ li = ZAP_HASH_IDX(ent->hash, prefixlen);
+ assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
+ ptrhasht[hi] = li + 1;
+ l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+
+ namelen = strlen(ent->name) + 1;
+
+ /*
+ * How many leaf chunks do we need for this entry?
+ */
+ nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
+ nvalchunks = howmany(ent->intcnt,
+ ZAP_LEAF_ARRAY_BYTES / ent->intsz);
+ nchunks = 1 + nnamechunks + nvalchunks;
+
+ /*
+ * Allocate a run of free leaf chunks for this entry,
+ * potentially extending a hash chain.
+ */
+ assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
+ l.l_phys->l_hdr.lh_nfree -= nchunks;
+ l.l_phys->l_hdr.lh_nentries++;
+ lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
+ while (*lptr != 0xffff) {
+ assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
+ le = ZAP_LEAF_ENTRY(&l, *lptr);
+ assert(le->le_type == ZAP_CHUNK_ENTRY);
+ le->le_cd++;
+ lptr = &le->le_next;
+ }
+ *lptr = l.l_phys->l_hdr.lh_freelist;
+ l.l_phys->l_hdr.lh_freelist += nchunks;
+ assert(l.l_phys->l_hdr.lh_freelist <=
+ ZAP_LEAF_NUMCHUNKS(&l));
+ if (l.l_phys->l_hdr.lh_freelist ==
+ ZAP_LEAF_NUMCHUNKS(&l))
+ l.l_phys->l_hdr.lh_freelist = 0xffff;
+
+ /*
+ * Integer values must be stored in big-endian format.
+ */
+ switch (ent->intsz) {
+ case 1:
+ break;
+ case 2:
+ for (uint16_t *v = ent->val16p;
+ v - ent->val16p < (ptrdiff_t)ent->intcnt;
+ v++)
+ *v = htobe16(*v);
+ break;
+ case 4:
+ for (uint32_t *v = ent->val32p;
+ v - ent->val32p < (ptrdiff_t)ent->intcnt;
+ v++)
+ *v = htobe32(*v);
+ break;
+ case 8:
+ for (uint64_t *v = ent->val64p;
+ v - ent->val64p < (ptrdiff_t)ent->intcnt;
+ v++)
+ *v = htobe64(*v);
+ break;
+ default:
+ assert(0);
+ }
+
+ /*
+ * Finally, write out the leaf chunks for this entry.
+ */
+ le = ZAP_LEAF_ENTRY(&l, *lptr);
+ assert(le->le_type == ZAP_CHUNK_FREE);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_next = 0xffff;
+ le->le_name_chunk = *lptr + 1;
+ le->le_name_numints = namelen;
+ le->le_value_chunk = *lptr + 1 + nnamechunks;
+ le->le_value_intlen = ent->intsz;
+ le->le_value_numints = ent->intcnt;
+ le->le_hash = ent->hash;
+ zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name);
+ zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
+ ent->intcnt * ent->intsz, ent->valp);
+ }
+
+ /*
+ * Initialize unused slots of the pointer table.
+ */
+ for (int i = 0; i < ptrcnt; i++)
+ if (ptrhasht[i] == 0)
+ ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;
+
+ /*
+ * Write the whole thing to disk.
+ */
+ dnode = zap->dnode;
+ dnode->dn_nblkptr = 1;
+ dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
+ dnode->dn_maxblkid = lblkcnt + 1;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+ c = dnode_cursor_init(zfs, zap->os, zap->dnode,
+ (lblkcnt + 1) * blksz, blksz);
+
+ loc = objset_space_alloc(zfs, zap->os, &blksz);
+ vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
+ dnode_cursor_next(zfs, c, 0));
+
+ for (uint64_t i = 0; i < lblkcnt; i++) {
+ loc = objset_space_alloc(zfs, zap->os, &blksz);
+ vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
+ blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
+ }
+
+ dnode_cursor_finish(zfs, c);
+
+ free(leafblks);
+}
+
+static void
+zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+ zfs_zap_entry_t *ent;
+
+ if (zap->micro) {
+ zap_micro_write(zfs, zap);
+ } else {
+ assert(!STAILQ_EMPTY(&zap->kvps));
+ assert(zap->kvpcnt > 0);
+ zap_fat_write(zfs, zap);
+ }
+
+ while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
+ STAILQ_REMOVE_HEAD(&zap->kvps, next);
+ if (ent->val64p != &ent->val64)
+ free(ent->valp);
+ free(ent->name);
+ free(ent);
+ }
+}
+
+static nvlist_t *
+pool_config_nvcreate(zfs_opt_t *zfs)
+{
+ nvlist_t *featuresnv, *poolnv;
+
+ poolnv = nvlist_create(NV_UNIQUE_NAME);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG_INITIAL);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
+ nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->guid);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->guid);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->guid);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
+
+ featuresnv = nvlist_create(NV_UNIQUE_NAME);
+ nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
+ nvlist_destroy(featuresnv);
+
+ return (poolnv);
+}
+
+static nvlist_t *
+pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
+{
+ nvlist_t *diskvdevnv;
+
+ assert(zfs->objarrid != 0);
+
+ diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
+ nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->guid);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
+ nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG_INITIAL);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ zfs->objarrid);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ zfs->msshift);
+
+ return (diskvdevnv);
+}
+
+static nvlist_t *
+pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
+{
+ nvlist_t *diskvdevnv, *rootvdevnv;
+
+ diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
+ rootvdevnv = nvlist_create(NV_UNIQUE_NAME);
+
+ nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
+ nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->guid);
+ nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
+ nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG_INITIAL);
+ nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
+ 1);
+ nvlist_destroy(diskvdevnv);
+
+ return (rootvdevnv);
+}
+
+/*
+ * Create the pool's "config" object, which contains an nvlist describing pool
+ * parameters and the vdev topology. It is similar but not identical to the
+ * nvlist stored in vdev labels. The main difference is that vdev labels do not
+ * describe the full vdev tree and in particular do not contain the "root"
+ * meta-vdev.
+ */
+static void
+pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ dnode_phys_t *dnode;
+ nvlist_t *poolconfig, *vdevconfig;
+ zfs_objset_t *mos;
+ void *configbuf;
+ uint64_t dnid;
+ off_t configloc, configblksz;
+ int error;
+
+ mos = &zfs->mos;
+
+ dnode = objset_dnode_bonus_alloc(mos, DMU_OT_PACKED_NVLIST,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);
+
+ poolconfig = pool_config_nvcreate(zfs);
+
+ vdevconfig = pool_root_vdev_config_nvcreate(zfs);
+ nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
+ nvlist_destroy(vdevconfig);
+
+ error = nvlist_export(poolconfig);
+ if (error != 0)
+ errc(1, error, "nvlist_export");
+
+ configblksz = nvlist_size(poolconfig);
+ configloc = objset_space_alloc(zfs, mos, &configblksz);
+ configbuf = ecalloc(1, configblksz);
+ nvlist_copy(poolconfig, configbuf, configblksz);
+
+ vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);
+
+ dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+ *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);
+
+ zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);
+
+ nvlist_destroy(poolconfig);
+ free(configbuf);
+}
+
+/*
+ * Add two block pointer list objects, used for deferred frees. We don't do
+ * anything with them, but they need to be present or OpenZFS will refuse to
+ * import the pool.
+ */
+static void
+pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ zfs_objset_t *mos;
+ uint64_t dnid;
+
+ mos = &zfs->mos;
+
+ (void)objset_dnode_bonus_alloc(mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
+ BPOBJ_SIZE_V2, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
+
+ (void)objset_dnode_bonus_alloc(mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
+ BPOBJ_SIZE_V2, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
+}
+
+/*
+ * Add required feature metadata objects. We don't know anything about ZFS
+ * features, so the objects are just empty ZAPs.
+ */
+static void
+pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ zfs_zap_t zap;
+ zfs_objset_t *mos;
+ dnode_phys_t *dnode;
+ uint64_t dnid;
+
+ mos = &zfs->mos;
+
+ dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
+ zap_init(&zap, mos, dnode);
+ zap_write(zfs, &zap);
+
+ dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
+ zap_init(&zap, mos, dnode);
+ zap_write(zfs, &zap);
+
+ dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
+ zap_init(&zap, mos, dnode);
+ zap_write(zfs, &zap);
+}
+
+static void
+pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ uint64_t id;
+
+ id = zfs->rootdsldir.dirid;
+ assert(id > 0);
+ zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, id);
+}
+
+static void
+pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ dnode_phys_t *dnode;
+ uint64_t id;
+
+ dnode = objset_dnode_alloc(&zfs->mos, DMU_OT_POOL_PROPS, &id);
+ zap_init(&zfs->poolprops, &zfs->mos, dnode);
+ zap_add_uint64(objdir, DMU_POOL_PROPS, id);
+}
+
+/*
+ * Initialize the MOS object directory, the root of virtually all of the pool's
+ * data and metadata.
+ */
+static void
+pool_init_objdir(zfs_opt_t *zfs)
+{
+ zfs_zap_t zap;
+ dnode_phys_t *objdir;
+
+ objdir = objset_dnode_lookup(&zfs->mos, DMU_POOL_DIRECTORY_OBJECT);
+
+ zap_init(&zap, &zfs->mos, objdir);
+ pool_init_objdir_config(zfs, &zap);
+ pool_init_objdir_bplists(zfs, &zap);
+ pool_init_objdir_feature_maps(zfs, &zap);
+ pool_init_objdir_dsl(zfs, &zap);
+ pool_init_objdir_poolprops(zfs, &zap);
+ zap_write(zfs, &zap);
+}
+
+/*
+ * Initialize the meta-object set and immediately write out several special
+ * objects whose contents are already finalized, including the object directory.
+ */
+static void
+pool_init(zfs_opt_t *zfs)
+{
+ struct dataset_desc *d;
+ zfs_objset_t *mos;
+ uint64_t dnid, dnodecount;
+
+ zfs->guid = 0xdeadfacec0debeef;
+
+ mos = &zfs->mos;
+
+ /*
+ * Figure out how many dnodes will be allocated from the MOS.
+ */
+ dnodecount = 0;
+ dnodecount++; /* object directory (ZAP) */
+ dnodecount++; /* |-> vdev config object (nvlist) */
+ dnodecount++; /* |-> features for read */
+ dnodecount++; /* |-> features for write */
+ dnodecount++; /* |-> feature descriptions */
+ dnodecount++; /* |-> sync bplist */
+ dnodecount++; /* |-> free bplist */
+ dnodecount++; /* |-> pool properties */
+ dnodecount++; /* L-> root DSL directory */
+ dnodecount++; /* |-> DSL child directory (ZAP) */
+ dnodecount++; /* | |-> $MOS (DSL dir) */
+ dnodecount++; /* | | |-> child map */
+ dnodecount++; /* | | L-> props (ZAP) */
+ dnodecount++; /* | |-> $FREE (DSL dir) */
+ dnodecount++; /* | | |-> child map */
+ dnodecount++; /* | | L-> props (ZAP) */
+ dnodecount++; /* | L-> $ORIGIN (DSL dir) */
+ dnodecount++; /* | |-> child map */
+ dnodecount++; /* | |-> dataset */
+ dnodecount++; /* | | L-> deadlist */
+ dnodecount++; /* | |-> snapshot */
+ dnodecount++; /* | | |-> deadlist */
+ dnodecount++; /* | | L-> snapshot names */
+ dnodecount++; /* | |-> props (ZAP) */
+ dnodecount++; /* | L-> clones (ZAP) */
+ dnodecount++; /* |-> DSL root dataset */
+ dnodecount++; /* | |-> snapshot names */
+ dnodecount++; /* | L-> deadlist */
+ dnodecount++; /* L-> props (ZAP) */
+ /*
+ * Space map stuff.
+ */
+ dnodecount++; /* space map object array */
+ dnodecount += zfs->mscount; /* space maps */
+ /*
+ * Child datasets.
+ */
+ STAILQ_FOREACH(d, &zfs->datasets, next) {
+ char buf[BUFSIZ];
+
+ /* Ugly hack to skip over root dataset parameters. */
+ snprintf(buf, sizeof(buf), "%s:", zfs->poolname);
+ if (strncmp(buf, d->params, strlen(buf)) == 0)
+ continue;
+
+ dnodecount++; /* DSL directory */
+ dnodecount++; /* |-> DSL dataset */
+ dnodecount++; /* | |-> snapshot names */
+ dnodecount++; /* | L-> deadlist */
+ dnodecount++; /* |-> child map */
+ dnodecount++; /* |-> props */
+ }
+
+ objset_init(zfs, mos, DMU_OST_META, dnodecount);
+
+ (void)objset_dnode_alloc(mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
+ assert(dnid == DMU_POOL_DIRECTORY_OBJECT);
+
+ (void)objset_dnode_alloc(mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);
+
+ dsl_init(zfs);
+
+ pool_init_objdir(zfs);
+}
+
+static void
+pool_labels_write(zfs_opt_t *zfs)
+{
+ uberblock_t *ub;
+ vdev_label_t *label;
+ nvlist_t *poolconfig, *vdevconfig;
+ int error;
+
+ label = ecalloc(1, sizeof(*label));
+
+ /*
+ * Assemble the vdev configuration and store it in the label.
+ */
+ poolconfig = pool_config_nvcreate(zfs);
+ vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
+ nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
+ nvlist_destroy(vdevconfig);
+
+ error = nvlist_export(poolconfig);
+ if (error != 0)
+ errc(1, error, "nvlist_export");
+ nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
+ sizeof(label->vl_vdev_phys.vp_nvlist));
+ nvlist_destroy(poolconfig);
+
+ /*
+ * Fill out the uberblock. Just make each one the same. The embedded
+ * checksum is calculated in vdev_label_write().
+ */
+ for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
+ uoff += (1 << zfs->ashift)) {
+ ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_version = SPA_VERSION;
+ ub->ub_txg = TXG_INITIAL;
+ ub->ub_guid_sum = zfs->guid + zfs->guid; /* root + disk */
+ ub->ub_timestamp = 0; /* XXX-MJ */
+
+ ub->ub_software_version = SPA_VERSION;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ ub->ub_mmp_delay = 0;
+ ub->ub_mmp_config = 0;
+ ub->ub_checkpoint_txg = 0;
+ memcpy(&ub->ub_rootbp, &zfs->mos.osbp, sizeof(blkptr_t));
+ }
+
+ /*
+ * Write out four copies of the label: two at the beginning of the vdev
+ * and two at the end.
+ */
+ for (int i = 0; i < VDEV_LABELS; i++)
+ vdev_label_write(zfs, i, label);
+
+ free(label);
+}
+
+static void
+pool_fini(zfs_opt_t *zfs)
+{
+ zap_write(zfs, &zfs->poolprops);
+ dsl_write(zfs);
+ objset_mos_write(zfs);
+ pool_labels_write(zfs);
+}
+
+/*
+ * Visit each node in a directory hierarchy, in pre-order depth-first order.
+ */
+static void
+fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
+{
+ assert(root->type == S_IFDIR);
+
+ for (fsnode *cur = root; cur != NULL; cur = cur->next) {
+ assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
+ cur->type == S_IFLNK);
+
+ if (cb(cur, arg) == 0)
+ continue;
+ if (cur->type == S_IFDIR && cur->child != NULL)
+ fsnode_foreach(cur->child, cb, arg);
+ }
+}
+
+static bool
+fsnode_isroot(const fsnode *cur)
+{
+ return (strcmp(cur->name, ".") == 0);
+}
+
+static struct dnode_cursor *
+dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
+ off_t size, off_t blksz)
+{
+ struct dnode_cursor *c;
+ uint64_t nbppindir, indlevel, ndatablks, nindblks;
+
+ assert(dnode->dn_nblkptr == 1);
+ assert(blksz <= MAXBLOCKSIZE);
+
+ if (blksz == 0) {
+ /* Must be between 1<<ashift and 128KB. */
+ blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
+ powerof2(size) ? size : (1ul << flsl(size))));
+ }
+ assert(powerof2(blksz));
+
+ /*
+ * Do we need indirect blocks? Figure out how many levels are needed
+ * (indlevel == 1 means no indirect blocks) and how much space is needed
+ * (it has to be allocated up-front to break the dependency cycle
+ * described in objset_mos_write()).
+ */
+ ndatablks = size == 0 ? 0 : howmany(size, blksz);
+ nindblks = 0;
+ for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
+ nbppindir *= BLKPTR_PER_INDIR;
+ nindblks += howmany(ndatablks, indlevel * nbppindir);
+ }
+ assert(indlevel < INDIR_LEVELS);
+
+ dnode->dn_nlevels = (uint8_t)indlevel;
+ dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
+ dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
+
+ c = ecalloc(1, sizeof(*c));
+ if (nindblks > 0) {
+ c->indspace = nindblks * MAXBLOCKSIZE;
+ c->indloc = objset_space_alloc(zfs, os, &c->indspace);
+ }
+ c->dnode = dnode;
+ c->dataoff = 0;
+ c->datablksz = blksz;
+
+ return (c);
+}
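+
+/*
+ * Worked example (editorial): BLKPTR_PER_INDIR is 1024 (128KB indirect
+ * blocks holding 128-byte block pointers), so a 1GB file with 128KB data
+ * blocks has ndatablks == 8192, the loop above settles on indlevel == 3,
+ * and nindblks == 9: eight L1 indirect blocks plus one L2 block
+ * referencing them.
+ */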
+
+static void
+_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
+{
+ blkptr_t *bp, *pbp;
+ void *buf;
+ uint64_t fill;
+ off_t blkid, blksz, loc;
+
+ assert(levels > 0);
+ assert(levels <= c->dnode->dn_nlevels - 1);
+
+ blksz = MAXBLOCKSIZE;
+ blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
+ for (int level = 1; level <= levels; level++) {
+ buf = c->inddir[level - 1];
+
+ if (level == c->dnode->dn_nlevels - 1) {
+ pbp = &c->dnode->dn_blkptr[0];
+ } else {
+ uint64_t iblkid;
+
+ iblkid = blkid & (BLKPTR_PER_INDIR - 1);
+ pbp = (blkptr_t *)
+ &c->inddir[level][iblkid * sizeof(blkptr_t)];
+ }
+
+ /*
+ * Space for indirect blocks is allocated up-front; see the
+ * comment in objset_mos_write().
+ */
+ loc = c->indloc;
+ c->indloc += blksz;
+ assert(c->indspace >= blksz);
+ c->indspace -= blksz;
+
+ bp = buf;
+ fill = 0;
+ for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
+ fill += BP_GET_FILL(&bp[i]);
+
+ vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
+ loc, pbp);
+ memset(buf, 0, MAXBLOCKSIZE);
+
+ blkid /= BLKPTR_PER_INDIR;
+ }
+}
+
+static blkptr_t *
+dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
+{
+ off_t blkid, l1id;
+ int levels;
+
+ if (c->dnode->dn_nlevels == 1) {
+ assert(off < MAXBLOCKSIZE);
+ return (&c->dnode->dn_blkptr[0]);
+ }
+
+ assert(off % c->datablksz == 0);
+
+ /* Do we need to flush any full indirect blocks? */
+ if (off > 0) {
+ blkid = off / c->datablksz;
+ for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
+ if (blkid % BLKPTR_PER_INDIR != 0)
+ break;
+ blkid /= BLKPTR_PER_INDIR;
+ }
+ if (levels > 0)
+ _dnode_cursor_flush(zfs, c, levels);
+ }
+
+ c->dataoff = off;
+ l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
+ return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
+}
+
+static void
+dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
+{
+ int levels;
+
+ levels = c->dnode->dn_nlevels - 1;
+ if (levels > 0)
+ _dnode_cursor_flush(zfs, c, levels);
+ assert(c->indspace == 0);
+ free(c);
+}
+
+struct fs_populate_dir {
+ SLIST_ENTRY(fs_populate_dir) next;
+ int dirfd;
+ uint64_t objid;
+ zfs_zap_t zap;
+};
+
+struct fs_populate_arg {
+ zfs_opt_t *zfs;
+ zfs_fs_t *fs; /* owning filesystem */
+ int dirfd; /* current directory fd */
+ uint64_t rootdirid; /* root directory dnode ID */
+ SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */
+};
+
+static void
+fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
+{
+ struct fs_populate_dir *dir;
+ uint64_t type;
+
+ switch (cur->type) {
+ case S_IFREG:
+ type = DT_REG;
+ break;
+ case S_IFDIR:
+ type = DT_DIR;
+ break;
+ case S_IFLNK:
+ type = DT_LNK;
+ break;
+ default:
+ assert(0);
+ }
+
+ dir = SLIST_FIRST(&arg->dirs);
+ zap_add_uint64(&dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
+}
+
+static void
+fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
+ size_t *szp)
+{
+ assert(ind < fs->sacnt);
+ assert(fs->saoffs[ind] != 0xffff);
+
+ memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
+ *szp += fs->satab[ind].size;
+}
+
+static void
+fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
+ size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
+{
+ assert(ind < fs->sacnt);
+ assert(fs->saoffs[ind] != 0xffff);
+ assert(fs->satab[ind].size == 0);
+
+ memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
+ *szp += valsz;
+}
+
+static void
+fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
+ dnode_phys_t *dnode)
+{
+ char target[PATH_MAX];
+ zfs_fs_t *fs;
+ zfs_ace_hdr_t aces[3];
+ struct stat *sb;
+ sa_hdr_phys_t *sahdr;
+ uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
+ char *attrbuf;
+ size_t bonussz, hdrsz;
+ int layout;
+
+ assert(dnode->dn_bonustype == DMU_OT_SA);
+ assert(dnode->dn_nblkptr == 1);
+
+ fs = arg->fs;
+ sb = &cur->inode->st;
+
+ switch (cur->type) {
+ case S_IFREG:
+ layout = SA_LAYOUT_INDEX_DEFAULT;
+ links = cur->inode->nlink;
+ objsize = sb->st_size;
+ parent = SLIST_FIRST(&arg->dirs)->objid;
+ break;
+ case S_IFDIR:
+ layout = SA_LAYOUT_INDEX_DEFAULT;
+ links = 1; /* .. */
+ objsize = 1; /* .. */
+
+ /*
+ * The size of a ZPL directory is the number of entries
+ * (including "." and ".."), and the link count is the number of
+ * entries which are directories (including "." and "..").
+ */
+ for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
+ c != NULL; c = c->next) {
+ if (c->type == S_IFDIR)
+ links++;
+ objsize++;
+ }
+
+ /* The root directory is its own parent. */
+ parent = SLIST_EMPTY(&arg->dirs) ?
+ arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
+ break;
+ case S_IFLNK: {
+ ssize_t n;
+
+ if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
+ target, sizeof(target) - 1)) == -1)
+ err(1, "readlinkat(%s)", cur->name);
+ target[n] = '\0';
+
+ layout = SA_LAYOUT_INDEX_SYMLINK;
+ links = 1;
+ objsize = strlen(target);
+ parent = SLIST_FIRST(&arg->dirs)->objid;
+ break;
+ }
+ default:
+ assert(0);
+ }
+
+ daclcount = nitems(aces);
+ flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED |
+ ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */
+ gen = 1;
+ gid = sb->st_gid;
+ mode = sb->st_mode;
+ uid = sb->st_uid;
+
+ /* XXX-MJ need to review these */
+ memset(aces, 0, sizeof(aces));
+ aces[0].z_flags = ACE_OWNER;
+ aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ aces[0].z_access_mask = ACE_READ_DATA | ACE_WRITE_ATTRIBUTES |
+ ACE_WRITE_OWNER | ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS |
+ ACE_READ_ACL | ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+ ACE_SYNCHRONIZE;
+ aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
+ aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ aces[1].z_access_mask = ACE_READ_DATA | ACE_READ_ACL |
+ ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+ aces[2].z_flags = ACE_EVERYONE;
+ aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ aces[2].z_access_mask = ACE_READ_DATA | ACE_READ_ACL |
+ ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+
+ switch (layout) {
+ case SA_LAYOUT_INDEX_DEFAULT:
+ /* At most one variable-length attribute. */
+ hdrsz = sizeof(uint64_t);
+ break;
+ case SA_LAYOUT_INDEX_SYMLINK:
+ /* At most five variable-length attributes. */
+ hdrsz = sizeof(uint64_t) * 2;
+ break;
+ default:
+ assert(0);
+ }
+
+ sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
+ sahdr->sa_magic = SA_MAGIC;
+ SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);
+
+ bonussz = SA_HDR_SIZE(sahdr);
+ attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);
+
+ fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
+ fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
+ fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
+ fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
+ fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
+ fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
+ fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
+ fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
+ fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);
+
+ /*
+ * We deliberately set atime = mtime here to ensure that images are
+ * reproducible.
+ */
+ assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
+ assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
+ assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
+ assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
+
+ fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
+ ZPL_DACL_ACES, &bonussz);
+ sahdr->sa_lengths[0] = sizeof(aces);
+
+ if (cur->type == S_IFLNK) {
+ assert(layout == SA_LAYOUT_INDEX_SYMLINK);
+ /* Need to use a spill block pointer if the target is long. */
+ assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
+ fs_populate_varszattr(fs, attrbuf, target, objsize,
+ sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
+ sahdr->sa_lengths[1] = (uint16_t)objsize;
+ }
+
+ dnode->dn_bonuslen = bonussz;
+}
+
+static void
+fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
+{
+ struct dnode_cursor *c;
+ dnode_phys_t *dnode;
+ zfs_opt_t *zfs;
+ char *buf;
+ uint64_t dnid;
+ ssize_t n;
+ size_t bufsz;
+ off_t size, target;
+ int fd;
+
+ assert(cur->type == S_IFREG);
+ assert((cur->inode->flags & FI_ROOT) == 0);
+
+ zfs = arg->zfs;
+
+ assert(cur->inode->ino != 0);
+ if ((cur->inode->flags & FI_ALLOCATED) != 0) {
+ /*
+ * This is a hard link of an existing file.
+ *
+ * XXX-MJ need to check whether it crosses datasets, add a test
+ * case for that
+ */
+ fs_populate_dirent(arg, cur, cur->inode->ino);
+ return;
+ }
+
+ dnode = objset_dnode_bonus_alloc(arg->fs->os,
+ DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+ cur->inode->ino = dnid;
+ cur->inode->flags |= FI_ALLOCATED;
+
+ fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY);
+ if (fd == -1)
+ err(1, "openat(%s)", cur->name);
+
+ buf = zfs->filebuf;
+ bufsz = sizeof(zfs->filebuf);
+ size = cur->inode->st.st_size;
+ c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
+ for (off_t foff = 0; foff < size; foff += target) {
+ off_t loc, sofar;
+
+ /* Fill up our buffer, handling partial reads. */
+ sofar = 0;
+ target = MIN(size - foff, (off_t)bufsz);
+ do {
+ n = read(fd, buf + sofar, target);
+ if (n < 0)
+ err(1, "reading from '%s'", cur->name);
+ if (n == 0)
+ errx(1, "unexpected EOF reading '%s'",
+ cur->name);
+ sofar += n;
+ } while (sofar < target);
+
+ if (target < (off_t)bufsz)
+ memset(buf + target, 0, bufsz - target);
+
+ loc = objset_space_alloc(zfs, arg->fs->os, &target);
+ vdev_pwrite_dnode_indir(zfs, c->dnode, 0, 1, buf, target, loc,
+ dnode_cursor_next(zfs, c, foff));
+ }
+ if (close(fd) != 0)
+ err(1, "close");
+ dnode_cursor_finish(zfs, c);
+
+ fs_populate_sattrs(arg, cur, dnode);
+ fs_populate_dirent(arg, cur, dnid);
+}
+
+static void
+fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
+{
+ dnode_phys_t *dnode;
+ zfs_objset_t *os;
+ uint64_t dnid;
+ int dirfd;
+
+ assert(cur->type == S_IFDIR);
+ assert((cur->inode->flags & FI_ALLOCATED) == 0);
+
+ os = arg->fs->os;
+
+ dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_SA, 0, &dnid);
+
+ /*
+ * Add an entry to the parent directory and open this directory.
+ */
+ if (!SLIST_EMPTY(&arg->dirs)) {
+ fs_populate_dirent(arg, cur, dnid);
+ dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
+ O_DIRECTORY);
+ if (dirfd < 0)
+ err(1, "open(%s)", cur->name);
+ } else {
+ arg->rootdirid = dnid;
+ dirfd = arg->dirfd;
+ }
+
+ fs_populate_sattrs(arg, cur, dnode);
+
+ /*
+ * If this directory is the root of another dataset, then its children
+ * belong to that dataset and this directory remains empty in the current
+ * objset.
+ */
+ if ((cur->inode->flags & FI_ROOT) == 0) {
+ struct fs_populate_dir *dir;
+
+ dir = ecalloc(1, sizeof(*dir));
+ dir->dirfd = dirfd;
+ dir->objid = dnid;
+ zap_init(&dir->zap, os, dnode);
+ SLIST_INSERT_HEAD(&arg->dirs, dir, next);
+ } else {
+ zfs_zap_t dirzap;
+
+ zap_init(&dirzap, os, dnode);
+ zap_write(arg->zfs, &dirzap);
+
+ fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
+ }
+}
+
+static void
+fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
+{
+ dnode_phys_t *dnode;
+ uint64_t dnid;
+
+ assert(cur->type == S_IFLNK);
+ assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
+
+ dnode = objset_dnode_bonus_alloc(arg->fs->os,
+ DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+
+ fs_populate_dirent(arg, cur, dnid);
+
+ fs_populate_sattrs(arg, cur, dnode);
+}
+
+static int
+fs_foreach_populate(fsnode *cur, void *_arg)
+{
+ struct fs_populate_arg *arg;
+ struct fs_populate_dir *dir;
+ int ret;
+
+ arg = _arg;
+ switch (cur->type) {
+ case S_IFREG:
+ fs_populate_file(cur, arg);
+ break;
+ case S_IFDIR:
+ if (fsnode_isroot(cur))
+ break;
+ fs_populate_dir(cur, arg);
+ break;
+ case S_IFLNK:
+ fs_populate_symlink(cur, arg);
+ break;
+ default:
+ assert(0);
+ }
+
+ ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
+
+ if (cur->next == NULL &&
+ (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
+ /*
+ * We reached a terminal node in a subtree. Walk back up and
+ * write out directories. We're done once we hit the root of a
+ * dataset or find a level where we're not on the edge of the
+ * tree.
+ */
+ do {
+ dir = SLIST_FIRST(&arg->dirs);
+ SLIST_REMOVE_HEAD(&arg->dirs, next);
+ zap_write(arg->zfs, &dir->zap);
+ if (dir->dirfd != -1 && close(dir->dirfd) != 0)
+ err(1, "close");
+ free(dir);
+ cur = cur->parent;
+ } while (cur != NULL && cur->next == NULL &&
+ (cur->inode->flags & FI_ROOT) == 0);
+ }
+
+ return (ret);
+}
+
+static void
+fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
+ const sa_attr_type_t layout[], size_t sacnt)
+{
+ char ti[16];
+
+ assert(sizeof(layout[0]) == 2);
+
+ snprintf(ti, sizeof(ti), "%u", index);
+ zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
+ (const uint8_t *)layout);
+}
+
+/*
+ * Initialize system attribute tables.
+ *
+ * There are two elements to this. First, we write the zpl_attrs[] and
+ * zpl_attr_layout[] tables to disk. Then we create a lookup table which
+ * allows us to set file attributes quickly.
+ */
+static uint64_t
+fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
+{
+ zfs_zap_t sazap, salzap, sarzap;
+ zfs_objset_t *os;
+ dnode_phys_t *saobj, *salobj, *sarobj;
+ uint64_t saobjid, salobjid, sarobjid;
+ uint16_t offset;
+
+ os = fs->os;
+
+ /*
+ * The on-disk tables are stored in two ZAP objects, the registry object
+ * and the layout object. Individual attributes are described by
+ * entries in the registry object; for example, the value for the
+ * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
+ * The attributes of a file are ordered according to one of the layouts
+ * defined in the layout object. The master node object is simply used
+ * to locate the registry and layout objects.
+ */
+ saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
+ salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
+ sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
+
+ zap_init(&sarzap, os, sarobj);
+ for (size_t i = 0; i < nitems(zpl_attrs); i++) {
+ const zfs_sattr_t *sa;
+ uint64_t attr;
+
+ attr = 0;
+ sa = &zpl_attrs[i];
+ SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
+ zap_add_uint64(&sarzap, sa->name, attr);
+ }
+ zap_write(zfs, &sarzap);
+
+ /*
+ * Layouts are arrays of indices into the registry. We define two
+ * layouts for use by the ZPL, one for non-symlinks and one for
+ * symlinks. They are identical except that the symlink layout includes
+ * ZPL_SYMLINK as its final attribute.
+ */
+ zap_init(&salzap, os, salobj);
+ assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
+ fs_add_zpl_attr_layout(&salzap, SA_LAYOUT_INDEX_DEFAULT,
+ zpl_attr_layout, nitems(zpl_attr_layout) - 1);
+ fs_add_zpl_attr_layout(&salzap, SA_LAYOUT_INDEX_SYMLINK,
+ zpl_attr_layout, nitems(zpl_attr_layout));
+ zap_write(zfs, &salzap);
+
+ zap_init(&sazap, os, saobj);
+ zap_add_uint64(&sazap, SA_LAYOUTS, salobjid);
+ zap_add_uint64(&sazap, SA_REGISTRY, sarobjid);
+ zap_write(zfs, &sazap);
+
+ /* Sanity check. */
+ for (size_t i = 0; i < nitems(zpl_attrs); i++)
+ assert(i == zpl_attrs[i].id);
+
+ /*
+ * Build the offset table used when setting file attributes. File
+ * attributes are stored in the object's bonus buffer; this table
+ * provides the buffer offset of attributes referenced by the layout
+ * table.
+ */
+ fs->sacnt = nitems(zpl_attrs);
+ fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
+ for (size_t i = 0; i < fs->sacnt; i++)
+ fs->saoffs[i] = 0xffff;
+ offset = 0;
+ for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
+ uint16_t size;
+
+ assert(zpl_attr_layout[i] < fs->sacnt);
+
+ fs->saoffs[zpl_attr_layout[i]] = offset;
+ size = zpl_attrs[zpl_attr_layout[i]].size;
+ offset += size;
+ }
+ fs->satab = zpl_attrs;
+
+ return (saobjid);
+}
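+
+/*
+ * Editorial note: after this setup, fs->saoffs[ZPL_MODE] (for example)
+ * gives the byte offset of the mode attribute within a file's SA bonus
+ * buffer for the default layout; fs_populate_attr() simply copies each
+ * value to attrbuf + fs->saoffs[ind].
+ */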
+
+static void
+fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
+{
+ char *mountpoint, *origmountpoint, *name, *next;
+ fsnode *cur, *root;
+ uint64_t canmount;
+
+ if (dsldir->headds == NULL)
+ return;
+
+ mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
+ if (mountpoint == NULL)
+ return;
+ if (nvlist_find_uint64(dsldir->propsnv, "canmount", &canmount) == 0 &&
+ canmount == 0)
+ return;
+
+ /*
+ * If we were asked to specify a bootfs, set it here.
+ */
+ if (zfs->bootfs != NULL && strcmp(zfs->bootfs, dsldir->fullname) == 0)
+ zap_add_uint64(&zfs->poolprops, "bootfs", dsldir->headds->dsid);
+
+ origmountpoint = mountpoint;
+
+ /*
+ * Figure out which fsnode corresponds to our mountpoint.
+ */
+ root = arg;
+ cur = root;
+ if (strcmp(mountpoint, zfs->rootpath) != 0) {
+ mountpoint += strlen(zfs->rootpath);
+
+ /*
+ * Look up the directory in the staged tree. For example, if
+ * the dataset's mount point is /foo/bar/baz, we'll search the
+ * root directory for "foo", search "foo" for "bar", and so on.
+ * Each intermediate name must refer to a directory; the final
+ * component need not exist.
+ */
+ cur = root;
+ for (next = name = mountpoint; next != NULL;) {
+ for (; *next == '/'; next++)
+ ;
+ name = strsep(&next, "/");
+
+ for (; cur != NULL && strcmp(cur->name, name) != 0;
+ cur = cur->next)
+ ;
+ if (cur == NULL) {
+ if (next == NULL)
+ break;
+ errx(1, "missing mountpoint directory for `%s'",
+ dsldir->fullname);
+ }
+ if (cur->type != S_IFDIR) {
+ errx(1,
+ "mountpoint for `%s' is not a directory",
+ dsldir->fullname);
+ }
+ if (next != NULL)
+ cur = cur->child;
+ }
+ }
+
+ if (cur != NULL) {
+ assert(cur->type == S_IFDIR);
+
+ /*
+ * Multiple datasets shouldn't share a mountpoint. It's
+ * technically allowed, but it's not clear what makefs should do
+ * in that case.
+ */
+ assert((cur->inode->flags & FI_ROOT) == 0);
+ if (cur != root)
+ cur->inode->flags |= FI_ROOT;
+ assert(cur->inode->param == NULL);
+ cur->inode->param = dsldir;
+ }
+
+ free(origmountpoint);
+}
+
+static int
+fs_foreach_count(fsnode *cur, void *arg)
+{
+ uint64_t *countp;
+
+ countp = arg;
+ if (cur->type == S_IFDIR && fsnode_isroot(cur))
+ return (1);
+
+ if (cur->inode->ino == 0) {
+ cur->inode->ino = ++(*countp);
+ cur->inode->nlink = 1;
+ } else {
+ cur->inode->nlink++;
+ }
+
+ return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
+}
+
+/*
+ * Create a filesystem dataset. More specifically:
+ * - create an object set for the dataset,
+ * - add required metadata (SA tables, property definitions, etc.) to that
+ * object set,
+ * - optionally populate the object set with file objects, using "root" as the
+ * root directory.
+ *
+ * "dirfd" is a directory descriptor for the directory referenced by "root". It
+ * is closed before returning.
+ */
+static void
+fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
+{
+ struct fs_populate_arg arg;
+ zfs_fs_t fs;
+ zfs_zap_t deleteqzap, masterzap;
+ zfs_objset_t *os;
+ dnode_phys_t *deleteq, *masterobj;
+ uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
+ bool fakedroot;
+
+ if (root != NULL) {
+ assert(root->type == S_IFDIR);
+ assert(fsnode_isroot(root));
+ }
+
+ os = ecalloc(1, sizeof(*os));
+
+ memset(&fs, 0, sizeof(fs));
+ fs.os = os;
+
+ /*
+ * If the dataset's mountpoint doesn't exist in the staging tree, fake up
+ * a root fsnode to handle that case.
+ */
+ fakedroot = root == NULL;
+ if (fakedroot) {
+ struct stat *stp;
+
+ assert(dirfd == -1);
+
+ root = ecalloc(1, sizeof(*root));
+ root->inode = ecalloc(1, sizeof(*root->inode));
+ root->name = estrdup(".");
+ root->type = S_IFDIR;
+
+ stp = &root->inode->st;
+ stp->st_uid = 0;
+ stp->st_gid = 0;
+ stp->st_mode = S_IFDIR | 0755;
+ }
+
+ /*
+ * How many dnodes do we need? One for each file/directory/symlink plus
+ * several metadata objects.
+ */
+ dnodecount = 1; /* root directory */
+ fsnode_foreach(root, fs_foreach_count, &dnodecount);
+ dnodecount++; /* master object */
+ dnodecount++; /* delete queue */
+ dnodecount++; /* system attributes master node */
+ dnodecount++; /* system attributes registry */
+ dnodecount++; /* system attributes layout */
+
+ objset_init(zfs, os, DMU_OST_ZFS, dnodecount);
+ masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
+ assert(moid == MASTER_NODE_OBJ);
+
+ /*
+ * Create the ZAP SA layout now since filesystem object dnodes will
+ * refer to those attributes.
+ */
+ saobjid = fs_set_zpl_attrs(zfs, &fs);
+
+ /*
+ * Populate the dataset with files from the staging directory. Most of
+ * our runtime is spent here.
+ */
+ arg.dirfd = dirfd;
+ arg.zfs = zfs;
+ arg.fs = &fs;
+ SLIST_INIT(&arg.dirs);
+ fs_populate_dir(root, &arg);
+ assert(!SLIST_EMPTY(&arg.dirs));
+ fsnode_foreach(root, fs_foreach_populate, &arg);
+ assert(SLIST_EMPTY(&arg.dirs));
+ rootdirid = arg.rootdirid;
+
+ /*
+ * Create an empty delete queue. We don't do anything with it, but
+ * OpenZFS will refuse to mount filesystems that don't have one.
+ */
+ deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
+ zap_init(&deleteqzap, os, deleteq);
+ zap_write(zfs, &deleteqzap);
+
+ /*
+ * Populate and write the master node object. This is a ZAP object
+ * containing various dataset properties and the object IDs of the root
+ * directory and delete queue.
+ */
+ zap_init(&masterzap, os, masterobj);
+ zap_add_uint64(&masterzap, ZFS_ROOT_OBJ, rootdirid);
+ zap_add_uint64(&masterzap, ZFS_UNLINKED_SET, deleteqid);
+ zap_add_uint64(&masterzap, ZFS_SA_ATTRS, saobjid);
+ zap_add_uint64(&masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
+ zap_add_uint64(&masterzap, "normalization", 0 /* off */);
+ zap_add_uint64(&masterzap, "utf8only", 0 /* off */);
+ zap_add_uint64(&masterzap, "casesensitivity", 0 /* case sensitive */);
+ zap_add_uint64(&masterzap, "acltype", 2 /* NFSv4 */);
+ zap_write(zfs, &masterzap);
+
+ /*
+ * All finished with this object set, we may as well write it now.
+ * The DSL layer will sum up the bytes consumed by each dataset using
+ * information stored in the object set, so it can't be freed just yet.
+ */
+ assert(dsldir != NULL);
+ dsldir->headds->os = os;
+ objset_write(zfs, os);
+
+ if (fakedroot) {
+ free(root->inode);
+ free(root->name);
+ free(root);
+ }
+ free(fs.saoffs);
+}
+
+static void
+fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
+{
+ if (dsldir->headds == NULL)
+ return;
+ if (dsldir->headds->os != NULL)
+ return;
+ fs_build_one(zfs, dsldir, NULL, -1);
+}
+
+/*
+ * Create our datasets and populate them with files.
+ */
+static void
+fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
+{
+ /*
+ * Run through our datasets and find the root fsnode for each one. Each
+ * root fsnode is flagged so that we can figure out which dataset it
+ * belongs to.
+ */
+ dsl_dir_foreach(zfs, &zfs->rootdsldir, fs_layout_one, root);
+
+ /*
+ * Did we find our boot filesystem?
+ */
+ if (zfs->bootfs != NULL && !zap_entry_exists(&zfs->poolprops, "bootfs"))
+ errx(1, "no mounted dataset matches bootfs property `%s'",
+ zfs->bootfs);
+
+ /*
+ * Traverse the file hierarchy starting from the root fsnode. One
+ * dataset, not necessarily the root dataset, must "own" the root
+ * directory by having its mountpoint be equal to the root path.
+ *
+ * As roots of other datasets are encountered during the traversal,
+ * fs_build_one() recursively creates the corresponding object sets and
+ * populates them. Once this function has returned, all datasets will
+ * have been fully populated.
+ */
+ fs_build_one(zfs, root->inode->param, root, dirfd);
+
+ /*
+ * Now create object sets for datasets whose mountpoints weren't found
+ * in the staging directory, either because there is no mountpoint, or
+ * because the mountpoint doesn't correspond to an existing directory.
+ */
+ dsl_dir_foreach(zfs, &zfs->rootdsldir, fs_build_unmounted, NULL);
+}
+
+/*
+ * The entry point to all other code in this file.
+ */
+void
+zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs;
+ int dirfd;
+
+ zfs = fsopts->fs_specific;
+
+ /*
+ * Use a fixed seed to provide reproducible pseudo-random numbers for
+ * on-disk structures when needed (e.g., ZAP hash salts).
+ */
+ srandom(1729);
+
+ zfs_check_opts(fsopts);
+
+ dirfd = open(dir, O_DIRECTORY | O_RDONLY);
+ if (dirfd < 0)
+ err(1, "open(%s)", dir);
+
+ vdev_init(zfs, fsopts->maxsize, image);
+ pool_init(zfs);
+ fs_build(zfs, dirfd, root);
+ pool_fini(zfs);
+ vdev_fini(zfs);
+}
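+
+/*
+ * Example invocation (editorial sketch; the image size, pool name and
+ * staging path are illustrative):
+ *
+ *   makefs -t zfs -s 4g -o poolname=test -o rootpath=/ \
+ *       -o fs=test:mountpoint=/ zfs.img ./staging
+ */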
Index: usr.sbin/makefs/zfs/Makefile.inc
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/Makefile.inc
@@ -0,0 +1,5 @@
+.PATH: ${SRCDIR}/zfs
+
+SRCS+= nvlist.c
+
+CFLAGS.nvlist.c+= -Wno-cast-qual
Index: usr.sbin/makefs/zfs/nvlist.h
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/nvlist.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2012 Andriy Gapon <avg@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NVLIST_H_
+#define _NVLIST_H_
+
+/* nvp implementation version */
+#define NV_VERSION 0
+
+/* nvlist persistent unique name flags, stored in nvl_nvflags */
+#define NV_UNIQUE_NAME 0x1
+#define NV_UNIQUE_NAME_TYPE 0x2
+
+#define NV_ALIGN4(x) (((x) + 3) & ~3)
+#define NV_ALIGN(x) (((x) + 7) & ~7)
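+/* For example, NV_ALIGN4(5) == 8 and NV_ALIGN(9) == 16. */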
+
+/* nvlist pack encoding */
+#define NV_ENCODE_NATIVE 0
+#define NV_ENCODE_XDR 1
+
+typedef enum {
+ DATA_TYPE_UNKNOWN = 0,
+ DATA_TYPE_BOOLEAN,
+ DATA_TYPE_BYTE,
+ DATA_TYPE_INT16,
+ DATA_TYPE_UINT16,
+ DATA_TYPE_INT32,
+ DATA_TYPE_UINT32,
+ DATA_TYPE_INT64,
+ DATA_TYPE_UINT64,
+ DATA_TYPE_STRING,
+ DATA_TYPE_BYTE_ARRAY,
+ DATA_TYPE_INT16_ARRAY,
+ DATA_TYPE_UINT16_ARRAY,
+ DATA_TYPE_INT32_ARRAY,
+ DATA_TYPE_UINT32_ARRAY,
+ DATA_TYPE_INT64_ARRAY,
+ DATA_TYPE_UINT64_ARRAY,
+ DATA_TYPE_STRING_ARRAY,
+ DATA_TYPE_HRTIME,
+ DATA_TYPE_NVLIST,
+ DATA_TYPE_NVLIST_ARRAY,
+ DATA_TYPE_BOOLEAN_VALUE,
+ DATA_TYPE_INT8,
+ DATA_TYPE_UINT8,
+ DATA_TYPE_BOOLEAN_ARRAY,
+ DATA_TYPE_INT8_ARRAY,
+ DATA_TYPE_UINT8_ARRAY
+} data_type_t;
+
+/*
+ * nvlist header.
+ * An nvlist has a 4-byte header followed by the version and flags, then
+ * nvpairs; the list is terminated by a double zero.
+ */
+typedef struct {
+ char nvh_encoding;
+ char nvh_endian;
+ char nvh_reserved1;
+ char nvh_reserved2;
+} nvs_header_t;
+
+typedef struct {
+ nvs_header_t nv_header;
+ size_t nv_asize;
+ size_t nv_size;
+ uint8_t *nv_data;
+ uint8_t *nv_idx;
+} nvlist_t;
+
+/*
+ * nvpair header.
+ * An nvpair carries its encoded and decoded sizes, followed by the name
+ * string (size and data), the data type and number of elements, and
+ * finally the data itself.
+ */
+typedef struct {
+ unsigned encoded_size;
+ unsigned decoded_size;
+} nvp_header_t;
+
+/*
+ * nvlist stream head.
+ */
+typedef struct {
+ unsigned nvl_version;
+ unsigned nvl_nvflag;
+ nvp_header_t nvl_pair;
+} nvs_data_t;
+
+typedef struct {
+ unsigned nv_size;
+ uint8_t nv_data[]; /* NV_ALIGN4(string) */
+} nv_string_t;
+
+typedef struct {
+ unsigned nv_type; /* data_type_t */
+ unsigned nv_nelem; /* number of elements */
+ uint8_t nv_data[]; /* data stream */
+} nv_pair_data_t;
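+
+/*
+ * A packed pair holding, for example, a uint64 named "guid" is laid
+ * out as:
+ *
+ *	encoded_size | decoded_size | name size | "guid" (NV_ALIGN4) |
+ *	nv_type      | nv_nelem     | 8-byte value
+ *
+ * where every field other than the value occupies a 4-byte slot.
+ */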
+
+nvlist_t *nvlist_create(int);
+void nvlist_destroy(nvlist_t *);
+nvlist_t *nvlist_import(const char *, size_t);
+int nvlist_export(nvlist_t *);
+int nvlist_remove(nvlist_t *, const char *, data_type_t);
+int nvpair_type_from_name(const char *);
+nvp_header_t *nvpair_find(nvlist_t *, const char *);
+void nvpair_print(nvp_header_t *, unsigned int);
+void nvlist_print(const nvlist_t *, unsigned int);
+char *nvstring_get(nv_string_t *);
+int nvlist_find(const nvlist_t *, const char *, data_type_t,
+ int *, void *, int *);
+nvp_header_t *nvlist_next_nvpair(nvlist_t *, nvp_header_t *);
+
+int nvlist_add_boolean_value(nvlist_t *, const char *, bool);
+int nvlist_add_byte(nvlist_t *, const char *, uint8_t);
+int nvlist_add_int8(nvlist_t *, const char *, int8_t);
+int nvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+int nvlist_add_int16(nvlist_t *, const char *, int16_t);
+int nvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+int nvlist_add_int32(nvlist_t *, const char *, int32_t);
+int nvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+int nvlist_add_int64(nvlist_t *, const char *, int64_t);
+int nvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+int nvlist_add_string(nvlist_t *, const char *, const char *);
+int nvlist_add_boolean_array(nvlist_t *, const char *, bool *, uint32_t);
+int nvlist_add_byte_array(nvlist_t *, const char *, uint8_t *, uint32_t);
+int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint32_t);
+int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint32_t);
+int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint32_t);
+int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint32_t);
+int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint32_t);
+int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint32_t);
+int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint32_t);
+int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint32_t);
+int nvlist_add_string_array(nvlist_t *, const char *, char * const *, uint32_t);
+int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint32_t);
+
+#endif /* !_NVLIST_H_ */
Index: usr.sbin/makefs/zfs/nvlist.c
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/nvlist.c
@@ -0,0 +1,1699 @@
+/*-
+ * Copyright 2020 Toomas Soome <tsoome@me.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/stdint.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "zfs/nvlist.h"
+
+enum xdr_op {
+ XDR_OP_ENCODE = 1,
+ XDR_OP_DECODE = 2
+};
+
+typedef struct xdr {
+ enum xdr_op xdr_op;
+ int (*xdr_getint)(struct xdr *, int *);
+ int (*xdr_putint)(struct xdr *, int);
+ int (*xdr_getuint)(struct xdr *, unsigned *);
+ int (*xdr_putuint)(struct xdr *, unsigned);
+ const uint8_t *xdr_buf;
+ uint8_t *xdr_idx;
+ size_t xdr_buf_size;
+} xdr_t;
+
+static int nvlist_xdr_nvlist(xdr_t *, nvlist_t *);
+static bool nvlist_size_xdr(xdr_t *, size_t *);
+static bool nvlist_size_native(xdr_t *, size_t *);
+static bool xdr_int(xdr_t *, int *);
+static bool xdr_u_int(xdr_t *, unsigned *);
+
+typedef bool (*xdrproc_t)(xdr_t *, void *);
+
+/* Basic primitives for XDR translation operations, getint and putint. */
+static int
+_getint(struct xdr *xdr, int *ip)
+{
+ *ip = be32dec(xdr->xdr_idx);
+ return (sizeof(int));
+}
+
+static int
+_putint(struct xdr *xdr, int i)
+{
+ int *ip = (int *)xdr->xdr_idx;
+
+ *ip = htobe32(i);
+ return (sizeof(int));
+}
+
+static int
+_getuint(struct xdr *xdr, unsigned *ip)
+{
+ *ip = be32dec(xdr->xdr_idx);
+ return (sizeof(unsigned));
+}
+
+static int
+_putuint(struct xdr *xdr, unsigned i)
+{
+ unsigned *up = (unsigned *)xdr->xdr_idx;
+
+ *up = htobe32(i);
+ return (sizeof(int));
+}
+
+static int
+_getint_mem(struct xdr *xdr, int *ip)
+{
+ *ip = *(int *)xdr->xdr_idx;
+ return (sizeof(int));
+}
+
+static int
+_putint_mem(struct xdr *xdr, int i)
+{
+ int *ip = (int *)xdr->xdr_idx;
+
+ *ip = i;
+ return (sizeof(int));
+}
+
+static int
+_getuint_mem(struct xdr *xdr, unsigned *ip)
+{
+ *ip = *(unsigned *)xdr->xdr_idx;
+ return (sizeof(unsigned));
+}
+
+static int
+_putuint_mem(struct xdr *xdr, unsigned i)
+{
+ unsigned *up = (unsigned *)xdr->xdr_idx;
+
+ *up = i;
+ return (sizeof(int));
+}
+
+/*
+ * XDR data translations.
+ */
+static bool
+xdr_short(xdr_t *xdr, short *ip)
+{
+ int i;
+ bool rv;
+
+ i = *ip;
+ if ((rv = xdr_int(xdr, &i))) {
+ if (xdr->xdr_op == XDR_OP_DECODE)
+ *ip = i;
+ }
+ return (rv);
+}
+
+static bool
+xdr_u_short(xdr_t *xdr, unsigned short *ip)
+{
+ unsigned u;
+ bool rv;
+
+ u = *ip;
+ if ((rv = xdr_u_int(xdr, &u))) {
+ if (xdr->xdr_op == XDR_OP_DECODE)
+ *ip = u;
+ }
+ return (rv);
+}
+
+/*
+ * Translate the value at xdr->xdr_idx and advance the index by the
+ * size of an int.
+ */
+static bool
+xdr_int(xdr_t *xdr, int *ip)
+{
+ bool rv = false;
+ int *i = (int *)xdr->xdr_idx;
+
+ if (xdr->xdr_idx + sizeof(int) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *ip, store to buf */
+ xdr->xdr_idx += xdr->xdr_putint(xdr, *ip);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *ip */
+ xdr->xdr_idx += xdr->xdr_getint(xdr, i);
+ *ip = *i;
+ rv = true;
+ break;
+ }
+ return (rv);
+}
+
+/*
+ * Translate the value at xdr->xdr_idx and advance the index by the
+ * size of an unsigned int.
+ */
+static bool
+xdr_u_int(xdr_t *xdr, unsigned *ip)
+{
+ bool rv = false;
+ unsigned *u = (unsigned *)xdr->xdr_idx;
+
+ if (xdr->xdr_idx + sizeof(unsigned) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *ip, store to buf */
+ xdr->xdr_idx += xdr->xdr_putuint(xdr, *ip);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *ip */
+ xdr->xdr_idx += xdr->xdr_getuint(xdr, u);
+ *ip = *u;
+ rv = true;
+ break;
+ }
+ return (rv);
+}
+
+static bool
+xdr_int64(xdr_t *xdr, int64_t *lp)
+{
+ bool rv = false;
+
+ if (xdr->xdr_idx + sizeof(int64_t) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *lp, store to buf */
+ if (xdr->xdr_putint == _putint)
+ *(int64_t *)xdr->xdr_idx = htobe64(*lp);
+ else
+ *(int64_t *)xdr->xdr_idx = *lp;
+ xdr->xdr_idx += sizeof(int64_t);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *lp */
+ if (xdr->xdr_getint == _getint)
+ *lp = be64toh(*(int64_t *)xdr->xdr_idx);
+ else
+ *lp = *(int64_t *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(int64_t);
+ rv = true;
+ }
+ return (rv);
+}
+
+static bool
+xdr_uint64(xdr_t *xdr, uint64_t *lp)
+{
+ bool rv = false;
+
+ if (xdr->xdr_idx + sizeof(uint64_t) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *lp, store to buf */
+ if (xdr->xdr_putint == _putint)
+ *(uint64_t *)xdr->xdr_idx = htobe64(*lp);
+ else
+ *(uint64_t *)xdr->xdr_idx = *lp;
+ xdr->xdr_idx += sizeof(uint64_t);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *lp */
+ if (xdr->xdr_getuint == _getuint)
+ *lp = be64toh(*(uint64_t *)xdr->xdr_idx);
+ else
+ *lp = *(uint64_t *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(uint64_t);
+ rv = true;
+ }
+ return (rv);
+}
+
+static bool
+xdr_char(xdr_t *xdr, char *cp)
+{
+ int i;
+ bool rv = false;
+
+ i = *cp;
+ if ((rv = xdr_int(xdr, &i))) {
+ if (xdr->xdr_op == XDR_OP_DECODE)
+ *cp = i;
+ }
+ return (rv);
+}
+
+static bool
+xdr_string(xdr_t *xdr, nv_string_t *s)
+{
+ int size = 0;
+ bool rv = false;
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ size = s->nv_size;
+ if (xdr->xdr_idx + sizeof(unsigned) + NV_ALIGN4(size) >
+ xdr->xdr_buf + xdr->xdr_buf_size)
+ break;
+ xdr->xdr_idx += xdr->xdr_putuint(xdr, s->nv_size);
+ xdr->xdr_idx += NV_ALIGN4(size);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ if (xdr->xdr_idx + sizeof(unsigned) >
+ xdr->xdr_buf + xdr->xdr_buf_size)
+ break;
+ size = xdr->xdr_getuint(xdr, &s->nv_size);
+ size = NV_ALIGN4(size + s->nv_size);
+ if (xdr->xdr_idx + size > xdr->xdr_buf + xdr->xdr_buf_size)
+ break;
+ xdr->xdr_idx += size;
+ rv = true;
+ break;
+ }
+ return (rv);
+}
+
+static bool
+xdr_array(xdr_t *xdr, const unsigned nelem, const xdrproc_t elproc)
+{
+ bool rv = true;
+ unsigned c = nelem;
+
+ if (!xdr_u_int(xdr, &c))
+ return (false);
+
+ for (unsigned i = 0; i < nelem; i++) {
+ if (!elproc(xdr, xdr->xdr_idx))
+ return (false);
+ }
+ return (rv);
+}
+
+/*
+ * nvlist management functions.
+ */
+void
+nvlist_destroy(nvlist_t *nvl)
+{
+ if (nvl != NULL) {
+ /* Free data if it was allocated by us. */
+ if (nvl->nv_asize > 0)
+ free(nvl->nv_data);
+ }
+ free(nvl);
+}
+
+char *
+nvstring_get(nv_string_t *nvs)
+{
+ char *s;
+
+ s = malloc(nvs->nv_size + 1);
+ if (s != NULL) {
+ bcopy(nvs->nv_data, s, nvs->nv_size);
+ s[nvs->nv_size] = '\0';
+ }
+ return (s);
+}
+
+/*
+ * Create an empty nvlist.
+ * The nvlist is terminated by two zero words (8 bytes).
+ */
+nvlist_t *
+nvlist_create(int flag)
+{
+ nvlist_t *nvl;
+ nvs_data_t *nvs;
+
+ nvl = calloc(1, sizeof(*nvl));
+ if (nvl == NULL)
+ return (nvl);
+
+ nvl->nv_header.nvh_encoding = NV_ENCODE_XDR;
+ nvl->nv_header.nvh_endian = _BYTE_ORDER == _LITTLE_ENDIAN;
+
+ nvl->nv_asize = nvl->nv_size = sizeof(*nvs);
+ nvs = calloc(1, nvl->nv_asize);
+ if (nvs == NULL) {
+ free(nvl);
+ return (NULL);
+ }
+ /* The data in an nvlist is a byte stream. */
+ nvl->nv_data = (uint8_t *)nvs;
+
+ nvs->nvl_version = NV_VERSION;
+ nvs->nvl_nvflag = flag;
+ return (nvl);
+}
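+
+/*
+ * The empty list created above occupies sizeof(nvs_data_t) == 16
+ * bytes: version (4) + nvflag (4) + a terminating nvp_header_t of
+ * two zero words (4 + 4).
+ */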
+
+static bool
+nvlist_xdr_nvp(xdr_t *xdr, nvlist_t *nvl)
+{
+ nv_string_t *nv_string;
+ nv_pair_data_t *nvp_data;
+ nvlist_t nvlist;
+ unsigned type, nelem;
+ xdr_t nv_xdr;
+
+ nv_string = (nv_string_t *)xdr->xdr_idx;
+ if (!xdr_string(xdr, nv_string)) {
+ return (false);
+ }
+ nvp_data = (nv_pair_data_t *)xdr->xdr_idx;
+
+ type = nvp_data->nv_type;
+ nelem = nvp_data->nv_nelem;
+ if (!xdr_u_int(xdr, &type) || !xdr_u_int(xdr, &nelem))
+ return (false);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_NVLIST_ARRAY:
+ bzero(&nvlist, sizeof(nvlist));
+ nvlist.nv_data = xdr->xdr_idx;
+ nvlist.nv_idx = nvlist.nv_data;
+
+ /* Set up xdr for this nvlist. */
+ nv_xdr = *xdr;
+ nv_xdr.xdr_buf = nvlist.nv_data;
+ nv_xdr.xdr_idx = nvlist.nv_data;
+ nv_xdr.xdr_buf_size =
+ nvl->nv_data + nvl->nv_size - nvlist.nv_data;
+
+ for (unsigned i = 0; i < nelem; i++) {
+ if (xdr->xdr_op == XDR_OP_ENCODE) {
+ if (!nvlist_size_native(&nv_xdr,
+ &nvlist.nv_size))
+ return (false);
+ } else {
+ if (!nvlist_size_xdr(&nv_xdr,
+ &nvlist.nv_size))
+ return (false);
+ }
+ if (nvlist_xdr_nvlist(xdr, &nvlist) != 0)
+ return (false);
+
+ nvlist.nv_data = nv_xdr.xdr_idx;
+ nvlist.nv_idx = nv_xdr.xdr_idx;
+
+ nv_xdr.xdr_buf = nv_xdr.xdr_idx;
+ nv_xdr.xdr_buf_size =
+ nvl->nv_data + nvl->nv_size - nvlist.nv_data;
+ }
+ break;
+
+ case DATA_TYPE_BOOLEAN:
+ /* BOOLEAN does not take value space */
+ break;
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ if (!xdr_char(xdr, (char *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_INT16:
+ if (!xdr_short(xdr, (short *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_UINT16:
+ if (!xdr_u_short(xdr, (unsigned short *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ if (!xdr_int(xdr, (int *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_UINT32:
+ if (!xdr_u_int(xdr, (unsigned *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_INT64:
+ if (!xdr_int64(xdr, (int64_t *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_UINT64:
+ if (!xdr_uint64(xdr, (uint64_t *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_STRING:
+ nv_string = (nv_string_t *)&nvp_data->nv_data[0];
+ if (!xdr_string(xdr, nv_string))
+ return (false);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY:
+ nv_string = (nv_string_t *)&nvp_data->nv_data[0];
+ for (unsigned i = 0; i < nelem; i++) {
+ if (!xdr_string(xdr, nv_string))
+ return (false);
+ nv_string = (nv_string_t *)xdr->xdr_idx;
+ }
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ if (!xdr_array(xdr, nelem, (xdrproc_t)xdr_u_int))
+ return (false);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ if (!xdr_array(xdr, nelem, (xdrproc_t)xdr_uint64))
+ return (false);
+ break;
+ }
+ return (true);
+}
+
+static int
+nvlist_xdr_nvlist(xdr_t *xdr, nvlist_t *nvl)
+{
+ nvp_header_t *nvph;
+ nvs_data_t *nvs;
+ unsigned encoded_size, decoded_size;
+ int rv;
+
+ nvs = (nvs_data_t *)xdr->xdr_idx;
+ nvph = &nvs->nvl_pair;
+
+ if (!xdr_u_int(xdr, &nvs->nvl_version))
+ return (EINVAL);
+ if (!xdr_u_int(xdr, &nvs->nvl_nvflag))
+ return (EINVAL);
+
+ encoded_size = nvph->encoded_size;
+ decoded_size = nvph->decoded_size;
+
+ if (xdr->xdr_op == XDR_OP_ENCODE) {
+ if (!xdr_u_int(xdr, &nvph->encoded_size))
+ return (EINVAL);
+ if (!xdr_u_int(xdr, &nvph->decoded_size))
+ return (EINVAL);
+ } else {
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+ }
+
+ rv = 0;
+ while (encoded_size && decoded_size) {
+ if (!nvlist_xdr_nvp(xdr, nvl))
+ return (EINVAL);
+
+ nvph = (nvp_header_t *)(xdr->xdr_idx);
+ encoded_size = nvph->encoded_size;
+ decoded_size = nvph->decoded_size;
+ if (xdr->xdr_op == XDR_OP_ENCODE) {
+ if (!xdr_u_int(xdr, &nvph->encoded_size))
+ return (EINVAL);
+ if (!xdr_u_int(xdr, &nvph->decoded_size))
+ return (EINVAL);
+ } else {
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+ }
+ }
+ return (rv);
+}
+
+/*
+ * Calculate nvlist size, translating encoded_size and decoded_size.
+ */
+static bool
+nvlist_size_xdr(xdr_t *xdr, size_t *size)
+{
+ uint8_t *pair;
+ unsigned encoded_size, decoded_size;
+
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+
+ pair = xdr->xdr_idx;
+ if (!xdr_u_int(xdr, &encoded_size) || !xdr_u_int(xdr, &decoded_size))
+ return (false);
+
+ while (encoded_size && decoded_size) {
+ xdr->xdr_idx = pair + encoded_size;
+ pair = xdr->xdr_idx;
+ if (!xdr_u_int(xdr, &encoded_size) ||
+ !xdr_u_int(xdr, &decoded_size))
+ return (false);
+ }
+ *size = xdr->xdr_idx - xdr->xdr_buf;
+
+ return (true);
+}
+
+nvp_header_t *
+nvlist_next_nvpair(nvlist_t *nvl, nvp_header_t *nvh)
+{
+ uint8_t *pair;
+ unsigned encoded_size, decoded_size;
+ xdr_t xdr;
+
+ if (nvl == NULL)
+ return (NULL);
+
+ xdr.xdr_buf = nvl->nv_data;
+ xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_size;
+
+ xdr.xdr_idx += 2 * sizeof(unsigned);
+
+ /* Skip to the current pair. */
+ if (nvh != NULL) {
+ xdr.xdr_idx = (uint8_t *)nvh;
+ }
+
+ pair = xdr.xdr_idx;
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ encoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ decoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ while (encoded_size && decoded_size) {
+ if (nvh == NULL)
+ return ((nvp_header_t *)pair);
+
+ xdr.xdr_idx = pair + encoded_size;
+ nvh = (nvp_header_t *)xdr.xdr_idx;
+
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ encoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+ decoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ if (encoded_size != 0 && decoded_size != 0) {
+ return (nvh);
+ }
+ }
+ return (NULL);
+}
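+
+/*
+ * Typical iteration over all pairs in a list:
+ *
+ *	nvp_header_t *nvh = NULL;
+ *
+ *	while ((nvh = nvlist_next_nvpair(nvl, nvh)) != NULL)
+ *		nvpair_print(nvh, 0);
+ */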
+
+/*
+ * Calculate nvlist size by walking in memory data.
+ */
+static bool
+nvlist_size_native(xdr_t *xdr, size_t *size)
+{
+ uint8_t *pair;
+ unsigned encoded_size, decoded_size;
+
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+
+ pair = xdr->xdr_idx;
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+
+ encoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+ decoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ while (encoded_size && decoded_size) {
+ xdr->xdr_idx = pair + encoded_size;
+ pair = xdr->xdr_idx;
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+ encoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+ decoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ }
+ *size = xdr->xdr_idx - xdr->xdr_buf;
+
+ return (true);
+}
+
+/*
+ * Export nvlist to byte stream format.
+ */
+int
+nvlist_export(nvlist_t *nvl)
+{
+ int rv;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_ENCODE,
+ .xdr_putint = _putint,
+ .xdr_putuint = _putuint,
+ .xdr_buf = nvl->nv_data,
+ .xdr_idx = nvl->nv_data,
+ .xdr_buf_size = nvl->nv_size
+ };
+
+ if (nvl->nv_header.nvh_encoding != NV_ENCODE_XDR)
+ return (ENOTSUP);
+
+ nvl->nv_idx = nvl->nv_data;
+ rv = nvlist_xdr_nvlist(&xdr, nvl);
+
+ return (rv);
+}
+
+/*
+ * Import an nvlist from a byte stream.
+ * Determine the stream size, allocate a private copy, then translate
+ * the data.
+ */
+nvlist_t *
+nvlist_import(const char *stream, size_t size)
+{
+ nvlist_t *nvl;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_DECODE,
+ .xdr_getint = _getint,
+ .xdr_getuint = _getuint
+ };
+
+ /* Check the nvlist head. */
+ if (stream[0] != NV_ENCODE_XDR ||
+ (stream[1] != '\0' && stream[1] != '\1') ||
+ stream[2] != '\0' || stream[3] != '\0' ||
+ be32toh(*(uint32_t *)(stream + 4)) != NV_VERSION ||
+ be32toh(*(uint32_t *)(stream + 8)) != NV_UNIQUE_NAME)
+ return (NULL);
+
+ nvl = malloc(sizeof(*nvl));
+ if (nvl == NULL)
+ return (nvl);
+
+ nvl->nv_header.nvh_encoding = stream[0];
+ nvl->nv_header.nvh_endian = stream[1];
+ nvl->nv_header.nvh_reserved1 = stream[2];
+ nvl->nv_header.nvh_reserved2 = stream[3];
+
+ xdr.xdr_buf = xdr.xdr_idx = (uint8_t *)stream + 4;
+ xdr.xdr_buf_size = size - 4;
+
+ if (!nvlist_size_xdr(&xdr, &nvl->nv_asize)) {
+ free(nvl);
+ return (NULL);
+ }
+ nvl->nv_size = nvl->nv_asize;
+ nvl->nv_data = malloc(nvl->nv_asize);
+ if (nvl->nv_data == NULL) {
+ free(nvl);
+ return (NULL);
+ }
+ nvl->nv_idx = nvl->nv_data;
+ bcopy(stream + 4, nvl->nv_data, nvl->nv_asize);
+
+ xdr.xdr_buf = xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_asize;
+
+ if (nvlist_xdr_nvlist(&xdr, nvl) != 0) {
+ free(nvl->nv_data);
+ free(nvl);
+ nvl = NULL;
+ }
+
+ return (nvl);
+}
+
+/*
+ * Remove a pair from this nvlist.
+ */
+int
+nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ uint8_t *head, *tail;
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+ nv_string_t *nvp_name;
+ nv_pair_data_t *nvp_data;
+ size_t size;
+ xdr_t xdr;
+
+ if (nvl == NULL || nvl->nv_data == NULL || name == NULL)
+ return (EINVAL);
+
+ /* Make sure the nvlist size is set correctly. */
+ xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = nvl->nv_size;
+ if (!nvlist_size_native(&xdr, &nvl->nv_size))
+ return (EINVAL);
+
+ data = (nvs_data_t *)nvl->nv_data;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+ head = (uint8_t *)nvp;
+
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvp_name = (nv_string_t *)(nvp + 1);
+
+ nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
+ NV_ALIGN4(nvp_name->nv_size));
+
+ if (strlen(name) == nvp_name->nv_size &&
+ memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 &&
+ (nvp_data->nv_type == type || type == DATA_TYPE_UNKNOWN)) {
+ /*
+ * Set tail to point to the next nvpair; size is
+ * the length of the tail.
+ */
+ tail = head + nvp->encoded_size;
+ size = nvl->nv_size - (tail - nvl->nv_data);
+
+ /* adjust the size of the nvlist. */
+ nvl->nv_size -= nvp->encoded_size;
+ bcopy(tail, head, size);
+ return (0);
+ }
+ /* Not our pair, skip to next. */
+ head = head + nvp->encoded_size;
+ nvp = (nvp_header_t *)head;
+ }
+ return (ENOENT);
+}
+
+static int
+clone_nvlist(const nvlist_t *nvl, const uint8_t *ptr, unsigned size,
+ nvlist_t **nvlist)
+{
+ nvlist_t *nv;
+
+ nv = calloc(1, sizeof(*nv));
+ if (nv == NULL)
+ return (ENOMEM);
+
+ nv->nv_header = nvl->nv_header;
+ nv->nv_asize = size;
+ nv->nv_size = size;
+ nv->nv_data = malloc(nv->nv_asize);
+ if (nv->nv_data == NULL) {
+ free(nv);
+ return (ENOMEM);
+ }
+
+ bcopy(ptr, nv->nv_data, nv->nv_asize);
+ *nvlist = nv;
+ return (0);
+}
+
+/*
+ * Return the next nvlist in an nvlist array.
+ */
+static uint8_t *
+nvlist_next(const uint8_t *ptr)
+{
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+
+ data = (nvs_data_t *)ptr;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
+ }
+ return ((uint8_t *)nvp + sizeof(*nvp));
+}
+
+/*
+ * Note: nvlist and nvlist array must be freed by caller.
+ */
+int
+nvlist_find(const nvlist_t *nvl, const char *name, data_type_t type,
+ int *elementsp, void *valuep, int *sizep)
+{
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+ nv_string_t *nvp_name;
+ nv_pair_data_t *nvp_data;
+ nvlist_t **nvlist, *nv;
+ uint8_t *ptr;
+ int rv;
+
+ if (nvl == NULL || nvl->nv_data == NULL || name == NULL)
+ return (EINVAL);
+
+ data = (nvs_data_t *)nvl->nv_data;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvp_name = (nv_string_t *)((uint8_t *)nvp + sizeof(*nvp));
+ if (nvl->nv_data + nvl->nv_size <
+ nvp_name->nv_data + nvp_name->nv_size)
+ return (EIO);
+
+ nvp_data = (nv_pair_data_t *)
+ NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] +
+ nvp_name->nv_size);
+
+ if (strlen(name) == nvp_name->nv_size &&
+ memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 &&
+ (nvp_data->nv_type == type || type == DATA_TYPE_UNKNOWN)) {
+ if (elementsp != NULL)
+ *elementsp = nvp_data->nv_nelem;
+ switch (nvp_data->nv_type) {
+ case DATA_TYPE_UINT64:
+ bcopy(nvp_data->nv_data, valuep,
+ sizeof(uint64_t));
+ return (0);
+ case DATA_TYPE_STRING:
+ nvp_name = (nv_string_t *)nvp_data->nv_data;
+ if (sizep != NULL) {
+ *sizep = nvp_name->nv_size;
+ }
+ *(const uint8_t **)valuep =
+ &nvp_name->nv_data[0];
+ return (0);
+ case DATA_TYPE_NVLIST:
+ ptr = &nvp_data->nv_data[0];
+ rv = clone_nvlist(nvl, ptr,
+ nvlist_next(ptr) - ptr, &nv);
+ if (rv == 0) {
+ *(nvlist_t **)valuep = nv;
+ }
+ return (rv);
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ nvlist = calloc(nvp_data->nv_nelem,
+ sizeof(nvlist_t *));
+ if (nvlist == NULL)
+ return (ENOMEM);
+ ptr = &nvp_data->nv_data[0];
+ rv = 0;
+ for (unsigned i = 0; i < nvp_data->nv_nelem;
+ i++) {
+ rv = clone_nvlist(nvl, ptr,
+ nvlist_next(ptr) - ptr, &nvlist[i]);
+ if (rv != 0)
+ goto error;
+ ptr = nvlist_next(ptr);
+ }
+ *(nvlist_t ***)valuep = nvlist;
+ return (rv);
+ }
+ return (EIO);
+ }
+ /* Not our pair, skip to next. */
+ nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
+ if (nvl->nv_data + nvl->nv_size < (uint8_t *)nvp)
+ return (EIO);
+ }
+ return (ENOENT);
+error:
+ for (unsigned i = 0; i < nvp_data->nv_nelem; i++) {
+ free(nvlist[i]->nv_data);
+ free(nvlist[i]);
+ }
+ free(nvlist);
+ return (rv);
+}
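+
+/*
+ * For example, to read a uint64 property (name assumed present):
+ *
+ *	uint64_t guid;
+ *
+ *	if (nvlist_find(nvl, "guid", DATA_TYPE_UINT64, NULL,
+ *	    &guid, NULL) == 0)
+ *		printf("guid %ju\n", (uintmax_t)guid);
+ */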
+
+static int
+get_value_size(data_type_t type, const void *data, uint32_t nelem)
+{
+ uint64_t value_sz = 0;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ /* Our smallest data unit is 32-bit */
+ value_sz = sizeof(uint32_t);
+ break;
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_INT64:
+ value_sz = sizeof(int64_t);
+ break;
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ value_sz = 0;
+ else
+ value_sz = strlen(data) + 1;
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = nelem * sizeof(uint8_t);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(int64_t);
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint64_t);
+
+ if (data != NULL) {
+ char *const *strs = data;
+ uint32_t i;
+
+ for (i = 0; i < nelem; i++) {
+ if (strs[i] == NULL)
+ return (-1);
+ value_sz += strlen(strs[i]) + 1;
+ }
+ }
+ break;
+ case DATA_TYPE_NVLIST:
+ /*
+ * The decoded size of an nvlist is constant.
+ */
+ value_sz = NV_ALIGN(6 * 4); /* sizeof nvlist_t */
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint64_t) +
+ (uint64_t)nelem * NV_ALIGN(6 * 4); /* sizeof nvlist_t */
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
+
+static int
+get_nvp_data_size(data_type_t type, const void *data, uint32_t nelem)
+{
+ uint64_t value_sz = 0;
+ xdr_t xdr;
+ size_t size;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ /* Our smallest data unit is 32-bit */
+ value_sz = sizeof(uint32_t);
+ break;
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING:
+ value_sz = 4 + NV_ALIGN4(strlen(data));
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = NV_ALIGN4(nelem);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = 4 + (uint64_t)nelem * sizeof(uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = 4 + (uint64_t)nelem * sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ if (data != NULL) {
+ char *const *strs = data;
+ uint32_t i;
+
+ for (i = 0; i < nelem; i++) {
+ value_sz += 4 + NV_ALIGN4(strlen(strs[i]));
+ }
+ }
+ break;
+ case DATA_TYPE_NVLIST:
+ xdr.xdr_idx = ((nvlist_t *)data)->nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = ((nvlist_t *)data)->nv_size;
+
+ if (!nvlist_size_native(&xdr, &size))
+ return (-1);
+
+ value_sz = size;
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = 0;
+ for (uint32_t i = 0; i < nelem; i++) {
+ xdr.xdr_idx = ((nvlist_t **)data)[i]->nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = ((nvlist_t **)data)[i]->nv_size;
+
+ if (!nvlist_size_native(&xdr, &size))
+ return (-1);
+ value_sz += size;
+ }
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
+
+#define NVPE_SIZE(name_len, data_len) \
+ (4 + 4 + 4 + NV_ALIGN4(name_len) + 4 + 4 + data_len)
+#define NVP_SIZE(name_len, data_len) \
+ (NV_ALIGN((4 * 4) + (name_len)) + NV_ALIGN(data_len))
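+
+/*
+ * For example, a uint64 pair named "ashift" (namelen 6, datalen 8)
+ * gives NVPE_SIZE(6, 8) == 36 and NVP_SIZE(7, 8) == 32.
+ */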
+
+static int
+nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint32_t nelem, const void *data)
+{
+ nvs_data_t *nvs;
+ nvp_header_t head, *hp;
+ uint8_t *ptr;
+ size_t namelen;
+ int decoded_size, encoded_size;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_ENCODE,
+ .xdr_putint = _putint_mem,
+ .xdr_putuint = _putuint_mem,
+ .xdr_buf = nvl->nv_data,
+ .xdr_idx = nvl->nv_data,
+ .xdr_buf_size = nvl->nv_size
+ };
+
+ nvs = (nvs_data_t *)nvl->nv_data;
+ if (nvs->nvl_nvflag & NV_UNIQUE_NAME)
+ (void) nvlist_remove(nvl, name, type);
+
+ xdr.xdr_buf = nvl->nv_data;
+ xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_size;
+ if (!nvlist_size_native(&xdr, &nvl->nv_size))
+ return (EINVAL);
+
+ namelen = strlen(name);
+ if ((decoded_size = get_value_size(type, data, nelem)) < 0)
+ return (EINVAL);
+ if ((encoded_size = get_nvp_data_size(type, data, nelem)) < 0)
+ return (EINVAL);
+
+ /*
+ * The encoded size is calculated as:
+ * encoded_size (4) + decoded_size (4) +
+ * name string size (4 + NV_ALIGN4(namelen)) +
+ * data type (4) + nelem size (4) + datalen
+ *
+ * The decoded size is calculated as
+ * (the name is stored with its terminating NUL, hence the + 1):
+ * NV_ALIGN(sizeof(nvpair_t) (4 * 4) + namelen + 1) +
+ * NV_ALIGN(data_len)
+ */
+
+ head.encoded_size = NVPE_SIZE(namelen, encoded_size);
+ head.decoded_size = NVP_SIZE(namelen + 1, decoded_size);
+
+ if (nvl->nv_asize - nvl->nv_size < head.encoded_size + 8) {
+ ptr = realloc(nvl->nv_data, nvl->nv_asize + head.encoded_size);
+ if (ptr == NULL)
+ return (ENOMEM);
+ nvl->nv_data = ptr;
+ nvl->nv_asize += head.encoded_size;
+ }
+ nvl->nv_idx = nvl->nv_data + nvl->nv_size - sizeof(*hp);
+ bzero(nvl->nv_idx, head.encoded_size + 8);
+ hp = (nvp_header_t *)nvl->nv_idx;
+ *hp = head;
+ nvl->nv_idx += sizeof(*hp);
+
+ xdr.xdr_buf = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_asize;
+ xdr.xdr_idx = nvl->nv_idx;
+
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, namelen);
+ strlcpy((char *)xdr.xdr_idx, name, namelen + 1);
+ xdr.xdr_idx += NV_ALIGN4(namelen);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, type);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, nelem);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size);
+ bcopy(data, xdr.xdr_idx, nelem);
+ xdr.xdr_idx += NV_ALIGN4(encoded_size);
+ break;
+
+ case DATA_TYPE_STRING:
+ encoded_size = strlen(data);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size);
+ strlcpy((char *)xdr.xdr_idx, data, encoded_size + 1);
+ xdr.xdr_idx += NV_ALIGN4(encoded_size);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY:
+ for (uint32_t i = 0; i < nelem; i++) {
+ encoded_size = strlen(((char **)data)[i]);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size);
+ strlcpy((char *)xdr.xdr_idx, ((char **)data)[i],
+ encoded_size + 1);
+ xdr.xdr_idx += NV_ALIGN4(encoded_size);
+ }
+ break;
+
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ xdr_char(&xdr, (char *)data);
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_char);
+ break;
+
+ case DATA_TYPE_INT16:
+ xdr_short(&xdr, (short *)data);
+ break;
+
+ case DATA_TYPE_UINT16:
+ xdr_u_short(&xdr, (unsigned short *)data);
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_short);
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_u_short);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ xdr_int(&xdr, (int *)data);
+ break;
+
+ case DATA_TYPE_UINT32:
+ xdr_u_int(&xdr, (unsigned int *)data);
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_int);
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_u_int);
+ break;
+
+ case DATA_TYPE_INT64:
+ xdr_int64(&xdr, (int64_t *)data);
+ break;
+
+ case DATA_TYPE_UINT64:
+ xdr_uint64(&xdr, (uint64_t *)data);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_int64);
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_uint64);
+ break;
+
+ case DATA_TYPE_NVLIST:
+ bcopy(((nvlist_t *)data)->nv_data, xdr.xdr_idx, encoded_size);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t size;
+ xdr_t xdr_nv;
+
+ for (uint32_t i = 0; i < nelem; i++) {
+ xdr_nv.xdr_idx = ((nvlist_t **)data)[i]->nv_data;
+ xdr_nv.xdr_buf = xdr_nv.xdr_idx;
+ xdr_nv.xdr_buf_size = ((nvlist_t **)data)[i]->nv_size;
+
+ if (!nvlist_size_native(&xdr_nv, &size))
+ return (EINVAL);
+
+ bcopy(((nvlist_t **)data)[i]->nv_data, xdr.xdr_idx,
+ size);
+ xdr.xdr_idx += size;
+ }
+ break;
+ }
+ default:
+ bcopy(data, xdr.xdr_idx, encoded_size);
+ }
+
+ nvl->nv_size += head.encoded_size;
+
+ return (0);
+}
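+
+/*
+ * The typed wrappers below all funnel into nvlist_add_common(); for
+ * example, nvlist_add_uint64(nvl, "ashift", 12) appends a single
+ * DATA_TYPE_UINT64 pair, growing nv_data if needed.
+ */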
+
+int
+nvlist_add_boolean_value(nvlist_t *nvl, const char *name, bool value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1,
+ &value));
+}
+
+int
+nvlist_add_byte(nvlist_t *nvl, const char *name, uint8_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &value));
+}
+
+int
+nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &value));
+}
+
+int
+nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &value));
+}
+
+int
+nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &value));
+}
+
+int
+nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &value));
+}
+
+int
+nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &value));
+}
+
+int
+nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &value));
+}
+
+int
+nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &value));
+}
+
+int
+nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &value));
+}
+
+int
+nvlist_add_string(nvlist_t *nvl, const char *name, const char *value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, value));
+}
+
+int
+nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ bool *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_add_byte_array(nvlist_t *nvl, const char *name, uint8_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char * const *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
+}
+
+int
+nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+static const char *typenames[] = {
+ "DATA_TYPE_UNKNOWN",
+ "DATA_TYPE_BOOLEAN",
+ "DATA_TYPE_BYTE",
+ "DATA_TYPE_INT16",
+ "DATA_TYPE_UINT16",
+ "DATA_TYPE_INT32",
+ "DATA_TYPE_UINT32",
+ "DATA_TYPE_INT64",
+ "DATA_TYPE_UINT64",
+ "DATA_TYPE_STRING",
+ "DATA_TYPE_BYTE_ARRAY",
+ "DATA_TYPE_INT16_ARRAY",
+ "DATA_TYPE_UINT16_ARRAY",
+ "DATA_TYPE_INT32_ARRAY",
+ "DATA_TYPE_UINT32_ARRAY",
+ "DATA_TYPE_INT64_ARRAY",
+ "DATA_TYPE_UINT64_ARRAY",
+ "DATA_TYPE_STRING_ARRAY",
+ "DATA_TYPE_HRTIME",
+ "DATA_TYPE_NVLIST",
+ "DATA_TYPE_NVLIST_ARRAY",
+ "DATA_TYPE_BOOLEAN_VALUE",
+ "DATA_TYPE_INT8",
+ "DATA_TYPE_UINT8",
+ "DATA_TYPE_BOOLEAN_ARRAY",
+ "DATA_TYPE_INT8_ARRAY",
+ "DATA_TYPE_UINT8_ARRAY"
+};
+
+int
+nvpair_type_from_name(const char *name)
+{
+ unsigned i;
+
+ for (i = 0; i < nitems(typenames); i++) {
+ if (strcmp(name, typenames[i]) == 0)
+ return (i);
+ }
+ return (0);
+}
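+
+/*
+ * For example, nvpair_type_from_name("DATA_TYPE_UINT64") returns 8
+ * (DATA_TYPE_UINT64); unknown names map to 0 (DATA_TYPE_UNKNOWN).
+ */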
+
+nvp_header_t *
+nvpair_find(nvlist_t *nv, const char *name)
+{
+ nvp_header_t *nvh;
+
+ nvh = NULL;
+ while ((nvh = nvlist_next_nvpair(nv, nvh)) != NULL) {
+ nv_string_t *nvp_name;
+
+ nvp_name = (nv_string_t *)(nvh + 1);
+ if (nvp_name->nv_size == strlen(name) &&
+ memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0)
+ break;
+ }
+ return (nvh);
+}
+
+void
+nvpair_print(nvp_header_t *nvp, unsigned int indent)
+{
+ nv_string_t *nvp_name;
+ nv_pair_data_t *nvp_data;
+ nvlist_t nvlist;
+ unsigned i, j;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_DECODE,
+ .xdr_getint = _getint_mem,
+ .xdr_getuint = _getuint_mem,
+ .xdr_buf = (const uint8_t *)nvp,
+ .xdr_idx = NULL,
+ .xdr_buf_size = nvp->encoded_size
+ };
+
+ nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
+ nvp_data = (nv_pair_data_t *)
+ NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] + nvp_name->nv_size);
+
+ for (i = 0; i < indent; i++)
+ printf(" ");
+
+ printf("%s [%d] %.*s", typenames[nvp_data->nv_type],
+ nvp_data->nv_nelem, nvp_name->nv_size, nvp_name->nv_data);
+
+ xdr.xdr_idx = nvp_data->nv_data;
+ switch (nvp_data->nv_type) {
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8: {
+ char c;
+
+ if (xdr_char(&xdr, &c))
+ printf(" = 0x%x\n", c);
+ break;
+ }
+
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16: {
+ unsigned short u;
+
+ if (xdr_u_short(&xdr, &u))
+ printf(" = 0x%hx\n", u);
+ break;
+ }
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32: {
+ unsigned u;
+
+ if (xdr_u_int(&xdr, &u))
+ printf(" = 0x%x\n", u);
+ break;
+ }
+
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64: {
+ uint64_t u;
+
+ if (xdr_uint64(&xdr, &u))
+ printf(" = 0x%jx\n", (uintmax_t)u);
+ break;
+ }
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *u;
+
+ if (xdr_array(&xdr, nvp_data->nv_nelem,
+ (xdrproc_t)xdr_uint64)) {
+ u = (uint64_t *)(nvp_data->nv_data + sizeof(unsigned));
+ for (i = 0; i < nvp_data->nv_nelem; i++)
+ printf(" [%u] = 0x%jx", i, (uintmax_t)u[i]);
+ printf("\n");
+ }
+
+ break;
+ }
+
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_STRING_ARRAY:
+ nvp_name = (nv_string_t *)&nvp_data->nv_data[0];
+ for (i = 0; i < nvp_data->nv_nelem; i++) {
+ printf(" = \"%.*s\"\n", nvp_name->nv_size,
+ nvp_name->nv_data);
+ }
+ break;
+
+ case DATA_TYPE_NVLIST:
+ printf("\n");
+ nvlist.nv_data = &nvp_data->nv_data[0];
+ nvlist_print(&nvlist, indent + 2);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ nvlist.nv_data = &nvp_data->nv_data[0];
+ for (j = 0; j < nvp_data->nv_nelem; j++) {
+ size_t size;
+
+ printf("[%d]\n", j);
+ nvlist_print(&nvlist, indent + 2);
+ if (j != nvp_data->nv_nelem - 1) {
+ for (i = 0; i < indent; i++)
+ printf(" ");
+ printf("%s %.*s",
+ typenames[nvp_data->nv_type],
+ nvp_name->nv_size,
+ nvp_name->nv_data);
+ }
+ xdr.xdr_idx = nvlist.nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = nvp->encoded_size -
+ (xdr.xdr_idx - (uint8_t *)nvp);
+
+ if (!nvlist_size_native(&xdr, &size))
+ return;
+
+ nvlist.nv_data += size;
+ }
+ break;
+
+ default:
+ printf("\n");
+ }
+}
+
+void
+nvlist_print(const nvlist_t *nvl, unsigned int indent)
+{
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+
+ data = (nvs_data_t *)nvl->nv_data;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvpair_print(nvp, indent);
+ nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
+ }
+ printf("%*s\n", indent + 13, "End of nvlist");
+}
Index: usr.sbin/makefs/zfs/zfsimpl.h
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/zfsimpl.h
@@ -0,0 +1,2119 @@
+/*-
+ * Copyright (c) 2002 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Marshall
+ * Kirk McKusick and McAfee Research, the Security Research Division of
+ * McAfee, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as
+ * part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 by Saso Kiselkov. All rights reserved.
+ */
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/queue.h>
+
+#ifndef _ZFSIMPL_H_
+#define _ZFSIMPL_H_
+
+#define MAXNAMELEN 256
+
+#define _NOTE(s)
+
+/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b) \
+ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
+typedef enum { B_FALSE, B_TRUE } boolean_t;
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42UL /* ECMA-182, reflected form */
+
+/*
+ * Macros for various sorts of alignment and rounding when the alignment
+ * is known to be a power of 2.
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
+#define P2END(x, align) (-(~(x) & -(align)))
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
+#define P2BOUNDARY(off, len, align) (((off) ^ ((off) + (len) - 1)) > (align) - 1)
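+
+/*
+ * For example, with align == 8: P2ALIGN(13, 8) == 8,
+ * P2PHASE(13, 8) == 5, P2NPHASE(13, 8) == 3 and
+ * P2ROUNDUP(13, 8) == 16.
+ */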
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) \
+ ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
+#define BF64_SET(x, low, len, val) \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
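+
+/*
+ * For example, BF64_GET(0x1234, 4, 8) == 0x23, and
+ * BF64_GET_SB(x, 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) (as used by
+ * DVA_GET_ASIZE() below) scales a stored count of 512-byte sectors
+ * back to bytes.
+ */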
+
+/*
+ * Macros to reverse byte order
+ */
+#define BSWAP_8(x) ((x) & 0xff)
+#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_OLDMAXBLOCKSHIFT 17
+#define SPA_MAXBLOCKSHIFT 24
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_OLDMAXBLOCKSIZE (1ULL << SPA_OLDMAXBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+ uint8_t zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | physical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * B byteorder (endianness)
+ * D dedup
+ * X encryption (on version 30, which is not supported)
+ * E blkptr_t contains embedded data (see below)
+ * lvl level of indirection
+ * type DMU object type
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block, instead they
+ * have a data payload embedded in the blkptr_t itself. See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | payload |
+ * 1 | payload |
+ * 2 | payload |
+ * 3 | payload |
+ * 4 | payload |
+ * 5 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | payload |
+ * 8 | payload |
+ * 9 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | payload |
+ * c | payload |
+ * d | payload |
+ * e | payload |
+ * f | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload contains the embedded data
+ * B (byteorder) byteorder (endianness)
+ * D (dedup) padding (set to zero)
+ * X encryption (set to zero; see above)
+ * E (embedded) set to one
+ * lvl indirection level
+ * type DMU object type
+ * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp compression function of payload
+ * PSIZE size of payload after compression, in bytes
+ * LSIZE logical size of payload, in bytes
+ * note that 25 bits is enough to store the largest
+ * "normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define BPE_GET_ETYPE(bp) \
+ (assert(BP_IS_EMBEDDED(bp)), \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BPE_SET_ETYPE(bp, t) do { \
+ assert(BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_LSIZE(bp) \
+ (assert(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define BPE_SET_LSIZE(bp, x) do { \
+ assert(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_PSIZE(bp) \
+ (assert(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define BPE_SET_PSIZE(bp, x) do { \
+ assert(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+ BP_EMBEDDED_TYPE_DATA,
+ BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define BPE_NUM_WORDS 14
+#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define BPE_IS_PAYLOADWORD(bp, wp) \
+ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
+ SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_EMBEDDED(bp) ? \
+ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) do { \
+ assert(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
+
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
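+
+/*
+ * Summary of the blk_prop bit assignments implied by the accessors
+ * above (an informal sketch; the macros remain authoritative):
+ *
+ *	bits  0-15	lsize (in 512-byte sectors, biased by 1)
+ *	bits 16-31	psize (in 512-byte sectors, biased by 1)
+ *	bits 32-38	compression function
+ *	bit     39	embedded flag
+ *	bits 40-47	checksum function (embedded type if embedded)
+ *	bits 48-55	object type
+ *	bits 56-60	indirection level
+ *	bit     62	dedup
+ *	bit     63	byte order (0 = big endian, 1 = little endian)
+ *
+ * Bit 61 is not used by these macros.
+ */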
+
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ assert(!BP_IS_EMBEDDED(bp)); \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
+
+#define BP_GET_FILL(bp) \
+ ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)
+
+#define BP_SET_FILL(bp, fill) \
+{ \
+ (bp)->blk_fill = fill; \
+}
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_GET_UCSIZE(bp) \
+ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define BP_GET_NDVAS(bp) \
+ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+ ((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
+ (dva)->dva_word[1] == 0ULL)
+#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp))
+#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_phys_birth = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
+#if BYTE_ORDER == _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#define TXG_INITIAL 4
+
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
+
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_eck_t zg_tail;
+} zio_gbh_phys_t;
+
+#define VDEV_RAIDZ_MAXPARITY 3
+
+#define VDEV_PAD_SIZE (8 << 10)
+/* 2 padding areas (vl_pad1 and vl_be) to skip */
+#define VDEV_SKIP_SIZE (VDEV_PAD_SIZE * 2)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCK_RING (128 << 10)
+
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define MMP_BLOCKS_PER_LABEL 1
+
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
+#define VDEV_UBERBLOCK_SHIFT(vd) \
+ MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
+#define VDEV_UBERBLOCK_COUNT(vd) \
+ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define VDEV_UBERBLOCK_OFFSET(vd, n) \
+ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
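+
+/*
+ * Worked example (informational): with the default ashift of 12,
+ * VDEV_UBERBLOCK_SHIFT() is MIN(MAX(12, 10), 13) = 12, so each
+ * uberblock slot is 4KB and the 128KB ring holds 32 uberblocks.
+ * With ashift 9 the shift clamps to UBERBLOCK_SHIFT (10), giving
+ * 128 slots of 1KB each.
+ */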
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
+} vdev_phys_t;
+
+typedef enum vbe_vers {
+ /* The bootenv file is stored as ascii text in the envblock */
+ VB_RAW = 0,
+
+ /*
+ * The bootenv file is converted to an nvlist and then packed into the
+ * envblock.
+ */
+ VB_NVLIST = 1
+} vbe_vers_t;
+
+typedef struct vdev_boot_envblock {
+ uint64_t vbe_version;
+ char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
+ sizeof (zio_eck_t)];
+ zio_eck_t vbe_zbt;
+} vdev_boot_envblock_t;
+
+_Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE,
+ "incorrect vdev_boot_envblock size");
+
+typedef struct vdev_label {
+ char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
+ vdev_boot_envblock_t vl_be; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 112K */
+ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define VDD_METASLAB 0x01
+#define VDD_DTL 0x02
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
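+
+/*
+ * Informal arithmetic for the layout above: each vdev_label_t is
+ * 8K + 8K + 112K + 128K = 256K.  Two labels at the front of the
+ * device are followed by the 3.5M boot region (2 * 256K + 3.5M = 4M
+ * total), and two more labels occupy the final 512K.
+ */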
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_NOPARITY,
+ ZIO_CHECKSUM_SHA512,
+ ZIO_CHECKSUM_SKEIN,
+ ZIO_CHECKSUM_EDONR,
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
+ ZIO_COMPRESS_LZ4,
+ ZIO_COMPRESS_ZSTD,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+enum zio_zstd_levels {
+ ZIO_ZSTD_LEVEL_INHERIT = 0,
+ ZIO_ZSTD_LEVEL_1,
+#define ZIO_ZSTD_LEVEL_MIN ZIO_ZSTD_LEVEL_1
+ ZIO_ZSTD_LEVEL_2,
+ ZIO_ZSTD_LEVEL_3,
+#define ZIO_ZSTD_LEVEL_DEFAULT ZIO_ZSTD_LEVEL_3
+ ZIO_ZSTD_LEVEL_4,
+ ZIO_ZSTD_LEVEL_5,
+ ZIO_ZSTD_LEVEL_6,
+ ZIO_ZSTD_LEVEL_7,
+ ZIO_ZSTD_LEVEL_8,
+ ZIO_ZSTD_LEVEL_9,
+ ZIO_ZSTD_LEVEL_10,
+ ZIO_ZSTD_LEVEL_11,
+ ZIO_ZSTD_LEVEL_12,
+ ZIO_ZSTD_LEVEL_13,
+ ZIO_ZSTD_LEVEL_14,
+ ZIO_ZSTD_LEVEL_15,
+ ZIO_ZSTD_LEVEL_16,
+ ZIO_ZSTD_LEVEL_17,
+ ZIO_ZSTD_LEVEL_18,
+ ZIO_ZSTD_LEVEL_19,
+#define ZIO_ZSTD_LEVEL_MAX ZIO_ZSTD_LEVEL_19
+ ZIO_ZSTD_LEVEL_RESERVE = 101, /* Leave room for new positive levels */
+ ZIO_ZSTD_LEVEL_FAST, /* Fast levels are negative */
+ ZIO_ZSTD_LEVEL_FAST_1,
+#define ZIO_ZSTD_LEVEL_FAST_DEFAULT ZIO_ZSTD_LEVEL_FAST_1
+ ZIO_ZSTD_LEVEL_FAST_2,
+ ZIO_ZSTD_LEVEL_FAST_3,
+ ZIO_ZSTD_LEVEL_FAST_4,
+ ZIO_ZSTD_LEVEL_FAST_5,
+ ZIO_ZSTD_LEVEL_FAST_6,
+ ZIO_ZSTD_LEVEL_FAST_7,
+ ZIO_ZSTD_LEVEL_FAST_8,
+ ZIO_ZSTD_LEVEL_FAST_9,
+ ZIO_ZSTD_LEVEL_FAST_10,
+ ZIO_ZSTD_LEVEL_FAST_20,
+ ZIO_ZSTD_LEVEL_FAST_30,
+ ZIO_ZSTD_LEVEL_FAST_40,
+ ZIO_ZSTD_LEVEL_FAST_50,
+ ZIO_ZSTD_LEVEL_FAST_60,
+ ZIO_ZSTD_LEVEL_FAST_70,
+ ZIO_ZSTD_LEVEL_FAST_80,
+ ZIO_ZSTD_LEVEL_FAST_90,
+ ZIO_ZSTD_LEVEL_FAST_100,
+ ZIO_ZSTD_LEVEL_FAST_500,
+ ZIO_ZSTD_LEVEL_FAST_1000,
+#define ZIO_ZSTD_LEVEL_FAST_MAX ZIO_ZSTD_LEVEL_FAST_1000
+ ZIO_ZSTD_LEVEL_AUTO = 251, /* Reserved for future use */
+ ZIO_ZSTD_LEVEL_LEVELS
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+/*
+ * On-disk version number.
+ */
+#define SPA_VERSION_1 1ULL
+#define SPA_VERSION_2 2ULL
+#define SPA_VERSION_3 3ULL
+#define SPA_VERSION_4 4ULL
+#define SPA_VERSION_5 5ULL
+#define SPA_VERSION_6 6ULL
+#define SPA_VERSION_7 7ULL
+#define SPA_VERSION_8 8ULL
+#define SPA_VERSION_9 9ULL
+#define SPA_VERSION_10 10ULL
+#define SPA_VERSION_11 11ULL
+#define SPA_VERSION_12 12ULL
+#define SPA_VERSION_13 13ULL
+#define SPA_VERSION_14 14ULL
+#define SPA_VERSION_15 15ULL
+#define SPA_VERSION_16 16ULL
+#define SPA_VERSION_17 17ULL
+#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
+#define SPA_VERSION_20 20ULL
+#define SPA_VERSION_21 21ULL
+#define SPA_VERSION_22 22ULL
+#define SPA_VERSION_23 23ULL
+#define SPA_VERSION_24 24ULL
+#define SPA_VERSION_25 25ULL
+#define SPA_VERSION_26 26ULL
+#define SPA_VERSION_27 27ULL
+#define SPA_VERSION_28 28ULL
+#define SPA_VERSION_5000 5000ULL
+
+/*
+ * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
+ * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
+ * and do the appropriate changes. Also bump the version number in
+ * usr/src/grub/capability.
+ */
+#define SPA_VERSION SPA_VERSION_5000
+#define SPA_VERSION_STRING "5000"
+
+/*
+ * Symbolic names for the changes that caused a SPA_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current SPA_VERSION in your code, be sure
+ * to use spa_version() since it reports the version of the
+ * last synced uberblock. Checking the in-flight version can
+ * be dangerous in some cases.
+ */
+#define SPA_VERSION_INITIAL SPA_VERSION_1
+#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
+#define SPA_VERSION_SPARES SPA_VERSION_3
+#define SPA_VERSION_RAID6 SPA_VERSION_3
+#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3
+#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
+#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
+#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
+#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5
+#define SPA_VERSION_BOOTFS SPA_VERSION_6
+#define SPA_VERSION_SLOGS SPA_VERSION_7
+#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8
+#define SPA_VERSION_FUID SPA_VERSION_9
+#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
+#define SPA_VERSION_REFQUOTA SPA_VERSION_9
+#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
+#define SPA_VERSION_L2CACHE SPA_VERSION_10
+#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11
+#define SPA_VERSION_ORIGIN SPA_VERSION_11
+#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11
+#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
+#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
+#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
+#define SPA_VERSION_USERSPACE SPA_VERSION_15
+#define SPA_VERSION_STMF_PROP SPA_VERSION_16
+#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
+#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
+#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
+#define SPA_VERSION_DEDUP SPA_VERSION_21
+#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
+#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
+#define SPA_VERSION_SA SPA_VERSION_24
+#define SPA_VERSION_SCAN SPA_VERSION_25
+#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
+#define SPA_VERSION_DEADLISTS SPA_VERSION_26
+#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
+#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
+#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28
+#define SPA_VERSION_FEATURES SPA_VERSION_5000
+
+#define SPA_VERSION_IS_SUPPORTED(v) \
+ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
+ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
+
+/*
+ * The following are configuration names used in the nvlist describing a pool's
+ * configuration.
+ */
+#define ZPOOL_CONFIG_VERSION "version"
+#define ZPOOL_CONFIG_POOL_NAME "name"
+#define ZPOOL_CONFIG_POOL_STATE "state"
+#define ZPOOL_CONFIG_POOL_TXG "txg"
+#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
+#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
+#define ZPOOL_CONFIG_TOP_GUID "top_guid"
+#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
+#define ZPOOL_CONFIG_TYPE "type"
+#define ZPOOL_CONFIG_CHILDREN "children"
+#define ZPOOL_CONFIG_ID "id"
+#define ZPOOL_CONFIG_GUID "guid"
+#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
+#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
+#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
+#define ZPOOL_CONFIG_PATH "path"
+#define ZPOOL_CONFIG_DEVID "devid"
+#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
+#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
+#define ZPOOL_CONFIG_ASHIFT "ashift"
+#define ZPOOL_CONFIG_ASIZE "asize"
+#define ZPOOL_CONFIG_DTL "DTL"
+#define ZPOOL_CONFIG_STATS "stats"
+#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
+#define ZPOOL_CONFIG_ERRCOUNT "error_count"
+#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
+#define ZPOOL_CONFIG_SPARES "spares"
+#define ZPOOL_CONFIG_IS_SPARE "is_spare"
+#define ZPOOL_CONFIG_NPARITY "nparity"
+#define ZPOOL_CONFIG_HOSTID "hostid"
+#define ZPOOL_CONFIG_HOSTNAME "hostname"
+#define ZPOOL_CONFIG_IS_LOG "is_log"
+#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
+#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+
+/*
+ * The persistent vdev state is stored as separate values rather than a single
+ * 'vdev_state' entry. This is because a device can be in multiple states, such
+ * as offline and degraded.
+ */
+#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_FAULTED "faulted"
+#define ZPOOL_CONFIG_DEGRADED "degraded"
+#define ZPOOL_CONFIG_REMOVED "removed"
+#define ZPOOL_CONFIG_FRU "fru"
+#define ZPOOL_CONFIG_AUX_STATE "aux_state"
+
+#define VDEV_TYPE_ROOT "root"
+#define VDEV_TYPE_MIRROR "mirror"
+#define VDEV_TYPE_REPLACING "replacing"
+#define VDEV_TYPE_RAIDZ "raidz"
+#define VDEV_TYPE_DISK "disk"
+#define VDEV_TYPE_FILE "file"
+#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
+#define VDEV_TYPE_SPARE "spare"
+#define VDEV_TYPE_LOG "log"
+#define VDEV_TYPE_L2CACHE "l2cache"
+#define VDEV_TYPE_INDIRECT "indirect"
+
+/*
+ * This is needed in userland to report the minimum necessary device size.
+ */
+#define SPA_MINDEVSIZE (64ULL << 20)
+
+/*
+ * The location of the pool configuration repository, shared between kernel and
+ * userland.
+ */
+#define ZPOOL_CACHE "/boot/zfs/zpool.cache"
+
+/*
+ * vdev states are ordered from least to most healthy.
+ * A vdev that's CANT_OPEN or below is considered unusable.
+ */
+typedef enum vdev_state {
+ VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
+ VDEV_STATE_CLOSED, /* Not currently open */
+ VDEV_STATE_OFFLINE, /* Not allowed to open */
+ VDEV_STATE_REMOVED, /* Explicitly removed from system */
+ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
+ VDEV_STATE_FAULTED, /* External request to fault device */
+ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
+ VDEV_STATE_HEALTHY /* Presumed good */
+} vdev_state_t;
+
+/*
+ * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
+ * of the vdev stats structure uses these constants to distinguish why.
+ */
+typedef enum vdev_aux {
+ VDEV_AUX_NONE, /* no error */
+ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
+ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
+ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
+ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
+ VDEV_AUX_TOO_SMALL, /* vdev size is too small */
+ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
+ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
+ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
+ VDEV_AUX_SPARED /* hot spare used in another pool */
+} vdev_aux_t;
+
+/*
+ * pool state. The following states are written to disk as part of the normal
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
+ * software abstractions used at various levels to communicate pool state.
+ */
+typedef enum pool_state {
+ POOL_STATE_ACTIVE = 0, /* In active use */
+ POOL_STATE_EXPORTED, /* Explicitly exported */
+ POOL_STATE_DESTROYED, /* Explicitly destroyed */
+ POOL_STATE_SPARE, /* Reserved for hot spare use */
+ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
+ POOL_STATE_UNAVAIL, /* Internal libzfs state */
+ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
+} pool_state_t;
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+#define UBERBLOCK_SHIFT 10 /* up to 1K */
+
+#define MMP_MAGIC 0xa11cea11 /* all-see-all */
+
+#define MMP_INTERVAL_VALID_BIT 0x01
+#define MMP_SEQ_VALID_BIT 0x02
+#define MMP_FAIL_INT_VALID_BIT 0x04
+
+#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
+ ubp->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_INTERVAL_VALID_BIT))
+#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_SEQ_VALID_BIT))
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_FAIL_INT_VALID_BIT))
+
+#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+ >> 8)
+#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+ >> 32)
+#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+ >> 48)
+
+typedef struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* SPA_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
+ /* Maybe missing in uberblocks we read, but always written */
+ uint64_t ub_mmp_magic;
+ /*
+ * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+ * Otherwise, nanosec since last MMP write.
+ */
+ uint64_t ub_mmp_delay;
+
+ /*
+ * The ub_mmp_config contains the multihost write interval, multihost
+ * fail intervals, sequence number for sub-second granularity, and
+ * valid bit mask. This layout is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+ *
+ * VALID Bits:
+ * - 0x01 - Write Interval (ms)
+ * - 0x02 - Sequence number exists
+ * - 0x04 - Fail Intervals
+ * - 0xf8 - Reserved
+ */
+ uint64_t ub_mmp_config;
+
+ /*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
+ uint64_t ub_checkpoint_txg;
+} uberblock_t;
+
+/*
+ * Flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 12 /* 4k */
+#define DN_MAX_INDBLKSHIFT 17 /* 128k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define DNODE_MIN_SIZE (1 << DNODE_SHIFT)
+#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT)
+#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT)
+#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \
+ (1 << SPA_BLKPTRSHIFT))
+#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
+#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
+#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> \
+ SPA_BLKPTRSHIFT)
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
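+
+/*
+ * Worked examples (informational): for a minimal 512-byte dnode,
+ * DN_OLD_MAX_BONUSLEN = 512 - 64 - 128 = 320 bytes and
+ * DN_MAX_NBLKPTR = (512 - 64) >> 7 = 3 block pointers.  A 16K
+ * dnode block therefore holds DNODES_PER_BLOCK = 1 << (14 - 9) = 32
+ * minimally-sized dnodes, and DN_MAX_LEVELS evaluates to
+ * 2 + (64 - 9) / (12 - 7) = 13.
+ */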
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
+ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
+
+#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
+
+/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
+#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_flags; /* DNODE_FLAG_* */
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_extra_slots; /* # of subsequent slots consumed */
+ uint8_t dn_pad2[3];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_used; /* bytes (or sectors) of disk space */
+
+ uint64_t dn_pad3[4];
+
+ /*
+ * The tail region is 448 bytes for a 512 byte dnode, and
+ * correspondingly larger for larger dnode sizes. The spill
+ * block pointer, when present, is always at the end of the tail
+ * region. There are three ways this space may be used, using
+ * a 512 byte dnode for this diagram:
+ *
+ * 0 64 128 192 256 320 384 448 (offset)
+ * +---------------+---------------+---------------+-------+
+ * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / |
+ * +---------------+---------------+---------------+-------+
+ * | dn_blkptr[0] | dn_bonus[0..319] |
+ * +---------------+-----------------------+---------------+
+ * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill |
+ * +---------------+-----------------------+---------------+
+ */
+ union {
+ blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
+ struct {
+ blkptr_t __dn_ignore1;
+ uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN];
+ };
+ struct {
+ blkptr_t __dn_ignore2;
+ uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
+ sizeof (blkptr_t)];
+ blkptr_t dn_spill;
+ };
+ };
+} dnode_phys_t;
+
+#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
+ (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
+
+typedef enum dmu_object_byteswap {
+ DMU_BSWAP_UINT8,
+ DMU_BSWAP_UINT16,
+ DMU_BSWAP_UINT32,
+ DMU_BSWAP_UINT64,
+ DMU_BSWAP_ZAP,
+ DMU_BSWAP_DNODE,
+ DMU_BSWAP_OBJSET,
+ DMU_BSWAP_ZNODE,
+ DMU_BSWAP_OLDACL,
+ DMU_BSWAP_ACL,
+ /*
+ * Allocating a new byteswap type number makes the on-disk format
+ * incompatible with any other format that uses the same number.
+ *
+ * Data can usually be structured to work with one of the
+ * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
+ */
+ DMU_BSWAP_NUMFUNCS
+} dmu_object_byteswap_t;
+
+#define DMU_OT_NEWTYPE 0x80
+#define DMU_OT_METADATA 0x40
+#define DMU_OT_BYTESWAP_MASK 0x3f
+
+/*
+ * Defines a uint8_t object type. Object types specify if the data
+ * in the object is metadata (boolean) and how to byteswap the data
+ * (dmu_object_byteswap_t).
+ */
+#define DMU_OT(byteswap, metadata) \
+ (DMU_OT_NEWTYPE | \
+ ((metadata) ? DMU_OT_METADATA : 0) | \
+ ((byteswap) & DMU_OT_BYTESWAP_MASK))
+
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPOBJ, /* UINT64 */
+ DMU_OT_BPOBJ_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DIR, /* UINT64 */
+ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_OLDACL, /* Old ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_UNLINKED_SET, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
+ DMU_OT_SPA_HISTORY, /* UINT8 */
+ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
+ DMU_OT_POOL_PROPS, /* ZAP */
+ DMU_OT_DSL_PERMS, /* ZAP */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_SYSACL, /* SYSACL */
+ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
+ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
+ DMU_OT_NEXT_CLONES, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
+ DMU_OT_USERGROUP_USED, /* ZAP */
+ DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
+ DMU_OT_DEADLIST, /* ZAP */
+ DMU_OT_DEADLIST_HDR, /* UINT64 */
+ DMU_OT_DSL_CLONES, /* ZAP */
+ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
+ DMU_OT_NUMTYPES,
+
+ /*
+ * Names for valid types declared with DMU_OT().
+ */
+ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+ DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+ DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+ DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+ DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+ DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+ DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+ DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+ DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+ DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE)
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+#define ZAP_MAXVALUELEN (1024 * 8)
+
+/*
+ * Header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable-sized
+ * attributes, which are determined by the "layout number".
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16 10 0
+ * +--------+-------+
+ * | hdrsz |layout |
+ * +--------+-------+
+ *
+ * Bits 0-9 hold the layout number.
+ * Bits 10-15 hold the size of the header.
+ * The header size in bytes is the stored value * 8.
+ *
+ * For example:
+ * hdrsz of 1 ==> 8 byte header
+ *          2 ==> 16 byte header
+ */
+
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
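+
+/*
+ * Example (illustrative only): encoding layout number 3 with a
+ * 16-byte header stores 16 >> 3 = 2 in the size field, so
+ * sa_layout_info becomes (2 << 10) | 3 = 0x803.  SA_HDR_SIZE()
+ * then recovers 2 << 3 = 16 and SA_HDR_LAYOUT_NUM() recovers 3.
+ */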
+
+#define SA_ATTR_BSWAP(x) BF32_GET(x, 16, 8)
+#define SA_ATTR_LENGTH(x) BF32_GET(x, 24, 16)
+#define SA_ATTR_NUM(x) BF32_GET(x, 0, 16)
+#define SA_ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+ BF64_SET(x, 24, 16, length); \
+ BF64_SET(x, 16, 8, bswap); \
+ BF64_SET(x, 0, 16, attr); \
+}
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+#define SA_FLAGS_OFFSET 48
+#define SA_ATIME_OFFSET 56
+#define SA_MTIME_OFFSET 72
+#define SA_CTIME_OFFSET 88
+#define SA_CRTIME_OFFSET 104
+#define SA_LINKS_OFFSET 120
+//#define SA_PROJID_OFFSET 128
+
+#define SA_REGISTRY "REGISTRY"
+#define SA_LAYOUTS "LAYOUTS"
+
+typedef enum sa_bswap_type {
+ SA_UINT64_ARRAY,
+ SA_UINT32_ARRAY,
+ SA_UINT16_ARRAY,
+ SA_UINT8_ARRAY,
+ SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t sa_attr_type_t;
+
+#define ZIO_OBJSET_MAC_LEN 32
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_pad[5];
+} zil_header_t;
+
+#define OBJSET_PHYS_SIZE_V2 2048
+#define OBJSET_PHYS_SIZE_V3 4096
+
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ uint64_t os_flags;
+ uint8_t os_portable_mac[ZIO_OBJSET_MAC_LEN];
+ uint8_t os_local_mac[ZIO_OBJSET_MAC_LEN];
+ char os_pad0[OBJSET_PHYS_SIZE_V2 - sizeof (dnode_phys_t)*3 -
+ sizeof (zil_header_t) - sizeof (uint64_t)*2 -
+ 2*ZIO_OBJSET_MAC_LEN];
+ dnode_phys_t os_userused_dnode;
+ dnode_phys_t os_groupused_dnode;
+ dnode_phys_t os_projectused_dnode;
+ char os_pad1[OBJSET_PHYS_SIZE_V3 - OBJSET_PHYS_SIZE_V2 -
+ sizeof (dnode_phys_t)];
+} objset_phys_t;
+
+#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
+#define SPACE_MAP_HISTOGRAM_SIZE 32
+
+typedef struct space_map_phys {
+ /* object number: not needed but kept for backwards compatibility */
+ uint64_t smp_object;
+
+ /* length of the object in bytes */
+ uint64_t smp_length;
+
+ /* space allocated from the map */
+ int64_t smp_alloc;
+
+ /* reserved */
+ uint64_t smp_pad[5];
+
+ /*
+ * The smp_histogram maintains a histogram of free regions. Each
+ * bucket, smp_histogram[i], contains the number of free regions
+ * whose size is:
+ * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
+ *
+ * Note that, if log space map feature is enabled, histograms of
+ * space maps that belong to metaslabs will take into account any
+ * unflushed changes for their metaslabs, even though the actual
+ * space map doesn't have entries for these changes.
+ */
+ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
+} space_map_phys_t;
+
+typedef enum {
+ SM_ALLOC,
+ SM_FREE
+} maptype_t;
+
+typedef struct space_map_entry {
+ maptype_t sme_type;
+ uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */
+ uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */
+ uint64_t sme_run; /* max is 2^36; units of sm_shift */
+
+ /*
+ * The following fields are not part of the actual space map entry
+ * on-disk and they are populated with the values from the debug
+ * entry most recently visited starting from the beginning to the
+ * end of the space map.
+ */
+ uint64_t sme_txg;
+ uint64_t sme_sync_pass;
+} space_map_entry_t;
+
+/* one-word entry constants */
+#define SM_DEBUG_PREFIX 2
+#define SM_OFFSET_BITS 47
+#define SM_RUN_BITS 15
+
+/* two-word entry constants */
+#define SM2_PREFIX 3
+#define SM2_OFFSET_BITS 63
+#define SM2_RUN_BITS 36
+
+#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2)
+#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2)
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS)
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL)
+
+#define SM2_RUN_DECODE(x) (BF64_DECODE(x, 24, SM2_RUN_BITS) + 1)
+#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 24, SM2_RUN_BITS)
+#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, 24)
+#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, 24)
+#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL)
+#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL)
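+
+/*
+ * Sketch (illustrative only): a one-word allocation entry covering
+ * "run" blocks at "offset" (both in units of the space map's
+ * sm_shift) can be assembled from the macros above as
+ *
+ *	uint64_t e = SM_OFFSET_ENCODE(offset) |
+ *	    SM_TYPE_ENCODE(SM_ALLOC) | SM_RUN_ENCODE(run);
+ *
+ * SM_RUN_ENCODE() stores run - 1, which is why SM_RUN_DECODE()
+ * adds the 1 back and SM_RUN_MAX is 2^15 rather than 2^15 - 1.
+ */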
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time; /* not actually used */
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_clone_parent_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_pad[7];
+ uint64_t dd_clones;
+ uint64_t dd_pad1[13]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj;
+ uint64_t ds_prev_snap_obj;
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj;
+ uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj;
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+ uint64_t ds_unique_bytes; /* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_flags;
+ blkptr_t ds_bp;
+ uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_deadlist_phys {
+ uint64_t dl_used;
+ uint64_t dl_comp;
+ uint64_t dl_uncomp;
+ uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))
+
+typedef struct bpobj_phys {
+ uint64_t bpo_num_blkptrs;
+ uint64_t bpo_bytes;
+ uint64_t bpo_comp;
+ uint64_t bpo_uncomp;
+ uint64_t bpo_subobjs;
+ uint64_t bpo_num_subobjs;
+ uint64_t bpo_num_freed;
+} bpobj_phys_t;
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
+#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
+#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
+#define DMU_POOL_SPARES "spares"
+#define DMU_POOL_DEFLATE "deflate"
+#define DMU_POOL_HISTORY "history"
+#define DMU_POOL_PROPS "pool_props"
+#define DMU_POOL_FREE_BPOBJ "free_bpobj"
+#define DMU_POOL_BPTREE_OBJ "bptree_obj"
+#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
+#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
+#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
+#define DMU_POOL_REMOVING "com.delphix:removing"
+#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
+#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
+#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
+
+#define ZAP_MAGIC 0x2F52AB2ABULL
+
+#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_block_shift)
+
+#define ZAP_MAXCD (uint32_t)(-1)
+#define ZAP_HASHBITS 28
+#define MZAP_ENT_LEN 64
+#define MZAP_ENT_MAX \
+ ((MZAP_MAX_BLKSZ - sizeof(mzap_phys_t)) / sizeof(mzap_ent_phys_t) + 1)
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSZ SPA_OLDMAXBLOCKSIZE
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
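+
+/*
+ * Size check (informational): 8 + 4 + 2 + MZAP_NAME_LEN (50) = 64
+ * bytes per entry, matching MZAP_ENT_LEN above.
+ */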
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_normflags;
+ uint64_t mz_pad[5];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block. Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+ ((uint64_t *)(zap)->zap_phys) \
+ [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
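+
+/*
+ * Worked example (informational): for a 16K fat ZAP block
+ * (FZAP_BLOCK_SHIFT == 14), ZAP_EMBEDDED_PTRTBL_SHIFT() is
+ * 14 - 3 - 1 = 10, so the embedded pointer table holds 1024
+ * 64-bit entries (8K), exactly the second half of the block,
+ * starting at word index 1024.
+ */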
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number of source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flags_t */
+ /*
+ * This structure is followed by padding, and then the embedded
+ * pointer table. The embedded pointer table takes up second
+ * half of the block. It is accessed using the
+ * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+ */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+struct spa;
+typedef struct fat_zap {
+ int zap_block_shift; /* block size shift */
+ zap_phys_t *zap_phys;
+ const struct spa *zap_spa;
+ const dnode_phys_t *zap_dnode;
+} fat_zap_t;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define ZAP_LEAF_NUMCHUNKS(l) \
+ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+ ZAP_LEAF_CHUNKSIZE - 2)
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl. Ideally, this should be larger than a
+ * "reasonably-sized" entry. 20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) number of entries,
+ * which should be more than enough for the maximum number of entries,
+ * which is less than block size / CHUNKSIZE (24) / minimum number of
+ * chunks per entry (3).
+ */
+#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
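+
+/*
+ * Worked example (informational): for a 16K leaf (l_bs == 14) the
+ * hash table has 1 << (14 - 5) = 512 entries, leaving
+ * (16384 - 2 * 512) / 24 - 2 = 638 chunks per leaf.
+ */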
+
+/*
+ * The chunks start immediately after the hash table. The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
+#define ZAP_LEAF_CHUNK(l, idx) \
+ ((zap_leaf_chunk_t *)(void *) \
+ ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+#define ZAP_LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> \
+ (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
+#define ZAP_LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[ZAP_LEAF_HASH(l, h)])
+
+typedef enum zap_chunk_type {
+ ZAP_CHUNK_FREE = 253,
+ ZAP_CHUNK_ENTRY = 252,
+ ZAP_CHUNK_ARRAY = 251,
+ ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lh_block_type; /* ZBT_LEAF */
+ uint64_t lh_pad1;
+ uint64_t lh_prefix; /* hash prefix of this leaf */
+ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lh_nfree; /* number of free chunks */
+ uint16_t lh_nentries; /* number of entries */
+ uint16_t lh_prefix_len; /* num bits used to id this */
+
+/* above is accessible to zap, below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_pad2[12];
+ } l_hdr; /* 2 24-byte chunks */
+
+ /*
+ * The header is followed by a hash table with
+ * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
+ * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+ * zap_leaf_chunk structures. These structures are accessed
+ * with the ZAP_LEAF_CHUNK() macro.
+ */
+
+ uint16_t l_hash[1];
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
+ uint8_t le_value_intlen; /* size of ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_numints; /* bytes in name, incl null */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_numints; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_CHUNK_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+} zap_leaf_chunk_t;
+
+typedef struct zap_leaf {
+ int l_bs; /* block size shift */
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+#define ZAP_MAXNAMELEN 256
+
+#define ACE_READ_DATA 0x00000001 /* file: read data */
+#define ACE_LIST_DIRECTORY 0x00000001 /* dir: list files */
+#define ACE_WRITE_DATA 0x00000002 /* file: write data */
+#define ACE_ADD_FILE 0x00000002 /* dir: create file */
+#define ACE_APPEND_DATA 0x00000004 /* file: append data */
+#define ACE_ADD_SUBDIRECTORY 0x00000004 /* dir: create subdir */
+#define ACE_READ_NAMED_ATTRS 0x00000008 /* FILE_READ_EA */
+#define ACE_WRITE_NAMED_ATTRS 0x00000010 /* FILE_WRITE_EA */
+#define ACE_EXECUTE 0x00000020 /* file: execute */
+#define ACE_TRAVERSE 0x00000020 /* dir: lookup name */
+#define ACE_DELETE_CHILD 0x00000040 /* dir: unlink child */
+#define ACE_READ_ATTRIBUTES 0x00000080 /* (all) stat, etc. */
+#define ACE_WRITE_ATTRIBUTES 0x00000100 /* (all) utimes, etc. */
+#define ACE_DELETE 0x00010000 /* (all) unlink self */
+#define ACE_READ_ACL 0x00020000 /* (all) getsecattr */
+#define ACE_WRITE_ACL 0x00040000 /* (all) setsecattr */
+#define ACE_WRITE_OWNER 0x00080000 /* (all) chown */
+#define ACE_SYNCHRONIZE 0x00100000 /* (all) */
+
+#define ACE_FILE_INHERIT_ACE 0x0001
+#define ACE_DIRECTORY_INHERIT_ACE 0x0002
+#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004
+#define ACE_INHERIT_ONLY_ACE 0x0008
+#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010
+#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020
+#define ACE_IDENTIFIER_GROUP 0x0040
+#define ACE_INHERITED_ACE 0x0080
+#define ACE_OWNER 0x1000
+#define ACE_GROUP 0x2000
+#define ACE_EVERYONE 0x4000
+
+#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000
+#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001
+#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002
+#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003
+
+typedef struct zfs_ace_hdr {
+ uint16_t z_type;
+ uint16_t z_flags;
+ uint32_t z_access_mask;
+} zfs_ace_hdr_t;
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */
+#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */
+#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
+#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
+#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
+#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
+
+#define ZFS_READONLY 0x0000000100000000ull
+#define ZFS_HIDDEN 0x0000000200000000ull
+#define ZFS_SYSTEM 0x0000000400000000ull
+#define ZFS_ARCHIVE 0x0000000800000000ull
+#define ZFS_IMMUTABLE 0x0000001000000000ull
+#define ZFS_NOUNLINK 0x0000002000000000ull
+#define ZFS_APPENDONLY 0x0000004000000000ull
+#define ZFS_NODUMP 0x0000008000000000ull
+#define ZFS_OPAQUE 0x0000010000000000ull
+#define ZFS_AV_QUARANTINED 0x0000020000000000ull
+#define ZFS_AV_MODIFIED 0x0000040000000000ull
+#define ZFS_REPARSE 0x0000080000000000ull
+#define ZFS_OFFLINE 0x0000100000000000ull
+#define ZFS_SPARSE 0x0000200000000000ull
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * special attributes for master node.
+ */
+
+#define ZFS_FSID "FSID"
+#define ZFS_UNLINKED_SET "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZPL_VERSION_OBJ "VERSION"
+#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
+#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
+#define ZFS_SA_ATTRS "SA_ATTRS"
+
+#define ZFS_FLAG_BLOCKPERPAGE 0x1
+#define ZFS_FLAG_NOGROWBLOCKS 0x2
+
+/*
+ * ZPL version - rev'd whenever an incompatible on-disk format change
+ * occurs. Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define ZPL_VERSION 1ULL
+
+/*
+ * The directory entry has the type (currently unused on Solaris) in the
+ * top 4 bits, and the object number in the low 48 bits. The "middle"
+ * 12 bits are unused.
+ */
+#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
+#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
+#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj)
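+
+/*
+ * Example (illustrative only): ZFS_DIRENT_MAKE(8, 0x1234), using
+ * FreeBSD's DT_REG value of 8, yields 0x8000000000001234;
+ * ZFS_DIRENT_TYPE() and ZFS_DIRENT_OBJ() recover 8 and 0x1234
+ * respectively.
+ */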
+
+typedef struct ace {
+ uid_t a_who; /* uid or gid */
+ uint32_t a_access_mask; /* read,write,... */
+ uint16_t a_flags; /* see below */
+ uint16_t a_type; /* allow or deny */
+} ace_t;
+
+#define ACE_SLOT_CNT 6
+
+typedef struct zfs_znode_acl {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_pad; /* pad */
+ ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+/*
+ * This is the persistent portion of the znode. It is stored
+ * in the "bonus buffer" of the file. Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_pad[4]; /* 144 - future */
+ zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we only use this space to store symbolic links.
+ */
+} znode_phys_t;
+
+/*
+ * In-core vdev representation.
+ */
+struct vdev;
+struct spa;
+typedef int vdev_phys_read_t(struct vdev *, void *, off_t, void *, size_t);
+typedef int vdev_phys_write_t(struct vdev *, off_t, void *, size_t);
+typedef int vdev_read_t(struct vdev *, const blkptr_t *, void *, off_t, size_t);
+
+typedef STAILQ_HEAD(vdev_list, vdev) vdev_list_t;
+
+typedef struct vdev_indirect_mapping_entry_phys {
+ /*
+ * Decode with DVA_MAPPING_* macros.
+ * Contains:
+ * the source offset (low 63 bits)
+ * the one-bit "mark", used for garbage collection (by zdb)
+ */
+ uint64_t vimep_src;
+
+ /*
+ * Note: the DVA's asize is 24 bits, and can thus store ranges
+ * up to 8GB.
+ */
+ dva_t vimep_dst;
+} vdev_indirect_mapping_entry_phys_t;
+
+#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
+ BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
+ BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#if 0
+typedef struct vdev_indirect_mapping_entry {
+ vdev_indirect_mapping_entry_phys_t vime_mapping;
+ uint32_t vime_obsolete_count;
+ list_node_t vime_node;
+} vdev_indirect_mapping_entry_t;
+#endif
+
+/*
+ * This is stored in the bonus buffer of the mapping object, see comment of
+ * vdev_indirect_config for more details.
+ */
+typedef struct vdev_indirect_mapping_phys {
+ uint64_t vimp_max_offset;
+ uint64_t vimp_bytes_mapped;
+ uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */
+
+ /*
+ * For each entry in the mapping object, this object contains an
+ * entry representing the number of bytes of that mapping entry
+ * that were no longer in use by the pool at the time this indirect
+ * vdev was last condensed.
+ */
+ uint64_t vimp_counts_object;
+} vdev_indirect_mapping_phys_t;
+
+#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
+
+typedef struct vdev_indirect_mapping {
+ uint64_t vim_object;
+ boolean_t vim_havecounts;
+
+ /* vim_entries segment offset currently in memory. */
+ uint64_t vim_entry_offset;
+ /* vim_entries segment size. */
+ size_t vim_num_entries;
+
+ /* Needed by dnode_read() */
+ const void *vim_spa;
+ dnode_phys_t *vim_dn;
+
+ /*
+ * An ordered array of mapping entries, sorted by source offset.
+ * Note that vim_entries is needed during a removal (and contains
+ * mappings that have been synced to disk so far) to handle frees
+ * from the removing device.
+ */
+ vdev_indirect_mapping_entry_phys_t *vim_entries;
+ objset_phys_t *vim_objset;
+ vdev_indirect_mapping_phys_t *vim_phys;
+} vdev_indirect_mapping_t;
+
+/*
+ * On-disk indirect vdev state.
+ *
+ * An indirect vdev is described exclusively in the MOS config of a pool.
+ * The config for an indirect vdev includes several fields, which are
+ * accessed in memory by a vdev_indirect_config_t.
+ */
+typedef struct vdev_indirect_config {
+ /*
+ * Object (in MOS) which contains the indirect mapping. This object
+ * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
+ * vimep_src. The bonus buffer for this object is a
+ * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
+ * removal is initiated.
+ *
+ * Note that this object can be empty if none of the data on the vdev
+ * has been copied yet.
+ */
+ uint64_t vic_mapping_object;
+
+ /*
+ * Object (in MOS) which contains the birth times for the mapping
+ * entries. This object contains an array of
+ * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
+ * buffer for this object is a vdev_indirect_birth_phys_t. This object
+ * is allocated when a vdev removal is initiated.
+ *
+ * Note that this object can be empty if none of the vdev has yet been
+ * copied.
+ */
+ uint64_t vic_births_object;
+
+ /*
+ * This is the vdev ID which was removed previous to this vdev, or
+ * UINT64_MAX if there are no previously removed vdevs.
+ */
+ uint64_t vic_prev_indirect_vdev;
+} vdev_indirect_config_t;
+
+typedef struct vdev {
+ STAILQ_ENTRY(vdev) v_childlink; /* link in parent's child list */
+ STAILQ_ENTRY(vdev) v_alllink; /* link in global vdev list */
+ vdev_list_t v_children; /* children of this vdev */
+ const char *v_name; /* vdev name */
+ uint64_t v_guid; /* vdev guid */
+ uint64_t v_id; /* index in parent */
+ uint64_t v_psize; /* physical device capacity */
+ int v_ashift; /* offset to block shift */
+ int v_nparity; /* # parity for raidz */
+ struct vdev *v_top; /* parent vdev */
+ size_t v_nchildren; /* # children */
+ vdev_state_t v_state; /* current state */
+ vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */
+ vdev_phys_write_t *v_phys_write; /* write to raw leaf vdev */
+ vdev_read_t *v_read; /* read from vdev */
+ void *v_priv; /* data for read/write function */
+ boolean_t v_islog;
+ struct spa *v_spa; /* link to spa */
+ /*
+ * Values stored in the config for an indirect or removing vdev.
+ */
+ vdev_indirect_config_t vdev_indirect_config;
+ vdev_indirect_mapping_t *v_mapping;
+} vdev_t;
+
+/*
+ * In-core pool representation.
+ */
+typedef STAILQ_HEAD(spa_list, spa) spa_list_t;
+
+typedef struct spa {
+ STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
+ char *spa_name; /* pool name */
+ uint64_t spa_guid; /* pool guid */
+ uint64_t spa_txg; /* most recent transaction */
+ struct uberblock *spa_uberblock; /* best uberblock so far */
+ vdev_t *spa_root_vdev; /* toplevel vdev container */
+ objset_phys_t *spa_mos; /* MOS for this pool */
+ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
+ void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
+ boolean_t spa_with_log; /* this pool has log */
+
+ struct uberblock spa_uberblock_master; /* best uberblock so far */
+ objset_phys_t spa_mos_master; /* MOS for this pool */
+ struct uberblock spa_uberblock_checkpoint; /* checkpoint uberblock */
+ objset_phys_t spa_mos_checkpoint; /* Checkpoint MOS */
+ void *spa_bootenv; /* bootenv from pool label */
+} spa_t;
+
+spa_t *spa_create(uint64_t guid, const char *name);
+
+/* IO related arguments. */
+typedef struct zio {
+ spa_t *io_spa;
+ blkptr_t *io_bp;
+ void *io_data;
+ uint64_t io_size;
+ uint64_t io_offset;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+
+ int io_error;
+} zio_t;
+
+#if 0 /* XXXMJ */
+static void decode_embedded_bp_compressed(const blkptr_t *, void *);
+#endif
+
+#endif /* _ZFSIMPL_H_ */
