D35248.id106145.diff
Index: usr.sbin/makefs/Makefile
===================================================================
--- usr.sbin/makefs/Makefile
+++ usr.sbin/makefs/Makefile
@@ -13,7 +13,8 @@
makefs.c \
msdos.c \
mtree.c \
- walk.c
+ walk.c \
+ zfs.c
MAN= makefs.8
NO_WCAST_ALIGN=
@@ -22,6 +23,7 @@
.include "${SRCDIR}/cd9660/Makefile.inc"
.include "${SRCDIR}/ffs/Makefile.inc"
.include "${SRCDIR}/msdos/Makefile.inc"
+.include "${SRCDIR}/zfs/Makefile.inc"
CFLAGS+=-DHAVE_STRUCT_STAT_ST_FLAGS=1
@@ -36,6 +38,9 @@
CFLAGS+= -I${SRCTOP}/lib/libnetbsd
LIBADD= netbsd util sbuf
+CFLAGS.zfs.c+= -I${SRCDIR}/zfs \
+ -I${SRCTOP}/sys/cddl/boot/zfs \
+
HAS_TESTS=
SUBDIR.${MK_TESTS}+= tests
Index: usr.sbin/makefs/makefs.h
===================================================================
--- usr.sbin/makefs/makefs.h
+++ usr.sbin/makefs/makefs.h
@@ -78,12 +78,14 @@
FI_SIZED = 1<<0, /* inode sized */
FI_ALLOCATED = 1<<1, /* fsinode->ino allocated */
FI_WRITTEN = 1<<2, /* inode written */
+ FI_ROOT = 1<<3, /* root of a ZFS dataset */
};
typedef struct {
uint32_t ino; /* inode number used on target fs */
uint32_t nlink; /* number of links to this entry */
enum fi_flags flags; /* flags used by fs specific code */
+ void *param; /* for use by individual fs impls */
struct stat st; /* stat entry */
} fsinode;
@@ -186,6 +188,7 @@
DECLARE_FUN(cd9660);
DECLARE_FUN(ffs);
DECLARE_FUN(msdos);
+DECLARE_FUN(zfs);
extern u_int debug;
extern int dupsok;
Index: usr.sbin/makefs/makefs.8
===================================================================
--- usr.sbin/makefs/makefs.8
+++ usr.sbin/makefs/makefs.8
@@ -35,7 +35,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd September 17, 2020
+.Dd May 18, 2022
.Dt MAKEFS 8
.Os
.Sh NAME
@@ -266,6 +266,8 @@
ISO 9660 file system.
.It Sy msdos
FAT12, FAT16, or FAT32 file system.
+.It Sy zfs
+ZFS pool containing one or more file systems.
.El
.It Fl x
Exclude file system nodes not explicitly listed in the specfile.
@@ -494,10 +496,87 @@
.It Cm volume_label
Volume Label.
.El
+.Ss zfs-specific options
+The image created by
+.Nm
+contains a ZFS pool with a single vdev of type
+.Ql disk .
+The root dataset is always created implicitly and contains the entire input
+directory tree unless additional datasets are specified using the options
+described below.
+.Pp
+The arguments consist of a keyword, an equal sign
+.Pq Ql = ,
+and a value.
+The following keywords are supported:
+.Pp
+.Bl -tag -width omit-trailing-period -offset indent -compact
+.It Cm ashift
+The base-2 logarithm of the minimum block size.
+Typical values are 9 (512B blocks) and 12 (4KB blocks).
+The default value is 12.
+.It Cm bootfs
+The name of the bootable dataset for the pool.
+Specifying this option causes the
+.Ql bootfs
+property to be set in the created pool.
+.It Cm poolname
+The name of the ZFS pool.
+This option must be specified.
+.It Cm rootpath
+An implicit path prefix added to dataset mountpoints.
+By default it is
+.Pa /<poolname> .
+For creating bootable pools, the
+.Va rootpath
+should be set to
+.Pa / .
+At least one dataset must have a mountpoint equal to
+.Va rootpath .
+.It Cm fs
+Create an additional dataset.
+This option may be specified multiple times.
+The argument value must be of the form
+.Ar <dataset>[:<prop1=v1>[:<prop2=v2>[:...]]] ,
+where
+.Ar dataset
+is the name of the dataset and must belong to the pool's namespace.
+For example, with a pool name of
+.Ql test
+all dataset names must be prefixed by
+.Ql test/ .
+A dataset must exist at each level of the pool's namespace.
+For example, to create
+.Ql test/foo/bar ,
+.Ql test/foo
+must be created as well.
+.Pp
+The dataset mountpoints determine how the datasets are populated with
+files from the staged directory tree.
+Conceptually, all datasets are mounted before any are populated with files.
+The root of the staged directory tree is mapped to
+.Va rootpath .
+.Pp
+Dataset properties, as described in
+.Xr zfsprops 8 ,
+may be specified following the dataset name.
+The following properties may be set for a dataset:
+.Pp
+.Bl -tag -compact -offset indent
+.It atime
+.It canmount
+.It exec
+.It mountpoint
+.It setuid
+.El
+.El
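For example, the following illustrative invocation (image size and staging path are placeholders) creates a bootable pool named "test" whose root dataset is mounted at /, with a separate dataset for /usr; it mirrors the option syntax exercised by the tests added below:

	makefs -t zfs -s 4g -o poolname=test -o rootpath=/ -o bootfs=test \
	    -o fs=test:mountpoint=/ -o fs=test/usr:atime=off \
	    zfs.img ./staged-tree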
.Sh SEE ALSO
.Xr mtree 5 ,
.Xr mtree 8 ,
-.Xr newfs 8
+.Xr newfs 8 ,
+.Xr zfsconcepts 8 ,
+.Xr zfsprops 8 ,
+.Xr zpoolprops 8
.Sh HISTORY
The
.Nm
Index: usr.sbin/makefs/makefs.c
===================================================================
--- usr.sbin/makefs/makefs.c
+++ usr.sbin/makefs/makefs.c
@@ -77,6 +77,7 @@
ENTRY(cd9660),
ENTRY(ffs),
ENTRY(msdos),
+ ENTRY(zfs),
{ .type = NULL },
};
@@ -266,7 +267,7 @@
break;
case 'Z':
- /* Superscedes 'p' for compatibility with NetBSD makefs(8) */
+ /* Supersedes 'p' for compatibility with NetBSD makefs(8) */
fsoptions.sparse = 1;
break;
Index: usr.sbin/makefs/tests/Makefile
===================================================================
--- usr.sbin/makefs/tests/Makefile
+++ usr.sbin/makefs/tests/Makefile
@@ -2,6 +2,7 @@
ATF_TESTS_SH+= makefs_cd9660_tests
ATF_TESTS_SH+= makefs_ffs_tests
+ATF_TESTS_SH+= makefs_zfs_tests
BINDIR= ${TESTSDIR}
@@ -12,7 +13,7 @@
TEST_METADATA.makefs_cd9660_tests+= required_files="/sbin/mount_cd9660"
.for t in ${ATF_TESTS_SH}
-TEST_METADATA.$t+= required_user="root"
+#TEST_METADATA.$t+= required_user="root"
.endfor
.include <bsd.test.mk>
Index: usr.sbin/makefs/tests/makefs_zfs_tests.sh
===================================================================
--- /dev/null
+++ usr.sbin/makefs/tests/makefs_zfs_tests.sh
@@ -0,0 +1,521 @@
+#-
+# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+#
+# Copyright (c) 2022 The FreeBSD Foundation
+#
+# This software was developed by Mark Johnston under sponsorship from
+# the FreeBSD Foundation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+MAKEFS="makefs -t zfs"
+ZFS_POOL_NAME="makefstest$(jot -r 1 100000)"
+TEST_ZFS_POOL_NAME="$TMPDIR/poolname"
+
+. "$(dirname "$0")/makefs_tests_common.sh"
+
+common_cleanup()
+{
+ local pool md
+
+	# Try to force a TXG; this can help catch bugs by triggering a panic.
+ sync
+
+ pool=$(cat $TEST_ZFS_POOL_NAME)
+ if zpool list "$pool" >/dev/null; then
+ zpool destroy "$pool"
+ fi
+
+ md=$(cat $TEST_MD_DEVICE_FILE)
+ if [ -c /dev/"$md" ]; then
+ mdconfig -d -u "$md"
+ fi
+}
+
+import_image()
+{
+ atf_check -e empty -o save:$TEST_MD_DEVICE_FILE -s exit:0 \
+ mdconfig -a -f $TEST_IMAGE
+ atf_check -e empty -o empty -s exit:0 \
+ zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME
+ echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME
+}
+
+#
+# Test with some default layout defined by the common code.
+#
+atf_test_case basic cleanup
+basic_body()
+{
+ create_test_inputs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+basic_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case dataset_removal cleanup
+dataset_removal_body()
+{
+ create_test_dirs
+
+ cd $TEST_INPUTS_DIR
+ mkdir dir
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}/dir \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ atf_check -o empty -e empty -s exit:0 zfs destroy ${ZFS_POOL_NAME}/dir
+}
+dataset_removal_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Make sure that we can create and remove an empty directory.
+#
+atf_test_case empty_dir cleanup
+empty_dir_body()
+{
+ create_test_dirs
+
+ cd $TEST_INPUTS_DIR
+ mkdir dir
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ atf_check -s exit:0 rmdir ${TEST_MOUNT_DIR}/dir
+}
+empty_dir_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case empty_fs cleanup
+empty_fs_body()
+{
+ create_test_dirs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+empty_fs_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case file_sizes cleanup
+file_sizes_body()
+{
+ local i
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ i=1
+ while [ $i -lt $((1 << 20)) ]; do
+ truncate -s $i ${i}.1
+ truncate -s $(($i - 1)) ${i}.2
+ truncate -s $(($i + 1)) ${i}.3
+ i=$(($i << 1))
+ done
+
+ cd -
+
+ # XXXMJ this creates sparse files, make sure makefs doesn't
+ # preserve the sparseness.
+ # XXXMJ need to test with larger files (at least 128MB for L2 indirs)
+ # XXXMJ try with different ashifts
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+file_sizes_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case hard_links cleanup
+hard_links_body()
+{
+ local f
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir
+ echo "hello" > 1
+ ln 1 2
+ ln 1 dir/1
+
+ echo "goodbye" > dir/a
+ ln dir/a dir/b
+ ln dir/a a
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino
+ stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink
+ for f in 1 2 dir/1; do
+ atf_check -o file:./nlink -e empty -s exit:0 \
+ stat -f '%l' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o file:./ino -e empty -s exit:0 \
+ stat -f '%i' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o empty -e empty -s exit:0 \
+ cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${f}
+ done
+
+ stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino
+ stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink
+ for f in dir/a dir/b a; do
+ atf_check -o file:./nlink -e empty -s exit:0 \
+ stat -f '%l' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o file:./ino -e empty -s exit:0 \
+ stat -f '%i' ${TEST_MOUNT_DIR}/${f}
+ atf_check -o empty -e empty -s exit:0 \
+ cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${f}
+ done
+}
+hard_links_cleanup()
+{
+ common_cleanup
+}
+
+# Allocate enough dnodes from an object set that the meta dnode needs to use
+# indirect blocks.
+atf_test_case indirect_dnode_array cleanup
+indirect_dnode_array_body()
+{
+ local i
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+ # 512 bytes per dnode, 3*128KB of direct blocks => limit of 768 files.
+ # XXXMJ actual threshold is much lower
+ for i in $(seq 1 1000); do
+ touch $i
+ done
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+indirect_dnode_array_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Create some files with long names, so as to test fat ZAP handling.
+#
+atf_test_case long_file_name cleanup
+long_file_name_body()
+{
+ local dir i
+
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ # micro ZAP keys can be at most 50 bytes.
+ for i in $(seq 1 60); do
+ touch $(jot -s '' $i 1 1)
+ done
+ dir=$(jot -s '' 61 1 1)
+ mkdir $dir
+ for i in $(seq 1 60); do
+ touch ${dir}/$(jot -s '' $i 1 1)
+ done
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ # Add a directory entry in the hope that OpenZFS might catch a bug
+ # in makefs' fat ZAP encoding.
+ touch ${TEST_MOUNT_DIR}/foo
+}
+long_file_name_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Exercise handling of multiple datasets.
+#
+atf_test_case multi_dataset_1 cleanup
+multi_dataset_1_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir1
+ echo a > dir1/a
+ mkdir dir2
+ echo b > dir2/b
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ # Make sure that we have three datasets with the expected mount points.
+ atf_check -o inline:${ZFS_POOL_NAME}\\n -e empty -s exit:0 \
+ zfs list -H -o name ${ZFS_POOL_NAME}
+ atf_check -o inline:${TEST_MOUNT_DIR}\\n -e empty -s exit:0 \
+ zfs list -H -o mountpoint ${ZFS_POOL_NAME}
+
+ atf_check -o inline:${ZFS_POOL_NAME}/dir1\\n -e empty -s exit:0 \
+ zfs list -H -o name ${ZFS_POOL_NAME}/dir1
+ atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
+ zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
+
+ atf_check -o inline:${ZFS_POOL_NAME}/dir2\\n -e empty -s exit:0 \
+ zfs list -H -o name ${ZFS_POOL_NAME}/dir2
+ atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \
+ zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2
+}
+multi_dataset_1_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case multi_dataset_2 cleanup
+multi_dataset_2_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir1
+ echo a > dir1/a
+ mkdir dir2
+ echo b > dir2/b
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}/dir1:mountpoint=/ \
+ -o fs=${ZFS_POOL_NAME}:mountpoint=/dir1 \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+multi_dataset_2_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Rudimentary test to verify that two ZFS images created using the same
+# parameters and input hierarchy are byte-identical.  This relies in part on
+# makefs(1) not preserving file access times.
+#
+atf_test_case reproducible cleanup
+reproducible_body()
+{
+ create_test_inputs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ ${TEST_IMAGE}.1 $TEST_INPUTS_DIR
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ ${TEST_IMAGE}.2 $TEST_INPUTS_DIR
+
+ # XXX-MJ cmp(1) is really slow
+ atf_check -o empty -e empty -s exit:0 \
+ cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2
+}
+reproducible_cleanup()
+{
+	:
+}
+
+atf_test_case snapshot cleanup
+snapshot_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir
+ echo "hello" > dir/hello
+ echo "goodbye" > goodbye
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ atf_check -o empty -e empty -s exit:0 zfs snapshot ${ZFS_POOL_NAME}@1
+}
+snapshot_cleanup()
+{
+ common_cleanup
+}
+
+atf_test_case soft_links cleanup
+soft_links_body()
+{
+ create_test_dirs
+ cd $TEST_INPUTS_DIR
+
+ mkdir dir
+ ln -s a a
+	ln -s dir/../a dir/b
+	ln -s dir/b b
+	echo 'c' > dir/c
+ ln -s dir/c c
+ # XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1
+
+ cd -
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+}
+soft_links_cleanup()
+{
+ common_cleanup
+}
+
+#
+# Verify that we can set properties on the root dataset.
+#
+atf_test_case root_props cleanup
+root_props_body()
+{
+ create_test_inputs
+
+ atf_check -o empty -e empty -s exit:0 \
+ $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+ -o fs=${ZFS_POOL_NAME}:atime=off:setuid=off \
+ $TEST_IMAGE $TEST_INPUTS_DIR
+
+ import_image
+
+ check_image_contents
+
+ atf_check -o inline:off\\n -e empty -s exit:0 \
+ zfs get -H -o value atime $ZFS_POOL_NAME
+ atf_check -o inline:local\\n -e empty -s exit:0 \
+ zfs get -H -o source atime $ZFS_POOL_NAME
+ atf_check -o inline:off\\n -e empty -s exit:0 \
+ zfs get -H -o value setuid $ZFS_POOL_NAME
+ atf_check -o inline:local\\n -e empty -s exit:0 \
+ zfs get -H -o source setuid $ZFS_POOL_NAME
+}
+root_props_cleanup()
+{
+ common_cleanup
+}
+
+atf_init_test_cases()
+{
+ atf_add_test_case basic
+ atf_add_test_case dataset_removal
+ atf_add_test_case empty_dir
+ atf_add_test_case empty_fs
+ atf_add_test_case file_sizes
+ atf_add_test_case hard_links
+ atf_add_test_case indirect_dnode_array
+ atf_add_test_case long_file_name
+ atf_add_test_case multi_dataset_1
+ atf_add_test_case multi_dataset_2
+ # XXX-MJ one to check handling of non-existent mountpoints
+ # one to check mountpoint "none"
+ atf_add_test_case reproducible
+ atf_add_test_case snapshot
+ atf_add_test_case soft_links
+ atf_add_test_case root_props
+
+ # XXXMJ tests:
+ # - test with different ashifts (at least, 9 and 12), different image sizes
+ # - create datasets in imported pool
+ # - bootenvs
+}
Index: usr.sbin/makefs/zfs.c
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs.c
@@ -0,0 +1,3322 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/dirent.h>
+#include <sys/endian.h>
+#include <sys/errno.h>
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <bitstring.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <util.h>
+
+#include "makefs.h"
+#include "zfs/nvlist.h"
+#include "zfs/zfsimpl.h"
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-function"
+#include "fletcher.c"
+#include "sha256.c"
+#pragma clang diagnostic pop
+
+/*
+ * XXX-MJ
+ * - documentation
+ * - split into multiple files?
+ * - review checksum algorithm selection (most should likely be "inherit"?)
+ * - review vdev_space_alloc()
+ * - review type usage (off_t vs. size_t vs. uint64_t)
+ * - inconsistency in variable/field naming (how to name a dnode vs dnode id)
+ * - bootfs property, bootenvs
+ * - ZFS_SHARES_DIR
+ */
+
+#define MAXBLOCKSHIFT 17 /* 128KB */
+#define MAXBLOCKSIZE ((off_t)(1 << MAXBLOCKSHIFT))
+_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");
+#define MINBLOCKSHIFT 9 /* 512B */
+#define MINBLOCKSIZE ((off_t)(1 << MINBLOCKSHIFT))
+_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");
+#define MINDEVSIZE ((off_t)SPA_MINDEVSIZE)
+
+#define INDIR_LEVELS 6
+#define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t))
+
+#define VDEV_LABEL_SPACE \
+ ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
+_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
+
+typedef struct {
+ const char *name;
+ unsigned int id;
+ uint16_t size;
+ sa_bswap_type_t bs;
+} zfs_sattr_t;
+
+typedef struct zfs_objset {
+ objset_phys_t *phys;
+ off_t osloc;
+ off_t osblksz;
+ blkptr_t osbp; /* set in objset_write() */
+
+ off_t space; /* bytes allocated to this objset */
+
+ dnode_phys_t *dnodes; /* dnode array */
+ uint64_t dnodenextfree; /* dnode ID bump allocator */
+ uint64_t dnodecount; /* total number of dnodes */
+ off_t dnodeloc; /* preallocated vdev space */
+} zfs_objset_t;
+
+typedef struct zfs_zap_entry {
+ char *name; /* entry key, private copy */
+ uint64_t hash; /* key hash */
+ union {
+ uint8_t *valp;
+ uint16_t *val16p;
+ uint32_t *val32p;
+ uint64_t *val64p;
+ }; /* entry value, an integer array */
+ uint64_t val64; /* embedded value for a common case */
+ size_t intsz; /* array element size; 1, 2, 4 or 8 */
+ size_t intcnt; /* array size */
+ STAILQ_ENTRY(zfs_zap_entry) next;
+} zfs_zap_entry_t;
+
+typedef struct zfs_zap {
+ STAILQ_HEAD(, zfs_zap_entry) kvps;
+ uint64_t hashsalt; /* key hash input */
+ unsigned long kvpcnt; /* number of key-value pairs */
+ unsigned long chunks; /* count of chunks needed for fat ZAP */
+ bool micro; /* can this be a micro ZAP? */
+
+ dnode_phys_t *dnode; /* backpointer */
+ zfs_objset_t *os; /* backpointer */
+} zfs_zap_t;
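As background for the micro flag (a standard ZFS constraint rather than anything introduced by this patch): a micro ZAP entry can hold only a single uint64 value and a name of at most 50 bytes, so longer names or multi-valued entries force the fat ZAP encoding; the long_file_name test above deliberately crosses that threshold.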
+
+struct zfs_dsl_dir;
+
+typedef struct zfs_dsl_dataset {
+ zfs_objset_t *os; /* referenced objset, may be null */
+ dsl_dataset_phys_t *phys; /* on-disk representation */
+ uint64_t dsid; /* DSL dataset dnode */
+
+ struct zfs_dsl_dir *dir; /* containing parent */
+} zfs_dsl_dataset_t;
+
+typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t;
+
+typedef struct zfs_dsl_dir {
+ char *fullname; /* full dataset name */
+ char *name; /* basename(fullname) */
+ dsl_dir_phys_t *phys; /* on-disk representation */
+ nvlist_t *propsnv; /* properties saved in propszap */
+
+ zfs_dsl_dataset_t *headds; /* principal dataset, may be null */
+
+ uint64_t dirid; /* DSL directory dnode */
+ zfs_zap_t propszap; /* dataset properties */
+ zfs_zap_t childzap; /* child directories */
+
+ /* DSL directory tree linkage. */
+ struct zfs_dsl_dir *parent;
+ zfs_dsl_dir_list_t children;
+ STAILQ_ENTRY(zfs_dsl_dir) next;
+} zfs_dsl_dir_t;
+
+typedef struct zfs_fs {
+ zfs_objset_t *os;
+
+ /* Offset table for system attributes, indexed by a zpl_attr_t. */
+ uint16_t *saoffs;
+ size_t sacnt;
+ const zfs_sattr_t *satab;
+} zfs_fs_t;
+
+struct dataset_desc {
+ char *params;
+ STAILQ_ENTRY(dataset_desc) next;
+};
+
+typedef struct {
+ /* I/O buffer, just for convenience. */
+ char filebuf[MAXBLOCKSIZE];
+
+ /* Pool parameters. */
+ const char *poolname;
+ char *rootpath; /* implicit mount point prefix */
+ char *bootfs; /* bootable dataset, pool property */
+ int ashift; /* vdev block size */
+ STAILQ_HEAD(, dataset_desc) datasets; /* non-root dataset descrs */
+
+ /* Pool state. */
+ uint64_t guid; /* pool and vdev GUID */
+ zfs_zap_t poolprops;
+
+ /* MOS state. */
+ zfs_objset_t mos; /* meta object set */
+ uint64_t objarrid; /* space map object array */
+
+ /* DSL state. */
+ zfs_dsl_dir_t rootdsldir; /* root DSL directory */
+ zfs_dsl_dataset_t rootds;
+ zfs_dsl_dir_t origindsldir; /* $ORIGIN */
+ zfs_dsl_dataset_t originds;
+ zfs_dsl_dataset_t snapds;
+ zfs_zap_t cloneszap;
+ zfs_dsl_dir_t freedsldir; /* $FREE */
+ zfs_dsl_dir_t mosdsldir; /* $MOS */
+
+ /* vdev state. */
+ int fd; /* vdev disk fd */
+ off_t vdevsize; /* vdev size, including labels */
+ off_t asize; /* vdev size, excluding labels */
+ bitstr_t *spacemap; /* space allocation tracking */
+ int spacemapbits; /* one bit per ashift-sized block */
+ uint64_t msshift; /* log2(metaslab size) */
+ uint64_t mscount; /* number of metaslabs for this vdev */
+} zfs_opt_t;
+
+static void zap_init(zfs_zap_t *, zfs_objset_t *, dnode_phys_t *);
+static void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
+static void zap_add_string(zfs_zap_t *, const char *, const char *);
+static void zap_write(zfs_opt_t *, zfs_zap_t *);
+
+static dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
+static dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
+static dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
+ uint16_t, uint64_t *);
+static off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
+
+static void dsl_dir_init(zfs_opt_t *, const char *, zfs_dsl_dir_t *);
+static void dsl_dataset_init(zfs_opt_t *, zfs_dsl_dir_t *, zfs_dsl_dataset_t *);
+
+static void spacemap_init(zfs_opt_t *);
+
+struct dnode_cursor {
+ char inddir[INDIR_LEVELS][MAXBLOCKSIZE];
+ off_t indloc;
+ off_t indspace;
+ dnode_phys_t *dnode;
+ off_t dataoff;
+ off_t datablksz;
+};
+
+static struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
+ dnode_phys_t *, off_t, off_t);
+static blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *,
+ off_t);
+static void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);
+
+static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
+
+/*
+ * The order of the attributes doesn't matter; this is simply the order
+ * hard-coded by OpenZFS, based on a zdb dump of the SA_REGISTRY table.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_DXATTR,
+ ZPL_PROJID,
+} zpl_attr_t;
+
+/*
+ * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
+ */
+static const zfs_sattr_t zpl_attrs[] = {
+#define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b }
+ _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
+ _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
+ _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
+ _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
+ _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
+ _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
+#undef _ZPL_ATTR
+};
+
+/*
+ * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
+ * It need not match in general, but FreeBSD's loader doesn't bother parsing the
+ * layout and just hard-codes attribute offsets.
+ */
+static const sa_attr_type_t zpl_attr_layout[] = {
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_GEN,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PARENT,
+ ZPL_FLAGS,
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_LINKS,
+ ZPL_DACL_COUNT,
+ ZPL_DACL_ACES,
+ ZPL_SYMLINK,
+};
+
+/*
+ * Keys for the ZPL attribute tables in the SA layout ZAP. The first two
+ * indices are reserved for legacy attribute encoding.
+ */
+#define SA_LAYOUT_INDEX_DEFAULT 2
+#define SA_LAYOUT_INDEX_SYMLINK 3
+
+void
+zfs_prep_opts(fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs));
+
+ const option_t zfs_options[] = {
+ { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
+ 0, 0, "Bootable dataset" },
+ { '\0', "poolname", &zfs->poolname, OPT_STRPTR,
+ 0, 0, "ZFS pool name" },
+ { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
+ 0, 0, "Prefix for all dataset mount points" },
+ { '\0', "ashift", &zfs->ashift, OPT_INT32,
+ MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
+ { .name = NULL }
+ };
+
+ /* Set some default values. */
+ zfs->ashift = 12;
+
+ STAILQ_INIT(&zfs->datasets);
+
+ fsopts->fs_specific = zfs;
+ fsopts->fs_options = copy_opts(zfs_options);
+}
+
+int
+zfs_parse_opts(const char *option, fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs;
+ struct dataset_desc *dsdesc;
+ char buf[BUFSIZ], *opt, *val;
+ int rv;
+
+ zfs = fsopts->fs_specific;
+
+ opt = val = estrdup(option);
+ opt = strsep(&val, "=");
+ if (strcmp(opt, "fs") == 0) {
+ if (val == NULL)
+ errx(1, "invalid filesystem parameters `%s'", option);
+
+ /*
+ * Dataset descriptions will be parsed later, in dsl_init().
+ * Just stash them away for now.
+ */
+ dsdesc = ecalloc(1, sizeof(*dsdesc));
+ dsdesc->params = estrdup(val);
+ free(opt);
+ STAILQ_INSERT_TAIL(&zfs->datasets, dsdesc, next);
+ return (1);
+ }
+ free(opt);
+
+ rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
+ return (rv == -1 ? 0 : 1);
+}
+
+static void
+zfs_check_opts(fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs;
+
+ zfs = fsopts->fs_specific;
+
+ if (fsopts->offset != 0)
+ errx(1, "unhandled offset option");
+ if (zfs->poolname == NULL)
+ errx(1, "a pool name must be specified");
+ if (zfs->rootpath == NULL)
+ easprintf(&zfs->rootpath, "/%s", zfs->poolname);
+ if (zfs->rootpath[0] != '/')
+ errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
+}
+
+void
+zfs_cleanup_opts(fsinfo_t *fsopts)
+{
+ struct dataset_desc *d, *tmp;
+ zfs_opt_t *zfs;
+
+ zfs = fsopts->fs_specific;
+ free(zfs->rootpath);
+ free(zfs->bootfs);
+ free(__DECONST(void *, zfs->poolname));
+ STAILQ_FOREACH_SAFE(d, &zfs->datasets, next, tmp) {
+ free(d->params);
+ free(d);
+ }
+ free(zfs);
+ free(fsopts->fs_options);
+}
+
+static int
+nvlist_find_string(nvlist_t *nvl, const char *key, char **retp)
+{
+ char *str;
+ int error, len;
+
+ error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &str, &len);
+ if (error == 0) {
+ *retp = ecalloc(1, len + 1);
+ memcpy(*retp, str, len);
+ }
+ return (error);
+}
+
+static int
+nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp)
+{
+ return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL));
+}
+
+static size_t
+nvlist_size(const nvlist_t *nvl)
+{
+ return (sizeof(nvl->nv_header) + nvl->nv_size);
+}
+
+static void
+nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
+{
+ assert(sz >= nvlist_size(nvl));
+
+ memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
+ memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
+}
+
+static void
+blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
+ uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
+{
+ dva_t *dva;
+
+ assert(powerof2(size));
+
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_CHECKSUM(bp, cksumt);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LEVEL(bp, level);
+ BP_SET_FILL(bp, fill);
+ BP_SET_TYPE(bp, dntype);
+
+ dva = BP_IDENTITY(bp);
+ DVA_SET_VDEV(dva, 0);
+ DVA_SET_OFFSET(dva, off);
+ DVA_SET_ASIZE(dva, size);
+ memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
+}
+
+static void
+vdev_init(zfs_opt_t *zfs, size_t size, const char *image)
+{
+ assert(zfs->ashift >= MINBLOCKSHIFT);
+
+ zfs->vdevsize = rounddown2(size, 1 << zfs->ashift);
+ if (zfs->vdevsize < MINDEVSIZE) {
+ errx(1, "Maximum image size %ju is too small",
+ (uintmax_t)zfs->vdevsize);
+ }
+ zfs->asize = zfs->vdevsize - VDEV_LABEL_SPACE;
+
+ zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
+ if (zfs->fd == -1)
+ err(1, "Can't open `%s' for writing", image);
+ if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
+ err(1, "Failed to extend image file `%s'", image);
+
+ spacemap_init(zfs);
+}
+
+static void
+vdev_fini(zfs_opt_t *zfs)
+{
+ assert(zfs->spacemap == NULL);
+
+ if (zfs->fd != -1) {
+ if (close(zfs->fd) != 0)
+ err(1, "close");
+ zfs->fd = -1;
+ }
+}
+
+/*
+ * Write a block of data to the vdev. The offset is always relative to the end
+ * of the second leading vdev label.
+ *
+ * Consumers should generally use the helpers below, which provide block
+ * pointers and update dnode accounting, rather than calling this function
+ * directly.
+ */
+static void
+vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
+{
+ ssize_t n;
+
+ assert(off >= 0 && off < zfs->asize);
+ assert(powerof2(len));
+ assert((off_t)len > 0 && off + (off_t)len > off &&
+ off + (off_t)len < zfs->asize);
+ if (zfs->spacemap != NULL) {
+ /*
+ * Verify that the blocks being written were in fact allocated.
+ *
+ * The space map isn't available once the on-disk space map is
+ * finalized, so this check doesn't quite catch everything.
+ */
+ assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
+ (off + len - 1) >> zfs->ashift, 1));
+ }
+
+ off += VDEV_LABEL_START_SIZE;
+ for (size_t sofar = 0; sofar < len; sofar += n) {
+ n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
+ off + sofar);
+ if (n < 0)
+ err(1, "pwrite");
+ assert(n > 0);
+ }
+}
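For reference, with ZFS's standard label layout VDEV_LABEL_START_SIZE is 4MB (two 256KB leading labels plus the 3.5MB boot block region), so a block allocated at logical offset 0 lands at physical offset 4MB in the image.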
+
+static void
+vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
+ uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
+ blkptr_t *bp)
+{
+ zio_cksum_t cksum;
+
+ assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);
+
+ fletcher_4_native(data, sz, NULL, &cksum);
+ blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
+ vdev_pwrite(zfs, data, sz, loc);
+}
+
+static void
+vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
+ uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
+{
+ vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
+ data, sz, loc, bp);
+
+ assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
+ dnode->dn_used += sz;
+}
+
+static void
+vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
+ off_t sz, off_t loc)
+{
+ vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
+ &dnode->dn_blkptr[0]);
+}
+
+static void
+vdev_label_set_checksum(void *buf, off_t off, off_t size)
+{
+ zio_cksum_t cksum;
+ zio_eck_t *eck;
+
+ assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));
+
+ eck = (zio_eck_t *)((char *)buf + size) - 1;
+ eck->zec_magic = ZEC_MAGIC;
+ ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
+ zio_checksum_SHA256(buf, size, NULL, &cksum);
+ eck->zec_cksum = cksum;
+}
+
+/*
+ * Set embedded checksums and write the label at the specified index.
+ */
+static void
+vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
+{
+ vdev_label_t *label;
+ ssize_t n;
+ off_t blksz, loff;
+
+ assert(ind >= 0 && ind < VDEV_LABELS);
+
+ /*
+ * Make a copy since we have to modify the label to set checksums.
+ */
+ label = ecalloc(1, sizeof(*label));
+ memcpy(label, labelp, sizeof(*label));
+
+ if (ind < 2)
+ loff = ind * sizeof(*label);
+ else
+ loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);
+
+ /*
+ * Set the verifier checksum for the boot block. We don't use it, but
+ * the FreeBSD loader reads it and will complain if the checksum isn't
+ * valid.
+ */
+ vdev_label_set_checksum(&label->vl_be,
+ loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));
+
+ /*
+ * Set the verifier checksum for the label.
+ */
+ vdev_label_set_checksum(&label->vl_vdev_phys,
+ loff + __offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof(label->vl_vdev_phys));
+
+ /*
+ * Set the verifier checksum for the uberblocks. There is one uberblock
+ * per sector; for example, with an ashift of 12 we end up with
+ * 128KB/4KB=32 copies of the uberblock in the ring.
+ */
+ blksz = 1 << zfs->ashift;
+ assert(sizeof(label->vl_uberblock) % blksz == 0);
+ for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
+ roff += blksz) {
+ vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
+ loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
+ blksz);
+ }
+
+ n = pwrite(zfs->fd, label, sizeof(*label), loff);
+ if (n < 0)
+ err(1, "writing vdev label");
+ assert(n == sizeof(*label));
+
+ free(label);
+}
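To make the offset arithmetic concrete (illustrative image size; sizeof(vdev_label_t) is 256KB and VDEV_LABELS is 4): for a 10GB image, labels 0 and 1 are written at offsets 0 and 256KB, and labels 2 and 3 at 10GB - 512KB and 10GB - 256KB respectively.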
+
+/*
+ * Find a chunk of contiguous free space of length *lenp, according to the
+ * following rules:
+ * 1. If the length is less than or equal to 128KB, the returned run's length
+ * will be the smallest power of 2 equal to or larger than the length.
+ * 2. If the length is larger than 128KB, the returned run's length will be
+ * the smallest multiple of 128KB that is larger than the length.
+ * 3. The returned run's length will be size-aligned up to 128KB.
+ *
+ * XXX-MJ the third rule isn't actually required, so this can just be a dumb
+ * bump allocator. Maybe there's some benefit to keeping large blocks aligned,
+ * so let's keep it for now and hope we don't get too much fragmentation.
+ * Alternately we could try to allocate all blocks of a certain size from the
+ * same metaslab.
+ */
+static off_t
+vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
+{
+ off_t len;
+ int align, loc, minblksz, nbits;
+
+ minblksz = 1 << zfs->ashift;
+ len = roundup2(*lenp, minblksz);
+
+ assert(len != 0);
+ assert(len / minblksz <= INT_MAX);
+
+ if (len < MAXBLOCKSIZE) {
+ if ((len & (len - 1)) != 0)
+ len = (off_t)1 << flsll(len);
+ align = len / minblksz;
+ } else {
+ len = roundup2(len, MAXBLOCKSIZE);
+ align = MAXBLOCKSIZE / minblksz;
+ }
+
+ for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
+ bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
+ &loc);
+ if (loc == -1) {
+ errx(1, "failed to find %ju bytes of space",
+ (uintmax_t)len);
+ }
+ if ((loc & (align - 1)) == 0)
+ break;
+ }
+ assert(loc + nbits > loc);
+ bit_nset(zfs->spacemap, loc, loc + nbits - 1);
+ *lenp = len;
+
+ return ((off_t)loc << zfs->ashift);
+}
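A minimal sketch of just the length-rounding rules above, extracted for illustration (round_run_length is a hypothetical helper, not part of the patch; assumes ashift = 12):

	/* Rules 1 and 2 from the comment above vdev_space_alloc(). */
	static off_t
	round_run_length(off_t len)
	{
		const off_t minblksz = 1 << 12;	/* ashift = 12 */

		len = roundup2(len, minblksz);
		if (len < MAXBLOCKSIZE) {
			/* Rule 1: the next power of 2 >= len. */
			if (!powerof2(len))
				len = (off_t)1 << flsll(len);
		} else {
			/* Rule 2: the next multiple of 128KB. */
			len = roundup2(len, MAXBLOCKSIZE);
		}
		return (len);
	}

For example, a 20KB request is given a 32KB run and a 200KB request a 256KB run.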
+
+static void
+spacemap_init(zfs_opt_t *zfs)
+{
+ uint64_t msshift, nbits, slabs;
+
+ nbits = zfs->asize >> zfs->ashift;
+ if (nbits > INT_MAX) {
+ /*
+ * With the smallest block size of 512B, the limit on the image
+		 * size is 1TB (2^31 512-byte blocks).  That should be enough
+		 * for anyone.
+ */
+ errx(1, "image size is too large");
+ }
+ zfs->spacemapbits = (int)nbits;
+ zfs->spacemap = bit_alloc(zfs->spacemapbits);
+ if (zfs->spacemap == NULL)
+ err(1, "bitstring allocation failed");
+
+ /*
+ * Try to choose a metaslab size that gives us a "reasonable" number of
+ * metaslabs. OpenZFS seems to expect at least 2.
+ *
+ * This is simplistic since we expect the pool to be autoexpanded upon
+ * first use, so OpenZFS will have to reorganize things anyway.
+ */
+ for (msshift = 24 /* 16MB */; msshift < 34 /* 16GB */; msshift++) {
+ slabs = zfs->asize / ((uint64_t)1 << msshift);
+ if (slabs >= 4 && slabs <= 200)
+ break;
+ }
+ if (msshift == 34) {
+ errx(1,
+ "failed to find a metaslab size, image size is too large");
+ }
+
+ zfs->msshift = msshift;
+ zfs->mscount = slabs;
+}
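As a worked example of the metaslab-size loop (illustrative image size): for a 10GB vdev, 16MB metaslabs would give 640 slabs and 32MB metaslabs 320, both above the cap of 200; 64MB metaslabs (msshift = 26) give mscount = 160, which falls within [4, 200], so the loop stops there.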
+
+static void
+spacemap_write(zfs_opt_t *zfs)
+{
+ dnode_phys_t *objarr;
+ zfs_objset_t *mos;
+ bitstr_t *spacemap;
+ uint64_t *objarrblk;
+ off_t smblksz, objarrblksz, objarrloc;
+
+ struct {
+ dnode_phys_t *dnode;
+ uint64_t dnid;
+ off_t loc;
+ } *sma;
+
+ mos = &zfs->mos;
+
+ objarrblksz = sizeof(uint64_t) * zfs->mscount;
+ assert(objarrblksz <= MAXBLOCKSIZE);
+ objarrloc = objset_space_alloc(zfs, mos, &objarrblksz);
+ objarrblk = ecalloc(1, objarrblksz);
+
+ objarr = objset_dnode_lookup(mos, zfs->objarrid);
+ objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;
+
+ /*
+ * Use the smallest block size for space maps. The space allocation
+ * algorithm should aim to minimize the number of holes.
+ */
+ smblksz = 1 << zfs->ashift;
+
+ /*
+ * First allocate dnodes and space for all of our space maps. No more
+ * space can be allocated from the vdev after this point.
+ */
+ sma = ecalloc(zfs->mscount, sizeof(*sma));
+ for (uint64_t i = 0; i < zfs->mscount; i++) {
+ sma[i].dnode = objset_dnode_bonus_alloc(mos, DMU_OT_SPACE_MAP,
+ DMU_OT_SPACE_MAP_HEADER, SPACE_MAP_SIZE_V0, &sma[i].dnid);
+ sma[i].loc = objset_space_alloc(zfs, mos, &smblksz);
+ }
+ spacemap = zfs->spacemap;
+ zfs->spacemap = NULL;
+
+ /*
+ * Now that the set of allocated space is finalized, populate each space
+ * map and write it to the vdev.
+ */
+ for (uint64_t i = 0; i < zfs->mscount; i++) {
+ space_map_phys_t *sm;
+ uint64_t alloc, length, *smblk;
+ int shift, startb, endb, srunb, erunb;
+
+ /*
+ * We only allocate a single block for this space map, but OpenZFS
+ * assumes that a space map object with sufficient bonus space supports
+ * histograms.
+ */
+ sma[i].dnode->dn_nblkptr = 3;
+ sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;
+
+ smblk = ecalloc(1, smblksz);
+
+ alloc = length = 0;
+ shift = zfs->msshift - zfs->ashift;
+ for (srunb = startb = i * (1 << shift),
+ endb = (i + 1) * (1 << shift);
+ srunb < endb; srunb = erunb) {
+ uint64_t runlen, runoff;
+
+ /* Find a run of allocated space. */
+ bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
+ if (srunb == -1 || srunb >= endb)
+ break;
+
+ bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
+ if (erunb == -1 || erunb > endb)
+ erunb = endb;
+
+ /*
+ * The space represented by [srunb, erunb) has been
+ * allocated. Add a record to the space map to indicate
+ * this. Run offsets are relative to the beginning of
+ * the metaslab.
+ */
+ runlen = erunb - srunb;
+ runoff = srunb - startb;
+
+ assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
+ smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
+ SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
+ smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
+ SM2_OFFSET_ENCODE(runoff);
+
+ alloc += runlen << zfs->ashift;
+ length += 2;
+ }
+
+ sm = DN_BONUS(sma[i].dnode);
+ sm->smp_length = length * sizeof(uint64_t);
+ sm->smp_alloc = alloc;
+
+ vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
+ sma[i].loc);
+ free(smblk);
+
+ /* Record this space map in the space map object array. */
+ objarrblk[i] = sma[i].dnid;
+ }
+
+ /*
+ * All of the space maps are written, now write the object array.
+ */
+ vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
+ free(objarrblk);
+
+ assert(zfs->spacemap == NULL);
+ free(spacemap);
+ free(sma);
+}
+
+static void
+objset_init(zfs_opt_t *zfs, zfs_objset_t *os, uint64_t type,
+ uint64_t dnodecount)
+{
+ dnode_phys_t *mdnode;
+ off_t blksz;
+
+ /*
+ * Allocate space on the vdev for the objset and dnode array. For other
+ * objects we do that only when going to actually write them to the
+ * vdev, but in this case it simplifies space map accounting to do it
+ * now.
+ */
+ os->osblksz = sizeof(objset_phys_t);
+ os->osloc = objset_space_alloc(zfs, os, &os->osblksz);
+
+ /*
+ * Object ID zero is always reserved for the meta dnode, which is
+ * embedded in the objset itself.
+ */
+ dnodecount++;
+
+ os->dnodenextfree = 1;
+ os->dnodecount = dnodecount;
+ blksz = roundup2(dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE);
+ os->dnodeloc = objset_space_alloc(zfs, os, &blksz);
+ assert(blksz % DNODE_BLOCK_SIZE == 0);
+ os->dnodes = ecalloc(1, blksz);
+
+ os->phys = ecalloc(1, os->osblksz);
+ os->phys->os_type = type;
+
+ mdnode = &os->phys->os_meta_dnode;
+ mdnode->dn_indblkshift = MAXBLOCKSHIFT;
+ mdnode->dn_type = DMU_OT_DNODE;
+ mdnode->dn_bonustype = DMU_OT_NONE;
+ mdnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ mdnode->dn_datablkszsec = DNODE_BLOCK_SIZE >> MINBLOCKSHIFT;
+ mdnode->dn_nlevels = 1;
+ for (uint64_t count = dnodecount / DNODES_PER_BLOCK; count > 1;
+ count /= BLKPTR_PER_INDIR)
+ mdnode->dn_nlevels++;
+ mdnode->dn_nblkptr = 1;
+ mdnode->dn_maxblkid = howmany(dnodecount, DNODES_PER_BLOCK) - 1;
+ mdnode->dn_flags = DNODE_FLAG_USED_BYTES;
+}
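A worked example of the meta-dnode sizing above (illustrative count; 512-byte dnodes, so DNODES_PER_BLOCK = 16KB/512 = 32, and 128-byte block pointers, so BLKPTR_PER_INDIR = 128KB/128 = 1024): an object set holding 100,000 dnodes occupies howmany(100000, 32) = 3125 meta-dnode blocks, and the loop leaves dn_nlevels at 3, since 100000/32 = 3125 > 1 and 3125/1024 = 3 > 1 but 3/1024 = 0.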
+
+/*
+ * Write the dnode array and physical object set to disk.
+ */
+static void
+_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c)
+{
+ assert(os->dnodenextfree == os->dnodecount);
+
+ /*
+ * Write out the dnode array, i.e., the meta-dnode. For some reason its
+ * data blocks must be 16KB in size no matter how large the array is.
+ */
+ for (uint64_t i = 0; i < os->dnodecount; i += DNODES_PER_BLOCK) {
+ dnode_phys_t *blk;
+ uint64_t fill;
+ off_t loc;
+
+ blk = os->dnodes + i;
+ loc = os->dnodeloc + i * sizeof(dnode_phys_t);
+ fill = os->dnodecount - i < DNODES_PER_BLOCK ?
+ os->dnodecount - i : 0;
+
+ vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode,
+ 0, fill, blk, DNODE_BLOCK_SIZE, loc,
+ dnode_cursor_next(zfs, c, i * sizeof(dnode_phys_t)));
+ }
+ dnode_cursor_finish(zfs, c);
+ free(os->dnodes);
+ os->dnodes = NULL;
+
+ /*
+ * Write the object set itself. The saved block pointer will be copied
+ * into the referencing DSL dataset or the uberblocks.
+ */
+ vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1,
+ os->phys, os->osblksz, os->osloc, &os->osbp);
+}
+
+static void
+objset_write(zfs_opt_t *zfs, zfs_objset_t *os)
+{
+ struct dnode_cursor *c;
+
+ c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode,
+ os->dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE);
+ _objset_write(zfs, os, c);
+}
+
+static void
+objset_mos_write(zfs_opt_t *zfs)
+{
+ struct dnode_cursor *c;
+ zfs_objset_t *mos;
+
+ mos = &zfs->mos;
+
+ /*
+ * There is a chicken-and-egg problem here: we cannot write space maps
+ * before we're finished allocating space from the vdev, and we can't
+ * write the MOS without having allocated space for indirect dnode
+ * blocks. Thus, rather than lazily allocating indirect blocks for the
+ * meta-dnode (which would be simpler), they are allocated up-front and
+ * before writing space maps.
+ */
+ c = dnode_cursor_init(zfs, mos, &mos->phys->os_meta_dnode,
+ mos->dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE);
+ spacemap_write(zfs);
+
+ /*
+ * We've finished allocating space, account for it in $MOS.
+ */
+ zfs->mosdsldir.phys->dd_used_bytes = mos->space;
+ zfs->mosdsldir.phys->dd_compressed_bytes = mos->space;
+ zfs->mosdsldir.phys->dd_uncompressed_bytes = mos->space;
+
+ _objset_write(zfs, mos, c);
+}
+
+static dnode_phys_t *
+objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype,
+ uint16_t bonuslen, uint64_t *idp)
+{
+ dnode_phys_t *dnode;
+
+ assert(os->dnodenextfree < os->dnodecount);
+ assert(bonuslen <= DN_OLD_MAX_BONUSLEN);
+
+ *idp = os->dnodenextfree;
+ dnode = &os->dnodes[os->dnodenextfree++];
+ dnode->dn_type = type;
+ dnode->dn_indblkshift = MAXBLOCKSHIFT;
+ dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT;
+ dnode->dn_nlevels = 1;
+ dnode->dn_nblkptr = 1;
+ dnode->dn_bonustype = bonustype;
+ dnode->dn_bonuslen = bonuslen;
+ dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ dnode->dn_compress = ZIO_COMPRESS_OFF;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+ return (dnode);
+}
+
+static dnode_phys_t *
+objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp)
+{
+ return (objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp));
+}
+
+static dnode_phys_t *
+objset_dnode_lookup(zfs_objset_t *os, uint64_t id)
+{
+	assert(id > 0 && id < os->dnodecount);
+
+ return (&os->dnodes[id]);
+}
+
+static off_t
+objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp)
+{
+ off_t loc;
+
+ loc = vdev_space_alloc(zfs, lenp);
+ os->space += *lenp;
+ return (loc);
+}
+
+/*
+ * Return an allocated string containing the head dataset's mountpoint,
+ * including the root path prefix.
+ *
+ * If the dataset has a mountpoint property, it is returned. Otherwise we have
+ * to follow ZFS' inheritance rules.
+ */
+static char *
+dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
+{
+ zfs_dsl_dir_t *pdir;
+ char *mountpoint, *origmountpoint;
+
+ if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) {
+		if (strcmp(mountpoint, "none") == 0) {
+			free(mountpoint);
+			return (NULL);
+		}
+		/*
+		 * nvlist_find_string() hands back a copy, so the string can
+		 * be used directly.
+		 */
+ } else {
+ /*
+ * If we don't have a mountpoint, it's inherited from one of our
+ * ancestors. Walk up the hierarchy until we find it, building
+ * up our mountpoint along the way. The mountpoint property is
+ * always set for the root dataset.
+ */
+ for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) {
+ origmountpoint = mountpoint;
+
+ if (nvlist_find_string(pdir->propsnv, "mountpoint",
+ &mountpoint) == 0) {
+ easprintf(&mountpoint, "%s%s%s", mountpoint,
+ mountpoint[strlen(mountpoint) - 1] == '/' ?
+ "" : "/", origmountpoint);
+ free(origmountpoint);
+ break;
+ }
+
+ easprintf(&mountpoint, "%s/%s", pdir->name,
+ origmountpoint);
+ free(origmountpoint);
+ pdir = pdir->parent;
+ }
+ }
+ assert(mountpoint[0] == '/');
+ assert(strstr(mountpoint, zfs->rootpath) == mountpoint);
+
+ return (mountpoint);
+}
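For instance (hypothetical layout): with the default rootpath of /test and no explicit mountpoint properties, a dataset test/foo/bar resolves via the walk above to bar, then foo/bar, then /test/foo/bar once the root dataset's /test mountpoint is reached; if the root dataset is instead mounted at / (rootpath=/), the same dataset resolves to /foo/bar.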
+
+/*
+ * Handle dataset properties that we know about; stash them into an nvlist to be
+ * written later to the properties ZAP object.
+ *
+ * If the set of properties we handle grows too much, we should probably explore
+ * using libzfs to manage them.
+ */
+static void
+dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key,
+ const char *val)
+{
+ nvlist_t *nvl;
+
+ nvl = dir->propsnv;
+ if (val == NULL || val[0] == '\0')
+ errx(1, "missing value for property `%s'", key);
+ if (nvpair_find(nvl, key) != NULL)
+ errx(1, "property `%s' already set", key);
+
+ if (strcmp(key, "mountpoint") == 0) {
+ if (strcmp(val, "none") != 0) {
+ if (val[0] != '/')
+ errx(1, "mountpoint `%s' is not absolute", val);
+ if (strcmp(val, zfs->rootpath) != 0 &&
+ strcmp(zfs->rootpath, "/") != 0 &&
+ (strstr(val, zfs->rootpath) != val ||
+ val[strlen(zfs->rootpath)] != '/')) {
+ errx(1, "mountpoint `%s' is not prefixed by "
+ "the root path `%s'", val, zfs->rootpath);
+ }
+ }
+ nvlist_add_string(nvl, key, val);
+ } else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 ||
+ strcmp(key, "setuid") == 0) {
+ if (strcmp(val, "on") == 0)
+ nvlist_add_uint64(nvl, key, 1);
+ else if (strcmp(val, "off") == 0)
+ nvlist_add_uint64(nvl, key, 0);
+ else
+ errx(1, "invalid value `%s' for %s", val, key);
+ } else if (strcmp(key, "canmount") == 0) {
+ if (strcmp(val, "noauto") == 0)
+ nvlist_add_uint64(nvl, key, 2);
+ else if (strcmp(val, "on") == 0)
+ nvlist_add_uint64(nvl, key, 1);
+ else if (strcmp(val, "off") == 0)
+ nvlist_add_uint64(nvl, key, 0);
+ else
+ errx(1, "invalid value `%s' for %s", val, key);
+ } else {
+ errx(1, "unknown property `%s'", key);
+ }
+}
+
+static void
+dsl_init_metadir(zfs_opt_t *zfs, const char *name, zfs_dsl_dir_t *dir)
+{
+ char *path;
+
+ easprintf(&path, "%s/%s", zfs->poolname, name);
+ dsl_dir_init(zfs, path, dir);
+ free(path);
+}
+
+static void
+dsl_init_origindir(zfs_opt_t *zfs)
+{
+ dnode_phys_t *clones;
+ uint64_t clonesid;
+
+ dsl_init_metadir(zfs, "$ORIGIN", &zfs->origindsldir);
+ dsl_dataset_init(zfs, &zfs->origindsldir, &zfs->originds);
+ dsl_dataset_init(zfs, &zfs->origindsldir, &zfs->snapds);
+
+ clones = objset_dnode_alloc(&zfs->mos, DMU_OT_DSL_CLONES, &clonesid);
+ zap_init(&zfs->cloneszap, &zfs->mos, clones);
+ zfs->origindsldir.phys->dd_clones = clonesid;
+}
+
+static void
+dsl_init(zfs_opt_t *zfs)
+{
+ zfs_dsl_dir_t *dir;
+ struct dataset_desc *d;
+
+ dsl_dir_init(zfs, NULL, &zfs->rootdsldir);
+
+ nvlist_add_uint64(zfs->rootdsldir.propsnv, "compression",
+ ZIO_COMPRESS_OFF);
+
+ dsl_dataset_init(zfs, &zfs->rootdsldir, &zfs->rootds);
+ zfs->rootdsldir.headds = &zfs->rootds;
+
+ dsl_init_metadir(zfs, "$MOS", &zfs->mosdsldir);
+ dsl_init_metadir(zfs, "$FREE", &zfs->freedsldir);
+ dsl_init_origindir(zfs);
+
+ /*
+ * Go through the list of user-specified datasets and create DSL objects
+ * for them.
+ */
+ STAILQ_FOREACH(d, &zfs->datasets, next) {
+ char *dsname, *params, *param, *nextparam;
+
+ params = d->params;
+		dsname = strsep(&params, ":");
+
+ if (strcmp(dsname, zfs->poolname) == 0) {
+ /*
+ * This is the root dataset; it's already created, so
+ * we're just setting options.
+ */
+ dir = &zfs->rootdsldir;
+ } else {
+ dir = ecalloc(1, sizeof(*dir));
+ dsl_dir_init(zfs, dsname, dir);
+ dir->headds = ecalloc(1, sizeof(*dir->headds));
+ dsl_dataset_init(zfs, dir, dir->headds);
+ }
+
+ for (nextparam = param = params; nextparam != NULL;) {
+ char *key, *val;
+
+ param = strsep(&nextparam, ":");
+
+ key = val = param;
+ key = strsep(&val, "=");
+ dsl_dir_set_prop(zfs, dir, key, val);
+ }
+ }
+
+ /*
+ * Set the root dataset's mount point if the user didn't override the
+ * default.
+ */
+ if (nvpair_find(zfs->rootdsldir.propsnv, "mountpoint") == NULL) {
+ nvlist_add_string(zfs->rootdsldir.propsnv, "mountpoint",
+ zfs->rootpath);
+ }
+}
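To illustrate the parsing above with a hypothetical option, -o fs=test/foo:atime=off:canmount=noauto: strsep() first splits off the dataset name test/foo, then the inner loop peels off atime=off and canmount=noauto, which dsl_dir_set_prop() records in the directory's nvlist as the uint64 values 0 and 2 respectively.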
+
+static void
+dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
+ void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
+{
+ zfs_dsl_dir_t *cdsldir;
+
+ STAILQ_FOREACH(cdsldir, &dsldir->children, next) {
+ dsl_dir_foreach_post(zfs, cdsldir, cb, arg);
+ }
+ cb(zfs, dsldir, arg);
+}
+
+/*
+ * Used when the caller doesn't care about the order one way or another.
+ */
+static void
+dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
+ void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
+{
+ dsl_dir_foreach_post(zfs, dsldir, cb, arg);
+}
+
+/*
+ * Create a DSL directory, which is effectively an entry in the ZFS namespace.
+ * We always create a root DSL directory, whose name is the pool's name, and
+ * several metadata directories.
+ *
+ * Each directory has two ZAP objects, one pointing to child directories, and
+ * one for properties (which are inherited by children unless overridden).
+ * Directories typically reference a DSL dataset, the "head dataset", which
+ * points to an object set.
+ */
+static void
+dsl_dir_init(zfs_opt_t *zfs, const char *name, zfs_dsl_dir_t *dsldir)
+{
+ zfs_dsl_dir_list_t l, *lp;
+ zfs_dsl_dir_t *parent;
+ zfs_objset_t *mos;
+ dnode_phys_t *dnode;
+ char *dirname, *nextdir, *origname;
+ uint64_t childid, propsid;
+
+ mos = &zfs->mos;
+
+ dnode = objset_dnode_bonus_alloc(mos, DMU_OT_DSL_DIR, DMU_OT_DSL_DIR,
+ sizeof(dsl_dir_phys_t), &dsldir->dirid);
+ dsldir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode);
+
+ dnode = objset_dnode_alloc(mos, DMU_OT_DSL_PROPS, &propsid);
+ zap_init(&dsldir->propszap, mos, dnode);
+
+ dnode = objset_dnode_alloc(mos, DMU_OT_DSL_DIR_CHILD_MAP, &childid);
+ zap_init(&dsldir->childzap, mos, dnode);
+
+ dsldir->propsnv = nvlist_create(NV_UNIQUE_NAME);
+ STAILQ_INIT(&dsldir->children);
+
+ dsldir->phys->dd_child_dir_zapobj = childid;
+ dsldir->phys->dd_props_zapobj = propsid;
+
+ if (name == NULL) {
+ /*
+ * This is the root DSL directory.
+ */
+ assert(dsldir == &zfs->rootdsldir);
+ dsldir->name = estrdup(zfs->poolname);
+ dsldir->fullname = estrdup(zfs->poolname);
+ dsldir->parent = NULL;
+ dsldir->phys->dd_parent_obj = 0;
+ return;
+ }
+
+ /*
+ * Insert the new directory into the hierarchy. Currently this must be
+ * done in order, e.g., when creating pool/a/b, pool/a must already
+ * exist.
+ */
+ STAILQ_INIT(&l);
+ STAILQ_INSERT_HEAD(&l, &zfs->rootdsldir, next);
+ origname = dirname = nextdir = estrdup(name);
+ for (lp = &l;; lp = &parent->children) {
+ dirname = strsep(&nextdir, "/");
+ if (nextdir == NULL)
+ break;
+
+ STAILQ_FOREACH(parent, lp, next) {
+ if (strcmp(parent->name, dirname) == 0)
+ break;
+ }
+ if (parent == NULL) {
+ errx(1, "no parent at `%s' for filesystem `%s'",
+ dirname, name);
+ }
+ }
+
+ dsldir->fullname = estrdup(name);
+ dsldir->name = estrdup(dirname);
+ free(origname);
+ STAILQ_INSERT_TAIL(lp, dsldir, next);
+ zap_add_uint64(&parent->childzap, dsldir->name, dsldir->dirid);
+
+ dsldir->parent = parent;
+ dsldir->phys->dd_parent_obj = parent->dirid;
+}
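For example (hypothetical names), initializing test/foo/bar walks the existing hierarchy: strsep() peels off test and foo, each of which must already have a DSL directory (hence the in-order creation requirement noted above), and the loop exits with dirname = bar and parent pointing at foo's directory, under which the new entry is linked and registered in the child ZAP.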
+
+/*
+ * Convert dataset properties into entries in the DSL directory's properties
+ * ZAP.
+ */
+static void
+dsl_dir_finalize_props(zfs_dsl_dir_t *dir)
+{
+ for (nvp_header_t *nvh = NULL;
+ (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) {
+ nv_string_t *nvname;
+ nv_pair_data_t *nvdata;
+ const char *name;
+
+ nvname = (nv_string_t *)(nvh + 1);
+ nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] +
+ NV_ALIGN4(nvname->nv_size));
+
+ name = nvstring_get(nvname);
+ switch (nvdata->nv_type) {
+ case DATA_TYPE_UINT64: {
+ uint64_t val;
+
+ memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t));
+ zap_add_uint64(&dir->propszap, name, val);
+ break;
+ }
+ case DATA_TYPE_STRING: {
+ nv_string_t *nvstr;
+
+ nvstr = (nv_string_t *)&nvdata->nv_data[0];
+ zap_add_string(&dir->propszap, name,
+ nvstring_get(nvstr));
+ break;
+ }
+ default:
+ assert(0);
+ }
+ }
+}
+
+static void
+dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused)
+{
+ zfs_dsl_dir_t *cdir;
+ uint64_t bytes;
+
+ dsl_dir_finalize_props(dir);
+ zap_write(zfs, &dir->propszap);
+ zap_write(zfs, &dir->childzap);
+
+ if (dir->headds != NULL && dir->headds->os != NULL) {
+ char key[32];
+ zfs_zap_t snapnameszap;
+ dnode_phys_t *snapnames;
+ zfs_dsl_dataset_t *headds;
+ zfs_objset_t *os;
+ uint64_t snapnamesid;
+
+ headds = dir->headds;
+ os = headds->os;
+
+ snapnames = objset_dnode_alloc(&zfs->mos,
+ DMU_OT_DSL_DS_SNAP_MAP, &snapnamesid);
+ zap_init(&snapnameszap, &zfs->mos, snapnames);
+ zap_write(zfs, &snapnameszap);
+
+ dir->phys->dd_head_dataset_obj = headds->dsid;
+ dir->phys->dd_clone_parent_obj = zfs->snapds.dsid;
+ headds->phys->ds_prev_snap_obj = zfs->snapds.dsid;
+ headds->phys->ds_snapnames_zapobj = snapnamesid;
+ memcpy(&headds->phys->ds_bp, &os->osbp, sizeof(blkptr_t));
+
+ zfs->snapds.phys->ds_num_children++;
+ snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid);
+ zap_add_uint64(&zfs->cloneszap, key, headds->dsid);
+
+ bytes = os->space;
+ headds->phys->ds_used_bytes = bytes;
+ /* XXX-MJ not sure what the difference is here... */
+ headds->phys->ds_uncompressed_bytes = bytes;
+ headds->phys->ds_compressed_bytes = bytes;
+
+ STAILQ_FOREACH(cdir, &dir->children, next) {
+ bytes += cdir->phys->dd_used_bytes;
+ }
+ dir->phys->dd_used_bytes = bytes;
+ dir->phys->dd_compressed_bytes = bytes;
+ dir->phys->dd_uncompressed_bytes = bytes;
+ }
+}
+
+static void
+dsl_write(zfs_opt_t *zfs)
+{
+ zfs_zap_t snapnameszap;
+ zfs_objset_t *mos;
+ dnode_phys_t *snapnames;
+ uint64_t snapmapid;
+
+ mos = &zfs->mos;
+
+ /*
+ * Perform accounting, starting from the leaves of the DSL directory
+ * tree. Accounting for $MOS is done later, once we've finished
+ * allocating space.
+ */
+ dsl_dir_foreach_post(zfs, &zfs->rootdsldir, dsl_dir_finalize, NULL);
+
+ snapnames = objset_dnode_alloc(mos, DMU_OT_DSL_DS_SNAP_MAP, &snapmapid);
+
+ zfs->origindsldir.phys->dd_head_dataset_obj = zfs->originds.dsid;
+ zfs->originds.phys->ds_prev_snap_obj = zfs->snapds.dsid;
+ zfs->originds.phys->ds_snapnames_zapobj = snapmapid;
+ zfs->snapds.phys->ds_next_snap_obj = zfs->originds.dsid;
+ assert(zfs->snapds.phys->ds_num_children > 0);
+ zfs->snapds.phys->ds_num_children++;
+
+ zap_init(&snapnameszap, mos, snapnames);
+ zap_add_uint64(&snapnameszap, "$ORIGIN", zfs->snapds.dsid);
+ zap_write(zfs, &snapnameszap);
+
+ zap_write(zfs, &zfs->cloneszap);
+}
+
+static void
+dsl_dataset_init(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, zfs_dsl_dataset_t *ds)
+{
+ zfs_zap_t deadlistzap;
+ dnode_phys_t *dnode;
+ uint64_t deadlistid;
+
+ dnode = objset_dnode_bonus_alloc(&zfs->mos, DMU_OT_DSL_DATASET,
+ DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid);
+ ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode);
+
+ dnode = objset_dnode_bonus_alloc(&zfs->mos, DMU_OT_DEADLIST,
+ DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid);
+ zap_init(&deadlistzap, &zfs->mos, dnode);
+ zap_write(zfs, &deadlistzap);
+
+ ds->phys->ds_dir_obj = dir->dirid;
+ ds->phys->ds_deadlist_obj = deadlistid;
+ ds->phys->ds_creation_txg = TXG_INITIAL - 1;
+ if (ds != &zfs->snapds)
+ ds->phys->ds_prev_snap_txg = TXG_INITIAL - 1;
+
+ ds->dir = dir;
+}
+
+static uint16_t
+zap_entry_chunks(zfs_zap_entry_t *ent)
+{
+ return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
+ howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
+}
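A worked example of the chunk accounting above, assuming the usual OpenZFS constant ZAP_LEAF_ARRAY_BYTES == 21:

	/*
	 * An entry named "example" (8 bytes including the NUL) holding a
	 * single uint64_t costs:
	 *
	 *	1			the entry chunk itself
	 *	+ howmany(8, 21) == 1	name chunks
	 *	+ howmany(8, 21) == 1	value chunks (intsz * intcnt == 8)
	 *	= 3 chunks in total.
	 */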
+
+static uint64_t
+zap_hash(uint64_t salt, const char *name)
+{
+ static uint64_t crc64_table[256];
+ const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
+ const uint8_t *cp;
+ uint64_t crc;
+ uint8_t c;
+
+ assert(salt != 0);
+ if (crc64_table[128] == 0) {
+ for (int i = 0; i < 256; i++) {
+ uint64_t *t;
+
+ t = crc64_table + i;
+ *t = i;
+ for (int j = 8; j > 0; j--)
+ *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
+ }
+ }
+ assert(crc64_table[128] == crc64_poly);
+
+ for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
+ crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];
+
+ /*
+ * Only use 28 bits, since we need 4 bits in the cookie for the
+ * collision differentiator. We MUST use the high bits, since
+ * those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+ return (crc);
+}
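For reference, zap_fat_write() below consumes those high-order bits when selecting a leaf block; ZAP_HASH_IDX is assumed here to have its usual OpenZFS shape:

	/*
	 * Assumed definition, shown for illustration:
	 *
	 *	ZAP_HASH_IDX(hash, n) == ((n) == 0 ? 0 : (hash) >> (64 - (n)))
	 *
	 * i.e., the n high-order bits of the 64-bit hash.
	 */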
+
+static void
+zap_init(zfs_zap_t *zap, zfs_objset_t *os, dnode_phys_t *dnode)
+{
+ STAILQ_INIT(&zap->kvps);
+ zap->hashsalt = ((uint64_t)random() << 32) | random();
+ zap->micro = true;
+ zap->kvpcnt = 0;
+ zap->chunks = 0;
+ zap->dnode = dnode;
+ zap->os = os;
+}
+
+static void
+zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
+ const uint8_t *val)
+{
+ zfs_zap_entry_t *ent;
+
+ assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
+ assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
+ assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);
+
+ ent = ecalloc(1, sizeof(*ent));
+ ent->name = estrdup(name);
+ ent->hash = zap_hash(zap->hashsalt, ent->name);
+ ent->intsz = intsz;
+ ent->intcnt = intcnt;
+ if (intsz == sizeof(uint64_t) && intcnt == 1) {
+		/*
+		 * Micro-optimization to elide a memory allocation in the most
+		 * common case, where this is a directory entry.
+		 */

+ ent->val64p = &ent->val64;
+ } else {
+ ent->valp = ecalloc(intcnt, intsz);
+ }
+ memcpy(ent->valp, val, intcnt * intsz);
+ zap->kvpcnt++;
+ zap->chunks += zap_entry_chunks(ent);
+ STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
+
+ if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
+ strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
+ zap->micro = false;
+}
+
+static void
+zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
+{
+ zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
+}
+
+static void
+zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
+{
+	zap_add(zap, name, 1, strlen(val) + 1, (const uint8_t *)val);
+}
+
+static bool
+zap_entry_exists(zfs_zap_t *zap, const char *name)
+{
+ zfs_zap_entry_t *ent;
+
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ if (strcmp(ent->name, name) == 0)
+ return (true);
+ }
+ return (false);
+}
+
+static void
+zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+ dnode_phys_t *dnode;
+ zfs_zap_entry_t *ent;
+ mzap_phys_t *mzap;
+ mzap_ent_phys_t *ment;
+ off_t bytes, loc;
+
+ memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+ mzap = (mzap_phys_t *)&zfs->filebuf[0];
+ mzap->mz_block_type = ZBT_MICRO;
+ mzap->mz_salt = zap->hashsalt;
+ mzap->mz_normflags = 0;
+
+ bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
+ assert(bytes <= (off_t)MZAP_MAX_BLKSZ);
+
+ ment = &mzap->mz_chunk[0];
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
+ ment->mze_cd = 0; /* XXX-MJ */
+ strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name));
+ ment++;
+ }
+
+ loc = objset_space_alloc(zfs, zap->os, &bytes);
+
+ dnode = zap->dnode;
+ dnode->dn_maxblkid = 0;
+ dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+ vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
+}
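A worked example of the sizing computation above, assuming the standard on-disk structure sizes (a 128-byte mzap_phys_t that embeds the first 64-byte chunk, and 64-byte mzap_ent_phys_t entries):

	/*
	 * A micro ZAP with three entries occupies
	 *
	 *	bytes = 128 + (3 - 1) * 64 == 256
	 *
	 * before objset_space_alloc() rounds the allocation up to the
	 * vdev's minimum block size.
	 */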
+
+/*
+ * Write some data to the fat ZAP leaf chunk starting at index "li".
+ *
+ * Note that individual integers in the value may be split across consecutive
+ * leaf chunks.
+ */
+static void
+zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
+ const uint8_t *val)
+{
+ struct zap_leaf_array *la;
+
+ assert(sz <= ZAP_MAXVALUELEN);
+
+ for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
+ n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);
+
+ la = &ZAP_LEAF_CHUNK(l, li).l_array;
+ assert(la->la_type == ZAP_CHUNK_FREE);
+ la->la_type = ZAP_CHUNK_ARRAY;
+ memcpy(la->la_array, val, n);
+ la->la_next = li + 1;
+ }
+ la->la_next = 0xffff;
+}
+
+/*
+ * Find the shortest hash prefix length which lets us distribute keys without
+ * overflowing a leaf block. This is not (space) optimal, but is simple, and
+ * directories large enough to overflow a single 128KB leaf block are uncommon.
+ */
+static unsigned int
+zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
+{
+ zfs_zap_entry_t *ent;
+ unsigned int prefixlen;
+
+ if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
+ /*
+ * All chunks will fit in a single leaf block.
+ */
+ return (0);
+ }
+
+ for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
+ uint32_t *leafchunks;
+
+ leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ uint64_t li;
+ uint16_t chunks;
+
+ li = ZAP_HASH_IDX(ent->hash, prefixlen);
+
+ chunks = zap_entry_chunks(ent);
+ if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
+ /*
+ * Not enough space, grow the prefix and retry.
+ */
+ break;
+ }
+ leafchunks[li] += chunks;
+ }
+ free(leafchunks);
+
+ if (ent == NULL) {
+ /*
+ * Everything fits, we're done.
+ */
+ break;
+ }
+ }
+
+ /*
+ * If this fails, then we need to expand the pointer table. For now
+ * this situation is unhandled since it is hard to trigger.
+ */
+ assert(prefixlen < (unsigned int)l->l_bs);
+
+ return (prefixlen);
+}
+
+/*
+ * Initialize a fat ZAP leaf block.
+ */
+static void
+zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
+{
+ zap_leaf_phys_t *leaf;
+
+ leaf = l->l_phys;
+
+ leaf->l_hdr.lh_block_type = ZBT_LEAF;
+ leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+ leaf->l_hdr.lh_prefix = prefix;
+ leaf->l_hdr.lh_prefix_len = prefixlen;
+
+ /* Initialize the leaf hash table. */
+ assert(leaf->l_hdr.lh_nfree < 0xffff);
+ memset(leaf->l_hash, 0xff,
+ ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));
+
+ /* Initialize the leaf chunks. */
+ for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_free *lf;
+
+ lf = &ZAP_LEAF_CHUNK(l, i).l_free;
+ lf->lf_type = ZAP_CHUNK_FREE;
+ if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
+ lf->lf_next = 0xffff;
+ else
+ lf->lf_next = i + 1;
+ }
+}
+
+static void
+zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+ struct dnode_cursor *c;
+ zap_leaf_t l;
+ zap_phys_t *zaphdr;
+ struct zap_table_phys *zt;
+ zfs_zap_entry_t *ent;
+ dnode_phys_t *dnode;
+ uint8_t *leafblks;
+ uint64_t lblkcnt, *ptrhasht;
+ off_t loc, blksz;
+ size_t blkshift;
+ unsigned int prefixlen;
+ int ptrcnt;
+
+ /*
+ * For simplicity, always use the largest block size. This should be ok
+ * since most directories will be micro ZAPs, but it's space inefficient
+ * for small ZAPs and might need to be revisited.
+ */
+ blkshift = MAXBLOCKSHIFT;
+ blksz = (off_t)1 << blkshift;
+
+ /*
+ * Embedded pointer tables give up to 8192 entries. This ought to be
+ * enough for anything except massive directories.
+ */
+ ptrcnt = (blksz / 2) / sizeof(uint64_t);
+
+ memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+ zaphdr = (zap_phys_t *)&zfs->filebuf[0];
+ zaphdr->zap_block_type = ZBT_HEADER;
+ zaphdr->zap_magic = ZAP_MAGIC;
+ zaphdr->zap_num_entries = zap->kvpcnt;
+ zaphdr->zap_salt = zap->hashsalt;
+
+ l.l_bs = blkshift;
+ l.l_phys = NULL;
+
+ zt = &zaphdr->zap_ptrtbl;
+ zt->zt_blk = 0;
+ zt->zt_numblks = 0;
+ zt->zt_shift = flsl(ptrcnt) - 1;
+ zt->zt_nextblk = 0;
+ zt->zt_blks_copied = 0;
+
+ /*
+ * How many leaf blocks do we need? Initialize them and update the
+ * header.
+ */
+ prefixlen = zap_fat_write_prefixlen(zap, &l);
+ lblkcnt = 1 << prefixlen;
+ leafblks = ecalloc(lblkcnt, blksz);
+ for (unsigned int li = 0; li < lblkcnt; li++) {
+ l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+ zap_fat_write_leaf_init(&l, li, prefixlen);
+ }
+ zaphdr->zap_num_leafs = lblkcnt;
+ zaphdr->zap_freeblk = lblkcnt + 1;
+
+ /*
+ * For each entry, figure out which leaf block it belongs to based on
+ * the upper bits of its hash, allocate chunks from that leaf, and fill
+ * them out.
+ */
+ ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
+ STAILQ_FOREACH(ent, &zap->kvps, next) {
+ struct zap_leaf_entry *le;
+ uint16_t *lptr;
+ uint64_t hi, li;
+ uint16_t namelen, nchunks, nnamechunks, nvalchunks;
+
+ hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
+ li = ZAP_HASH_IDX(ent->hash, prefixlen);
+ assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
+ ptrhasht[hi] = li + 1;
+ l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+
+ namelen = strlen(ent->name) + 1;
+
+ /*
+ * How many leaf chunks do we need for this entry?
+ */
+ nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
+ nvalchunks = howmany(ent->intcnt,
+ ZAP_LEAF_ARRAY_BYTES / ent->intsz);
+ nchunks = 1 + nnamechunks + nvalchunks;
+
+ /*
+ * Allocate a run of free leaf chunks for this entry,
+ * potentially extending a hash chain.
+ */
+ assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
+ l.l_phys->l_hdr.lh_nfree -= nchunks;
+ l.l_phys->l_hdr.lh_nentries++;
+ lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
+ while (*lptr != 0xffff) {
+ assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
+ le = ZAP_LEAF_ENTRY(&l, *lptr);
+ assert(le->le_type == ZAP_CHUNK_ENTRY);
+ le->le_cd++;
+ lptr = &le->le_next;
+ }
+ *lptr = l.l_phys->l_hdr.lh_freelist;
+ l.l_phys->l_hdr.lh_freelist += nchunks;
+ assert(l.l_phys->l_hdr.lh_freelist <=
+ ZAP_LEAF_NUMCHUNKS(&l));
+ if (l.l_phys->l_hdr.lh_freelist ==
+ ZAP_LEAF_NUMCHUNKS(&l))
+ l.l_phys->l_hdr.lh_freelist = 0xffff;
+
+ /*
+ * Integer values must be stored in big-endian format.
+ */
+ switch (ent->intsz) {
+ case 1:
+ break;
+ case 2:
+ for (uint16_t *v = ent->val16p;
+ v - ent->val16p < (ptrdiff_t)ent->intcnt;
+ v++)
+ *v = htobe16(*v);
+ break;
+ case 4:
+ for (uint32_t *v = ent->val32p;
+ v - ent->val32p < (ptrdiff_t)ent->intcnt;
+ v++)
+ *v = htobe32(*v);
+ break;
+ case 8:
+ for (uint64_t *v = ent->val64p;
+ v - ent->val64p < (ptrdiff_t)ent->intcnt;
+ v++)
+ *v = htobe64(*v);
+ break;
+ default:
+ assert(0);
+ }
+
+ /*
+ * Finally, write out the leaf chunks for this entry.
+ */
+ le = ZAP_LEAF_ENTRY(&l, *lptr);
+ assert(le->le_type == ZAP_CHUNK_FREE);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_next = 0xffff;
+ le->le_name_chunk = *lptr + 1;
+ le->le_name_numints = namelen;
+ le->le_value_chunk = *lptr + 1 + nnamechunks;
+ le->le_value_intlen = ent->intsz;
+ le->le_value_numints = ent->intcnt;
+ le->le_hash = ent->hash;
+ zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name);
+ zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
+ ent->intcnt * ent->intsz, ent->valp);
+ }
+
+ /*
+ * Initialize unused slots of the pointer table.
+ */
+ for (int i = 0; i < ptrcnt; i++)
+ if (ptrhasht[i] == 0)
+ ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;
+
+ /*
+ * Write the whole thing to disk.
+ */
+ dnode = zap->dnode;
+ dnode->dn_nblkptr = 1;
+ dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
+ dnode->dn_maxblkid = lblkcnt + 1;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+ c = dnode_cursor_init(zfs, zap->os, zap->dnode,
+ (lblkcnt + 1) * blksz, blksz);
+
+ loc = objset_space_alloc(zfs, zap->os, &blksz);
+ vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
+ dnode_cursor_next(zfs, c, 0));
+
+ for (uint64_t i = 0; i < lblkcnt; i++) {
+ loc = objset_space_alloc(zfs, zap->os, &blksz);
+ vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
+ blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
+ }
+
+ dnode_cursor_finish(zfs, c);
+
+ free(leafblks);
+}
+
+static void
+zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+ zfs_zap_entry_t *ent;
+
+ if (zap->micro) {
+ zap_micro_write(zfs, zap);
+ } else {
+ assert(!STAILQ_EMPTY(&zap->kvps));
+ assert(zap->kvpcnt > 0);
+ zap_fat_write(zfs, zap);
+ }
+
+ while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
+ STAILQ_REMOVE_HEAD(&zap->kvps, next);
+ if (ent->val64p != &ent->val64)
+ free(ent->valp);
+ free(ent->name);
+ free(ent);
+ }
+}
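A usage sketch of the ZAP lifecycle implemented above; "zfs" and "mos" are assumed to be an initialized zfs_opt_t and its MOS object set:

static void
zap_usage_sketch(zfs_opt_t *zfs, zfs_objset_t *mos)
{
	zfs_zap_t zap;
	dnode_phys_t *dnode;
	uint64_t dnid;

	dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_init(&zap, mos, dnode);
	zap_add_uint64(&zap, "answer", 42);	/* still fits a micro ZAP */
	zap_add_string(&zap, "greeting", "hi");	/* strings force a fat ZAP */
	zap_write(zfs, &zap);	/* picks the format and frees the entries */
}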
+
+static nvlist_t *
+pool_config_nvcreate(zfs_opt_t *zfs)
+{
+ nvlist_t *featuresnv, *poolnv;
+
+ poolnv = nvlist_create(NV_UNIQUE_NAME);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG_INITIAL);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
+ nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->guid);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->guid);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->guid);
+ nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
+
+ featuresnv = nvlist_create(NV_UNIQUE_NAME);
+ nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
+ nvlist_destroy(featuresnv);
+
+ return (poolnv);
+}
+
+static nvlist_t *
+pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
+{
+ nvlist_t *diskvdevnv;
+
+ assert(zfs->objarrid != 0);
+
+ diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
+ nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->guid);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
+ nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG_INITIAL);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ zfs->objarrid);
+ nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ zfs->msshift);
+
+ return (diskvdevnv);
+}
+
+static nvlist_t *
+pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
+{
+ nvlist_t *diskvdevnv, *rootvdevnv;
+
+ diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
+ rootvdevnv = nvlist_create(NV_UNIQUE_NAME);
+
+ nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
+ nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->guid);
+ nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
+ nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG_INITIAL);
+ nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
+ 1);
+ nvlist_destroy(diskvdevnv);
+
+ return (rootvdevnv);
+}
+
+/*
+ * Create the pool's "config" object, which contains an nvlist describing pool
+ * parameters and the vdev topology. It is similar but not identical to the
+ * nvlist stored in vdev labels. The main difference is that vdev labels do not
+ * describe the full vdev tree and in particular do not contain the "root"
+ * meta-vdev.
+ */
+static void
+pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ dnode_phys_t *dnode;
+ nvlist_t *poolconfig, *vdevconfig;
+ zfs_objset_t *mos;
+ void *configbuf;
+ uint64_t dnid;
+ off_t configloc, configblksz;
+ int error;
+
+ mos = &zfs->mos;
+
+ dnode = objset_dnode_bonus_alloc(mos, DMU_OT_PACKED_NVLIST,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);
+
+ poolconfig = pool_config_nvcreate(zfs);
+
+ vdevconfig = pool_root_vdev_config_nvcreate(zfs);
+ nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
+ nvlist_destroy(vdevconfig);
+
+ error = nvlist_export(poolconfig);
+ if (error != 0)
+ errc(1, error, "nvlist_export");
+
+ configblksz = nvlist_size(poolconfig);
+ configloc = objset_space_alloc(zfs, mos, &configblksz);
+ configbuf = ecalloc(1, configblksz);
+ nvlist_copy(poolconfig, configbuf, configblksz);
+
+ vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);
+
+ dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
+ dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+ *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);
+
+ zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);
+
+ nvlist_destroy(poolconfig);
+ free(configbuf);
+}
+
+/*
+ * Add a pair of block pointer list objects, used for deferred frees. We don't
+ * do anything with them, but they need to be present, or OpenZFS will refuse
+ * to import the pool.
+ */
+static void
+pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ zfs_objset_t *mos;
+ uint64_t dnid;
+
+ mos = &zfs->mos;
+
+ (void)objset_dnode_bonus_alloc(mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
+ BPOBJ_SIZE_V2, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
+
+ (void)objset_dnode_bonus_alloc(mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
+ BPOBJ_SIZE_V2, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
+}
+
+/*
+ * Add required feature metadata objects. We don't know anything about ZFS
+ * features, so the objects are just empty ZAPs.
+ */
+static void
+pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ zfs_zap_t zap;
+ zfs_objset_t *mos;
+ dnode_phys_t *dnode;
+ uint64_t dnid;
+
+ mos = &zfs->mos;
+
+ dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
+ zap_init(&zap, mos, dnode);
+ zap_write(zfs, &zap);
+
+ dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
+ zap_init(&zap, mos, dnode);
+ zap_write(zfs, &zap);
+
+ dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid);
+ zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
+ zap_init(&zap, mos, dnode);
+ zap_write(zfs, &zap);
+}
+
+static void
+pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ uint64_t id;
+
+ id = zfs->rootdsldir.dirid;
+ assert(id > 0);
+ zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, id);
+}
+
+static void
+pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+ dnode_phys_t *dnode;
+ uint64_t id;
+
+ dnode = objset_dnode_alloc(&zfs->mos, DMU_OT_POOL_PROPS, &id);
+ zap_init(&zfs->poolprops, &zfs->mos, dnode);
+ zap_add_uint64(objdir, DMU_POOL_PROPS, id);
+}
+
+/*
+ * Initialize the MOS object directory, the root of virtually all of the pool's
+ * data and metadata.
+ */
+static void
+pool_init_objdir(zfs_opt_t *zfs)
+{
+ zfs_zap_t zap;
+ dnode_phys_t *objdir;
+
+ objdir = objset_dnode_lookup(&zfs->mos, DMU_POOL_DIRECTORY_OBJECT);
+
+ zap_init(&zap, &zfs->mos, objdir);
+ pool_init_objdir_config(zfs, &zap);
+ pool_init_objdir_bplists(zfs, &zap);
+ pool_init_objdir_feature_maps(zfs, &zap);
+ pool_init_objdir_dsl(zfs, &zap);
+ pool_init_objdir_poolprops(zfs, &zap);
+ zap_write(zfs, &zap);
+}
+
+/*
+ * Initialize the meta-object set and immediately write out several special
+ * objects whose contents are already finalized, including the object directory.
+ */
+static void
+pool_init(zfs_opt_t *zfs)
+{
+ struct dataset_desc *d;
+ zfs_objset_t *mos;
+ uint64_t dnid, dnodecount;
+
+ zfs->guid = 0xdeadfacec0debeef;
+
+ mos = &zfs->mos;
+
+ /*
+ * Figure out how many dnodes will be allocated from the MOS.
+ */
+ dnodecount = 0;
+ dnodecount++; /* object directory (ZAP) */
+ dnodecount++; /* |-> vdev config object (nvlist) */
+ dnodecount++; /* |-> features for read */
+ dnodecount++; /* |-> features for write */
+ dnodecount++; /* |-> feature descriptions */
+ dnodecount++; /* |-> sync bplist */
+ dnodecount++; /* |-> free bplist */
+ dnodecount++; /* |-> pool properties */
+ dnodecount++; /* L-> root DSL directory */
+ dnodecount++; /* |-> DSL child directory (ZAP) */
+ dnodecount++; /* | |-> $MOS (DSL dir) */
+ dnodecount++; /* | | |-> child map */
+ dnodecount++; /* | | L-> props (ZAP) */
+ dnodecount++; /* | |-> $FREE (DSL dir) */
+ dnodecount++; /* | | |-> child map */
+ dnodecount++; /* | | L-> props (ZAP) */
+ dnodecount++; /* | L-> $ORIGIN (DSL dir) */
+ dnodecount++; /* | |-> child map */
+ dnodecount++; /* | |-> dataset */
+ dnodecount++; /* | | L-> deadlist */
+ dnodecount++; /* | |-> snapshot */
+ dnodecount++; /* | | |-> deadlist */
+ dnodecount++; /* | | L-> snapshot names */
+ dnodecount++; /* | |-> props (ZAP) */
+ dnodecount++; /* | L-> clones (ZAP) */
+ dnodecount++; /* |-> DSL root dataset */
+ dnodecount++; /* | |-> snapshot names */
+ dnodecount++; /* | L-> deadlist */
+ dnodecount++; /* L-> props (ZAP) */
+ /*
+ * Space map stuff.
+ */
+ dnodecount++; /* space map object array */
+ dnodecount += zfs->mscount; /* space maps */
+ /*
+ * Child datasets.
+ */
+ STAILQ_FOREACH(d, &zfs->datasets, next) {
+ char buf[BUFSIZ];
+
+ /* Ugly hack to skip over root dataset parameters. */
+ snprintf(buf, sizeof(buf), "%s:", zfs->poolname);
+ if (strncmp(buf, d->params, strlen(buf)) == 0)
+ continue;
+
+ dnodecount++; /* DSL directory */
+ dnodecount++; /* |-> DSL dataset */
+ dnodecount++; /* | |-> snapshot names */
+ dnodecount++; /* | L-> deadlist */
+ dnodecount++; /* |-> child map */
+ dnodecount++; /* |-> props */
+ }
+
+ objset_init(zfs, mos, DMU_OST_META, dnodecount);
+
+ (void)objset_dnode_alloc(mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
+ assert(dnid == DMU_POOL_DIRECTORY_OBJECT);
+
+ (void)objset_dnode_alloc(mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);
+
+ dsl_init(zfs);
+
+ pool_init_objdir(zfs);
+}
+
+static void
+pool_labels_write(zfs_opt_t *zfs)
+{
+ uberblock_t *ub;
+ vdev_label_t *label;
+ nvlist_t *poolconfig, *vdevconfig;
+ int error;
+
+ label = ecalloc(1, sizeof(*label));
+
+ /*
+ * Assemble the vdev configuration and store it in the label.
+ */
+ poolconfig = pool_config_nvcreate(zfs);
+ vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
+ nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
+ nvlist_destroy(vdevconfig);
+
+ error = nvlist_export(poolconfig);
+ if (error != 0)
+ errc(1, error, "nvlist_export");
+ nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
+ sizeof(label->vl_vdev_phys.vp_nvlist));
+ nvlist_destroy(poolconfig);
+
+ /*
+	 * Fill out the uberblocks. Just make each one the same. The embedded
+ * checksum is calculated in vdev_label_write().
+ */
+ for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
+ uoff += (1 << zfs->ashift)) {
+ ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_version = SPA_VERSION;
+ ub->ub_txg = TXG_INITIAL;
+ ub->ub_guid_sum = zfs->guid + zfs->guid; /* root + disk */
+ ub->ub_timestamp = 0; /* XXX-MJ */
+
+ ub->ub_software_version = SPA_VERSION;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ ub->ub_mmp_delay = 0;
+ ub->ub_mmp_config = 0;
+ ub->ub_checkpoint_txg = 0;
+ memcpy(&ub->ub_rootbp, &zfs->mos.osbp, sizeof(blkptr_t));
+ }
+
+ /*
+ * Write out four copies of the label: two at the beginning of the vdev
+ * and two at the end.
+ */
+ for (int i = 0; i < VDEV_LABELS; i++)
+ vdev_label_write(zfs, i, label);
+
+ free(label);
+}
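For reference, a sketch of the label placement that vdev_label_write() is assumed to implement, matching stock ZFS: two 256KB labels at the front of the vdev and two at the back.

static off_t
vdev_label_offset_sketch(off_t vdevsize, int l)
{
	assert(l >= 0 && l < VDEV_LABELS);
	return (l < 2 ? l * (off_t)sizeof(vdev_label_t) :
	    vdevsize - (VDEV_LABELS - l) * (off_t)sizeof(vdev_label_t));
}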
+
+static void
+pool_fini(zfs_opt_t *zfs)
+{
+ zap_write(zfs, &zfs->poolprops);
+ dsl_write(zfs);
+ objset_mos_write(zfs);
+ pool_labels_write(zfs);
+}
+
+/*
+ * Visit each node in a directory hierarchy, in pre-order depth-first order.
+ */
+static void
+fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
+{
+ assert(root->type == S_IFDIR);
+
+ for (fsnode *cur = root; cur != NULL; cur = cur->next) {
+ assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
+ cur->type == S_IFLNK);
+
+ if (cb(cur, arg) == 0)
+ continue;
+ if (cur->type == S_IFDIR && cur->child != NULL)
+ fsnode_foreach(cur->child, cb, arg);
+ }
+}
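An illustrative (hypothetical) callback for fsnode_foreach(): returning 0 prunes the walk below a directory, while returning non-zero descends into it, mirroring the contract used by fs_foreach_count() and fs_foreach_populate() below.

static int
fsnode_count_regular_cb(fsnode *cur, void *arg)
{
	uint64_t *countp = arg;

	if (cur->type == S_IFREG)
		(*countp)++;
	return (1);	/* always visit the children of directories */
}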
+
+static bool
+fsnode_isroot(const fsnode *cur)
+{
+ return (strcmp(cur->name, ".") == 0);
+}
+
+static struct dnode_cursor *
+dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
+ off_t size, off_t blksz)
+{
+ struct dnode_cursor *c;
+ uint64_t nbppindir, indlevel, ndatablks, nindblks;
+
+ assert(dnode->dn_nblkptr == 1);
+ assert(blksz <= MAXBLOCKSIZE);
+
+ if (blksz == 0) {
+ /* Must be between 1<<ashift and 128KB. */
+ blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
+ powerof2(size) ? size : (1ul << flsl(size))));
+ }
+ assert(powerof2(blksz));
+
+ /*
+ * Do we need indirect blocks? Figure out how many levels are needed
+ * (indlevel == 1 means no indirect blocks) and how much space is needed
+ * (it has to be allocated up-front to break the dependency cycle
+ * described in objset_mos_write()).
+ */
+ ndatablks = size == 0 ? 0 : howmany(size, blksz);
+ nindblks = 0;
+ for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
+ nbppindir *= BLKPTR_PER_INDIR;
+ nindblks += howmany(ndatablks, indlevel * nbppindir);
+ }
+ assert(indlevel < INDIR_LEVELS);
+
+ dnode->dn_nlevels = (uint8_t)indlevel;
+ dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
+ dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
+
+ c = ecalloc(1, sizeof(*c));
+ if (nindblks > 0) {
+ c->indspace = nindblks * MAXBLOCKSIZE;
+ c->indloc = objset_space_alloc(zfs, os, &c->indspace);
+ }
+ c->dnode = dnode;
+ c->dataoff = 0;
+ c->datablksz = blksz;
+
+ return (c);
+}
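A worked example of the indirection arithmetic above, assuming 128KB (MAXBLOCKSIZE) indirect blocks and 128-byte block pointers, so BLKPTR_PER_INDIR == 1024:

	/*
	 * A 16MB file with 128KB data blocks has
	 *
	 *	ndatablks = howmany(16MB, 128KB) == 128
	 *
	 * blocks; the loop runs once (128 > 1, 128 <= 1024), leaving
	 * indlevel == 2 and nindblks == 1, i.e., a single L1 indirect
	 * block referenced directly from the dnode.
	 */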
+
+static void
+_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
+{
+ blkptr_t *bp, *pbp;
+ void *buf;
+ uint64_t fill;
+ off_t blkid, blksz, loc;
+
+ assert(levels > 0);
+ assert(levels <= c->dnode->dn_nlevels - 1);
+
+ blksz = MAXBLOCKSIZE;
+ blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
+ for (int level = 1; level <= levels; level++) {
+ buf = c->inddir[level - 1];
+
+ if (level == c->dnode->dn_nlevels - 1) {
+ pbp = &c->dnode->dn_blkptr[0];
+ } else {
+ uint64_t iblkid;
+
+ iblkid = blkid & (BLKPTR_PER_INDIR - 1);
+ pbp = (blkptr_t *)
+ &c->inddir[level][iblkid * sizeof(blkptr_t)];
+ }
+
+ /*
+ * Space for indirect blocks is allocated up-front; see the
+ * comment in objset_mos_write().
+ */
+ loc = c->indloc;
+ c->indloc += blksz;
+ assert(c->indspace >= blksz);
+ c->indspace -= blksz;
+
+ bp = buf;
+ fill = 0;
+ for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
+ fill += BP_GET_FILL(&bp[i]);
+
+ vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
+ loc, pbp);
+ memset(buf, 0, MAXBLOCKSIZE);
+
+ blkid /= BLKPTR_PER_INDIR;
+ }
+}
+
+static blkptr_t *
+dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
+{
+ off_t blkid, l1id;
+ int levels;
+
+ if (c->dnode->dn_nlevels == 1) {
+ assert(off < MAXBLOCKSIZE);
+ return (&c->dnode->dn_blkptr[0]);
+ }
+
+ assert(off % c->datablksz == 0);
+
+ /* Do we need to flush any full indirect blocks? */
+ if (off > 0) {
+ blkid = off / c->datablksz;
+ for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
+ if (blkid % BLKPTR_PER_INDIR != 0)
+ break;
+ blkid /= BLKPTR_PER_INDIR;
+ }
+ if (levels > 0)
+ _dnode_cursor_flush(zfs, c, levels);
+ }
+
+ c->dataoff = off;
+ l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
+ return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
+}
+
+static void
+dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
+{
+ int levels;
+
+ levels = c->dnode->dn_nlevels - 1;
+ if (levels > 0)
+ _dnode_cursor_flush(zfs, c, levels);
+ assert(c->indspace == 0);
+ free(c);
+}
+
+struct fs_populate_dir {
+ SLIST_ENTRY(fs_populate_dir) next;
+ int dirfd;
+ uint64_t objid;
+ zfs_zap_t zap;
+};
+
+struct fs_populate_arg {
+ zfs_opt_t *zfs;
+ zfs_fs_t *fs; /* owning filesystem */
+ int dirfd; /* current directory fd */
+ uint64_t rootdirid; /* root directory dnode ID */
+ SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */
+};
+
+static void
+fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
+{
+ struct fs_populate_dir *dir;
+ uint64_t type;
+
+ switch (cur->type) {
+ case S_IFREG:
+ type = DT_REG;
+ break;
+ case S_IFDIR:
+ type = DT_DIR;
+ break;
+ case S_IFLNK:
+ type = DT_LNK;
+ break;
+ default:
+ assert(0);
+ }
+
+ dir = SLIST_FIRST(&arg->dirs);
+ zap_add_uint64(&dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
+}
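For reference, ZFS_DIRENT_MAKE is assumed to pack entries the way OpenZFS does, with the 4-bit dirent type in the high bits of the object number:

	/*
	 *	ZFS_DIRENT_MAKE(type, obj) == (((uint64_t)(type) << 60) | (obj))
	 *
	 * e.g., a regular file (DT_REG == 8) with dnode ID 5 is stored as
	 * 0x8000000000000005.
	 */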
+
+static void
+fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
+ size_t *szp)
+{
+ assert(ind < fs->sacnt);
+ assert(fs->saoffs[ind] != 0xffff);
+
+ memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
+ *szp += fs->satab[ind].size;
+}
+
+static void
+fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
+ size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
+{
+ assert(ind < fs->sacnt);
+ assert(fs->saoffs[ind] != 0xffff);
+ assert(fs->satab[ind].size == 0);
+
+ memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
+ *szp += valsz;
+}
+
+static void
+fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
+ dnode_phys_t *dnode)
+{
+ char target[PATH_MAX];
+ zfs_fs_t *fs;
+ zfs_ace_hdr_t aces[3];
+ struct stat *sb;
+ sa_hdr_phys_t *sahdr;
+ uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
+ char *attrbuf;
+ size_t bonussz, hdrsz;
+ int layout;
+
+ assert(dnode->dn_bonustype == DMU_OT_SA);
+ assert(dnode->dn_nblkptr == 1);
+
+ fs = arg->fs;
+ sb = &cur->inode->st;
+
+ switch (cur->type) {
+ case S_IFREG:
+ layout = SA_LAYOUT_INDEX_DEFAULT;
+ links = cur->inode->nlink;
+ objsize = sb->st_size;
+ parent = SLIST_FIRST(&arg->dirs)->objid;
+ break;
+ case S_IFDIR:
+ layout = SA_LAYOUT_INDEX_DEFAULT;
+ links = 1; /* .. */
+ objsize = 1; /* .. */
+
+ /*
+ * The size of a ZPL directory is the number of entries
+ * (including "." and ".."), and the link count is the number of
+ * entries which are directories (including "." and "..").
+ */
+ for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
+ c != NULL; c = c->next) {
+ if (c->type == S_IFDIR)
+ links++;
+ objsize++;
+ }
+
+ /* The root directory is its own parent. */
+ parent = SLIST_EMPTY(&arg->dirs) ?
+ arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
+ break;
+ case S_IFLNK: {
+ ssize_t n;
+
+ if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
+ target, sizeof(target) - 1)) == -1)
+ err(1, "readlinkat(%s)", cur->name);
+ target[n] = '\0';
+
+ layout = SA_LAYOUT_INDEX_SYMLINK;
+ links = 1;
+ objsize = strlen(target);
+ parent = SLIST_FIRST(&arg->dirs)->objid;
+ break;
+ }
+ default:
+ assert(0);
+ }
+
+ daclcount = nitems(aces);
+ flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED |
+ ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */
+ gen = 1;
+ gid = sb->st_gid;
+ mode = sb->st_mode;
+ uid = sb->st_uid;
+
+ /* XXX-MJ need to review these */
+ memset(aces, 0, sizeof(aces));
+ aces[0].z_flags = ACE_OWNER;
+ aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ aces[0].z_access_mask = ACE_READ_DATA | ACE_WRITE_ATTRIBUTES |
+ ACE_WRITE_OWNER | ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS |
+ ACE_READ_ACL | ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+ ACE_SYNCHRONIZE;
+ aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
+ aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ aces[1].z_access_mask = ACE_READ_DATA | ACE_READ_ACL |
+ ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+ aces[2].z_flags = ACE_EVERYONE;
+ aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ aces[2].z_access_mask = ACE_READ_DATA | ACE_READ_ACL |
+ ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+
+ switch (layout) {
+ case SA_LAYOUT_INDEX_DEFAULT:
+ /* At most one variable-length attribute. */
+ hdrsz = sizeof(uint64_t);
+ break;
+ case SA_LAYOUT_INDEX_SYMLINK:
+ /* At most five variable-length attributes. */
+ hdrsz = sizeof(uint64_t) * 2;
+ break;
+ default:
+ assert(0);
+ }
+
+ sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
+ sahdr->sa_magic = SA_MAGIC;
+ SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);
+
+ bonussz = SA_HDR_SIZE(sahdr);
+ attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);
+
+ fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
+ fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
+ fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
+ fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
+ fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
+ fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
+ fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
+ fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
+ fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);
+
+ /*
+ * We deliberately set atime = mtime here to ensure that images are
+ * reproducible.
+ */
+ assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
+ assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
+ assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
+ assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size);
+ fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
+
+ fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
+ ZPL_DACL_ACES, &bonussz);
+ sahdr->sa_lengths[0] = sizeof(aces);
+
+ if (cur->type == S_IFLNK) {
+ assert(layout == SA_LAYOUT_INDEX_SYMLINK);
+ /* Need to use a spill block pointer if the target is long. */
+ assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
+ fs_populate_varszattr(fs, attrbuf, target, objsize,
+ sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
+ sahdr->sa_lengths[1] = (uint16_t)objsize;
+ }
+
+ dnode->dn_bonuslen = bonussz;
+}
+
+static void
+fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
+{
+ struct dnode_cursor *c;
+ dnode_phys_t *dnode;
+ zfs_opt_t *zfs;
+ char *buf;
+ uint64_t dnid;
+ ssize_t n;
+ size_t bufsz;
+ off_t size, target;
+ int fd;
+
+ assert(cur->type == S_IFREG);
+ assert((cur->inode->flags & FI_ROOT) == 0);
+
+ zfs = arg->zfs;
+
+ assert(cur->inode->ino != 0);
+ if ((cur->inode->flags & FI_ALLOCATED) != 0) {
+ /*
+ * This is a hard link of an existing file.
+ *
+ * XXX-MJ need to check whether it crosses datasets, add a test
+ * case for that
+ */
+ fs_populate_dirent(arg, cur, cur->inode->ino);
+ return;
+ }
+
+ dnode = objset_dnode_bonus_alloc(arg->fs->os,
+ DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+ cur->inode->ino = dnid;
+ cur->inode->flags |= FI_ALLOCATED;
+
+ fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY);
+ if (fd == -1)
+ err(1, "openat(%s)", cur->name);
+
+ buf = zfs->filebuf;
+ bufsz = sizeof(zfs->filebuf);
+ size = cur->inode->st.st_size;
+ c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
+ for (off_t foff = 0; foff < size; foff += target) {
+ off_t loc, sofar;
+
+ /* Fill up our buffer, handling partial reads. */
+ sofar = 0;
+ target = MIN(size - foff, (off_t)bufsz);
+ do {
+ n = read(fd, buf + sofar, target);
+ if (n < 0)
+ err(1, "reading from '%s'", cur->name);
+ if (n == 0)
+ errx(1, "unexpected EOF reading '%s'",
+ cur->name);
+ sofar += n;
+ } while (sofar < target);
+
+ if (target < (off_t)bufsz)
+ memset(buf + target, 0, bufsz - target);
+
+ loc = objset_space_alloc(zfs, arg->fs->os, &target);
+ vdev_pwrite_dnode_indir(zfs, c->dnode, 0, 1, buf, target, loc,
+ dnode_cursor_next(zfs, c, foff));
+ }
+ if (close(fd) != 0)
+ err(1, "close");
+ dnode_cursor_finish(zfs, c);
+
+ fs_populate_sattrs(arg, cur, dnode);
+ fs_populate_dirent(arg, cur, dnid);
+}
+
+static void
+fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
+{
+ dnode_phys_t *dnode;
+ zfs_objset_t *os;
+ uint64_t dnid;
+ int dirfd;
+
+ assert(cur->type == S_IFDIR);
+ assert((cur->inode->flags & FI_ALLOCATED) == 0);
+
+ os = arg->fs->os;
+
+ dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_SA, 0, &dnid);
+
+ /*
+ * Add an entry to the parent directory and open this directory.
+ */
+ if (!SLIST_EMPTY(&arg->dirs)) {
+ fs_populate_dirent(arg, cur, dnid);
+ dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
+ O_DIRECTORY);
+ if (dirfd < 0)
+ err(1, "open(%s)", cur->name);
+ } else {
+ arg->rootdirid = dnid;
+ dirfd = arg->dirfd;
+ }
+
+ fs_populate_sattrs(arg, cur, dnode);
+
+ /*
+ * If this is a root directory, then its children belong to a different
+ * dataset and this directory remains empty in the current objset.
+ */
+ if ((cur->inode->flags & FI_ROOT) == 0) {
+ struct fs_populate_dir *dir;
+
+ dir = ecalloc(1, sizeof(*dir));
+ dir->dirfd = dirfd;
+ dir->objid = dnid;
+ zap_init(&dir->zap, os, dnode);
+ SLIST_INSERT_HEAD(&arg->dirs, dir, next);
+ } else {
+ zfs_zap_t dirzap;
+
+ zap_init(&dirzap, os, dnode);
+ zap_write(arg->zfs, &dirzap);
+
+ fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
+ }
+}
+
+static void
+fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
+{
+ dnode_phys_t *dnode;
+ uint64_t dnid;
+
+ assert(cur->type == S_IFLNK);
+ assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
+
+ dnode = objset_dnode_bonus_alloc(arg->fs->os,
+ DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+
+ fs_populate_dirent(arg, cur, dnid);
+
+ fs_populate_sattrs(arg, cur, dnode);
+}
+
+static int
+fs_foreach_populate(fsnode *cur, void *_arg)
+{
+ struct fs_populate_arg *arg;
+ struct fs_populate_dir *dir;
+ int ret;
+
+ arg = _arg;
+ switch (cur->type) {
+ case S_IFREG:
+ fs_populate_file(cur, arg);
+ break;
+ case S_IFDIR:
+ if (fsnode_isroot(cur))
+ break;
+ fs_populate_dir(cur, arg);
+ break;
+ case S_IFLNK:
+ fs_populate_symlink(cur, arg);
+ break;
+ default:
+ assert(0);
+ }
+
+ ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
+
+ if (cur->next == NULL &&
+ (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
+ /*
+ * We reached a terminal node in a subtree. Walk back up and
+ * write out directories. We're done once we hit the root of a
+ * dataset or find a level where we're not on the edge of the
+ * tree.
+ */
+ do {
+ dir = SLIST_FIRST(&arg->dirs);
+ SLIST_REMOVE_HEAD(&arg->dirs, next);
+ zap_write(arg->zfs, &dir->zap);
+ if (dir->dirfd != -1 && close(dir->dirfd) != 0)
+ err(1, "close");
+ free(dir);
+ cur = cur->parent;
+ } while (cur != NULL && cur->next == NULL &&
+ (cur->inode->flags & FI_ROOT) == 0);
+ }
+
+ return (ret);
+}
+
+static void
+fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
+ const sa_attr_type_t layout[], size_t sacnt)
+{
+ char ti[16];
+
+ assert(sizeof(layout[0]) == 2);
+
+ snprintf(ti, sizeof(ti), "%u", index);
+ zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
+ (const uint8_t *)layout);
+}
+
+/*
+ * Initialize system attribute tables.
+ *
+ * There are two elements to this. First, we write the zpl_attrs[] and
+ * zpl_attr_layout[] tables to disk. Then we create a lookup table which
+ * allows us to set file attributes quickly.
+ */
+static uint64_t
+fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
+{
+ zfs_zap_t sazap, salzap, sarzap;
+ zfs_objset_t *os;
+ dnode_phys_t *saobj, *salobj, *sarobj;
+ uint64_t saobjid, salobjid, sarobjid;
+ uint16_t offset;
+
+ os = fs->os;
+
+ /*
+ * The on-disk tables are stored in two ZAP objects, the registry object
+ * and the layout object. Individual attributes are described by
+ * entries in the registry object; for example, the value for the
+ * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
+ * The attributes of a file are ordered according to one of the layouts
+ * defined in the layout object. The master node object is simply used
+ * to locate the registry and layout objects.
+ */
+ saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
+ salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
+ sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
+
+ zap_init(&sarzap, os, sarobj);
+ for (size_t i = 0; i < nitems(zpl_attrs); i++) {
+ const zfs_sattr_t *sa;
+ uint64_t attr;
+
+ attr = 0;
+ sa = &zpl_attrs[i];
+ SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
+ zap_add_uint64(&sarzap, sa->name, attr);
+ }
+ zap_write(zfs, &sarzap);
+
+ /*
+ * Layouts are arrays of indices into the registry. We define two
+ * layouts for use by the ZPL, one for non-symlinks and one for
+ * symlinks. They are identical except that the symlink layout includes
+ * ZPL_SYMLINK as its final attribute.
+ */
+ zap_init(&salzap, os, salobj);
+ assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
+ fs_add_zpl_attr_layout(&salzap, SA_LAYOUT_INDEX_DEFAULT,
+ zpl_attr_layout, nitems(zpl_attr_layout) - 1);
+ fs_add_zpl_attr_layout(&salzap, SA_LAYOUT_INDEX_SYMLINK,
+ zpl_attr_layout, nitems(zpl_attr_layout));
+ zap_write(zfs, &salzap);
+
+ zap_init(&sazap, os, saobj);
+ zap_add_uint64(&sazap, SA_LAYOUTS, salobjid);
+ zap_add_uint64(&sazap, SA_REGISTRY, sarobjid);
+ zap_write(zfs, &sazap);
+
+ /* Sanity check. */
+ for (size_t i = 0; i < nitems(zpl_attrs); i++)
+ assert(i == zpl_attrs[i].id);
+
+ /*
+ * Build the offset table used when setting file attributes. File
+ * attributes are stored in the object's bonus buffer; this table
+ * provides the buffer offset of attributes referenced by the layout
+ * table.
+ */
+ fs->sacnt = nitems(zpl_attrs);
+ fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
+ for (size_t i = 0; i < fs->sacnt; i++)
+ fs->saoffs[i] = 0xffff;
+ offset = 0;
+ for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
+ uint16_t size;
+
+ assert(zpl_attr_layout[i] < fs->sacnt);
+
+ fs->saoffs[zpl_attr_layout[i]] = offset;
+ size = zpl_attrs[zpl_attr_layout[i]].size;
+ offset += size;
+ }
+ fs->satab = zpl_attrs;
+
+ return (saobjid);
+}
+
+static void
+fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
+{
+ char *mountpoint, *origmountpoint, *name, *next;
+ fsnode *cur, *root;
+ uint64_t canmount;
+
+ if (dsldir->headds == NULL)
+ return;
+
+ mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
+ if (mountpoint == NULL)
+ return;
+ if (nvlist_find_uint64(dsldir->propsnv, "canmount", &canmount) == 0 &&
+ canmount == 0)
+ return;
+
+ /*
+	 * If the user requested a bootfs, set the pool property here.
+ */
+ if (zfs->bootfs != NULL && strcmp(zfs->bootfs, dsldir->fullname) == 0)
+ zap_add_uint64(&zfs->poolprops, "bootfs", dsldir->headds->dsid);
+
+ origmountpoint = mountpoint;
+
+ /*
+ * Figure out which fsnode corresponds to our mountpoint.
+ */
+ root = arg;
+ cur = root;
+ if (strcmp(mountpoint, zfs->rootpath) != 0) {
+ mountpoint += strlen(zfs->rootpath);
+
+ /*
+ * Look up the directory in the staged tree. For example, if
+ * the dataset's mount point is /foo/bar/baz, we'll search the
+		 * root directory for "foo", search "foo" for "bar", and so on.
+ * Each intermediate name must refer to a directory; the final
+ * component need not exist.
+ */
+ cur = root;
+ for (next = name = mountpoint; next != NULL;) {
+ for (; *next == '/'; next++)
+ ;
+ name = strsep(&next, "/");
+
+ for (; cur != NULL && strcmp(cur->name, name) != 0;
+ cur = cur->next)
+ ;
+ if (cur == NULL) {
+ if (next == NULL)
+ break;
+ errx(1, "missing mountpoint directory for `%s'",
+ dsldir->fullname);
+ }
+ if (cur->type != S_IFDIR) {
+ errx(1,
+ "mountpoint for `%s' is not a directory",
+ dsldir->fullname);
+ }
+ if (next != NULL)
+ cur = cur->child;
+ }
+ }
+
+ if (cur != NULL) {
+ assert(cur->type == S_IFDIR);
+
+ /*
+ * Multiple datasets shouldn't share a mountpoint. It's
+ * technically allowed, but it's not clear what makefs should do
+ * in that case.
+ */
+ assert((cur->inode->flags & FI_ROOT) == 0);
+ if (cur != root)
+ cur->inode->flags |= FI_ROOT;
+ assert(cur->inode->param == NULL);
+ cur->inode->param = dsldir;
+ }
+
+ free(origmountpoint);
+}
+
+static int
+fs_foreach_count(fsnode *cur, void *arg)
+{
+ uint64_t *countp;
+
+ countp = arg;
+ if (cur->type == S_IFDIR && fsnode_isroot(cur))
+ return (1);
+
+ if (cur->inode->ino == 0) {
+ cur->inode->ino = ++(*countp);
+ cur->inode->nlink = 1;
+ } else {
+ cur->inode->nlink++;
+ }
+
+ return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
+}
+
+/*
+ * Create a filesystem dataset. More specifically:
+ * - create an object set for the dataset,
+ * - add required metadata (SA tables, property definitions, etc.) to that
+ * object set,
+ * - optionally populate the object set with file objects, using "root" as the
+ * root directory.
+ *
+ * "dirfd" is a directory descriptor for the directory referenced by "root". It
+ * is closed before returning.
+ */
+static void
+fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
+{
+ struct fs_populate_arg arg;
+ zfs_fs_t fs;
+ zfs_zap_t deleteqzap, masterzap;
+ zfs_objset_t *os;
+ dnode_phys_t *deleteq, *masterobj;
+ uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
+ bool fakedroot;
+
+ if (root != NULL) {
+ assert(root->type == S_IFDIR);
+ assert(fsnode_isroot(root));
+ }
+
+ os = ecalloc(1, sizeof(*os));
+
+ memset(&fs, 0, sizeof(fs));
+ fs.os = os;
+
+ /*
+	 * If the dataset's mountpoint doesn't exist in the staging tree, fake
+	 * up a root fsnode to handle that case.
+ */
+ fakedroot = root == NULL;
+ if (fakedroot) {
+ struct stat *stp;
+
+ assert(dirfd == -1);
+
+ root = ecalloc(1, sizeof(*root));
+ root->inode = ecalloc(1, sizeof(*root->inode));
+ root->name = estrdup(".");
+ root->type = S_IFDIR;
+
+ stp = &root->inode->st;
+ stp->st_uid = 0;
+ stp->st_gid = 0;
+ stp->st_mode = S_IFDIR | 0755;
+ }
+
+ /*
+ * How many dnodes do we need? One for each file/directory/symlink plus
+ * several metadata objects.
+ */
+ dnodecount = 1; /* root directory */
+ fsnode_foreach(root, fs_foreach_count, &dnodecount);
+ dnodecount++; /* master object */
+ dnodecount++; /* delete queue */
+ dnodecount++; /* system attributes master node */
+ dnodecount++; /* system attributes registry */
+ dnodecount++; /* system attributes layout */
+
+ objset_init(zfs, os, DMU_OST_ZFS, dnodecount);
+ masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
+ assert(moid == MASTER_NODE_OBJ);
+
+ /*
+ * Create the ZAP SA layout now since filesystem object dnodes will
+ * refer to those attributes.
+ */
+ saobjid = fs_set_zpl_attrs(zfs, &fs);
+
+ /*
+ * Populate the dataset with files from the staging directory. Most of
+ * our runtime is spent here.
+ */
+ arg.dirfd = dirfd;
+ arg.zfs = zfs;
+ arg.fs = &fs;
+ SLIST_INIT(&arg.dirs);
+ fs_populate_dir(root, &arg);
+ assert(!SLIST_EMPTY(&arg.dirs));
+ fsnode_foreach(root, fs_foreach_populate, &arg);
+ assert(SLIST_EMPTY(&arg.dirs));
+ rootdirid = arg.rootdirid;
+
+ /*
+ * Create an empty delete queue. We don't do anything with it, but
+ * OpenZFS will refuse to mount filesystems that don't have one.
+ */
+ deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
+ zap_init(&deleteqzap, os, deleteq);
+ zap_write(zfs, &deleteqzap);
+
+ /*
+ * Populate and write the master node object. This is a ZAP object
+ * containing various dataset properties and the object IDs of the root
+ * directory and delete queue.
+ */
+ zap_init(&masterzap, os, masterobj);
+ zap_add_uint64(&masterzap, ZFS_ROOT_OBJ, rootdirid);
+ zap_add_uint64(&masterzap, ZFS_UNLINKED_SET, deleteqid);
+ zap_add_uint64(&masterzap, ZFS_SA_ATTRS, saobjid);
+ zap_add_uint64(&masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
+ zap_add_uint64(&masterzap, "normalization", 0 /* off */);
+ zap_add_uint64(&masterzap, "utf8only", 0 /* off */);
+ zap_add_uint64(&masterzap, "casesensitivity", 0 /* case sensitive */);
+ zap_add_uint64(&masterzap, "acltype", 2 /* NFSv4 */);
+ zap_write(zfs, &masterzap);
+
+ /*
+ * All finished with this object set, we may as well write it now.
+ * The DSL layer will sum up the bytes consumed by each dataset using
+ * information stored in the object set, so it can't be freed just yet.
+ */
+ assert(dsldir != NULL);
+ dsldir->headds->os = os;
+ objset_write(zfs, os);
+
+ if (fakedroot) {
+ free(root->inode);
+ free(root->name);
+ free(root);
+ }
+ free(fs.saoffs);
+}
+
+static void
+fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
+{
+ if (dsldir->headds == NULL)
+ return;
+ if (dsldir->headds->os != NULL)
+ return;
+ fs_build_one(zfs, dsldir, NULL, -1);
+}
+
+/*
+ * Create our datasets and populate them with files.
+ */
+static void
+fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
+{
+ /*
+ * Run through our datasets and find the root fsnode for each one. Each
+ * root fsnode is flagged so that we can figure out which dataset it
+ * belongs to.
+ */
+ dsl_dir_foreach(zfs, &zfs->rootdsldir, fs_layout_one, root);
+
+ /*
+ * Did we find our boot filesystem?
+ */
+ if (zfs->bootfs != NULL && !zap_entry_exists(&zfs->poolprops, "bootfs"))
+ errx(1, "no mounted dataset matches bootfs property `%s'",
+ zfs->bootfs);
+
+ /*
+ * Traverse the file hierarchy starting from the root fsnode. One
+ * dataset, not necessarily the root dataset, must "own" the root
+ * directory by having its mountpoint be equal to the root path.
+ *
+ * As roots of other datasets are encountered during the traversal,
+ * fs_build_one() recursively creates the corresponding object sets and
+ * populates them. Once this function has returned, all datasets will
+ * have been fully populated.
+ */
+ fs_build_one(zfs, root->inode->param, root, dirfd);
+
+ /*
+ * Now create object sets for datasets whose mountpoints weren't found
+ * in the staging directory, either because there is no mountpoint, or
+ * because the mountpoint doesn't correspond to an existing directory.
+ */
+ dsl_dir_foreach(zfs, &zfs->rootdsldir, fs_build_unmounted, NULL);
+}
+
+/*
+ * The entry point to all other code in this file.
+ */
+void
+zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
+{
+ zfs_opt_t *zfs;
+ int dirfd;
+
+ zfs = fsopts->fs_specific;
+
+ /*
+ * Use a fixed seed to provide reproducible pseudo-random numbers for
+ * on-disk structures when needed (e.g., ZAP hash salts).
+ */
+ srandom(1729);
+
+ zfs_check_opts(fsopts);
+
+ dirfd = open(dir, O_DIRECTORY | O_RDONLY);
+ if (dirfd < 0)
+ err(1, "open(%s)", dir);
+
+ vdev_init(zfs, fsopts->maxsize, image);
+ pool_init(zfs);
+ fs_build(zfs, dirfd, root);
+ pool_fini(zfs);
+ vdev_fini(zfs);
+}
Index: usr.sbin/makefs/zfs/Makefile.inc
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/Makefile.inc
@@ -0,0 +1,5 @@
+.PATH: ${SRCDIR}/zfs
+
+SRCS+= nvlist.c
+
+CFLAGS.nvlist.c+= -Wno-cast-qual
Index: usr.sbin/makefs/zfs/nvlist.h
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/nvlist.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2012 Andriy Gapon <avg@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NVLIST_H_
+#define _NVLIST_H_
+
+/* nvp implementation version */
+#define NV_VERSION 0
+
+/* nvlist persistent unique name flags, stored in nvl_nvflag */
+#define NV_UNIQUE_NAME 0x1
+#define NV_UNIQUE_NAME_TYPE 0x2
+
+#define NV_ALIGN4(x) (((x) + 3) & ~3)
+#define NV_ALIGN(x) (((x) + 7) & ~7)
+
+/* nvlist pack encoding */
+#define NV_ENCODE_NATIVE 0
+#define NV_ENCODE_XDR 1
+
+typedef enum {
+ DATA_TYPE_UNKNOWN = 0,
+ DATA_TYPE_BOOLEAN,
+ DATA_TYPE_BYTE,
+ DATA_TYPE_INT16,
+ DATA_TYPE_UINT16,
+ DATA_TYPE_INT32,
+ DATA_TYPE_UINT32,
+ DATA_TYPE_INT64,
+ DATA_TYPE_UINT64,
+ DATA_TYPE_STRING,
+ DATA_TYPE_BYTE_ARRAY,
+ DATA_TYPE_INT16_ARRAY,
+ DATA_TYPE_UINT16_ARRAY,
+ DATA_TYPE_INT32_ARRAY,
+ DATA_TYPE_UINT32_ARRAY,
+ DATA_TYPE_INT64_ARRAY,
+ DATA_TYPE_UINT64_ARRAY,
+ DATA_TYPE_STRING_ARRAY,
+ DATA_TYPE_HRTIME,
+ DATA_TYPE_NVLIST,
+ DATA_TYPE_NVLIST_ARRAY,
+ DATA_TYPE_BOOLEAN_VALUE,
+ DATA_TYPE_INT8,
+ DATA_TYPE_UINT8,
+ DATA_TYPE_BOOLEAN_ARRAY,
+ DATA_TYPE_INT8_ARRAY,
+ DATA_TYPE_UINT8_ARRAY
+} data_type_t;
+
+/*
+ * nvlist header.
+ * An nvlist has a 4-byte header followed by the version and flags, then
+ * nvpairs, and the list is terminated by a double zero.
+ */
+typedef struct {
+ char nvh_encoding;
+ char nvh_endian;
+ char nvh_reserved1;
+ char nvh_reserved2;
+} nvs_header_t;
+
+typedef struct {
+ nvs_header_t nv_header;
+ size_t nv_asize;
+ size_t nv_size;
+ uint8_t *nv_data;
+ uint8_t *nv_idx;
+} nvlist_t;
+
+/*
+ * nvpair header.
+ * An nvpair consists of the encoded and decoded sizes, the name string
+ * (size and data), the data type and number of elements, and finally the
+ * data itself.
+ */
+typedef struct {
+ unsigned encoded_size;
+ unsigned decoded_size;
+} nvp_header_t;
+
+/*
+ * nvlist stream head.
+ */
+typedef struct {
+ unsigned nvl_version;
+ unsigned nvl_nvflag;
+ nvp_header_t nvl_pair;
+} nvs_data_t;
+
+typedef struct {
+ unsigned nv_size;
+ uint8_t nv_data[]; /* NV_ALIGN4(string) */
+} nv_string_t;
+
+typedef struct {
+ unsigned nv_type; /* data_type_t */
+ unsigned nv_nelem; /* number of elements */
+ uint8_t nv_data[]; /* data stream */
+} nv_pair_data_t;
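+
+/*
+ * Illustrative layout (not normative): in the stream, each nvpair is an
+ * nvp_header_t, then the name as an nv_string_t padded to a 4-byte
+ * boundary, then an nv_pair_data_t. A DATA_TYPE_UINT64 pair named
+ * "ashift" therefore encodes to 4+4 (header) + 4+8 (name) +
+ * 4+4 (type and nelem) + 8 (value) = 36 bytes.
+ */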
+
+nvlist_t *nvlist_create(int);
+void nvlist_destroy(nvlist_t *);
+nvlist_t *nvlist_import(const char *, size_t);
+int nvlist_export(nvlist_t *);
+int nvlist_remove(nvlist_t *, const char *, data_type_t);
+int nvpair_type_from_name(const char *);
+nvp_header_t *nvpair_find(nvlist_t *, const char *);
+void nvpair_print(nvp_header_t *, unsigned int);
+void nvlist_print(const nvlist_t *, unsigned int);
+char *nvstring_get(nv_string_t *);
+int nvlist_find(const nvlist_t *, const char *, data_type_t,
+ int *, void *, int *);
+nvp_header_t *nvlist_next_nvpair(nvlist_t *, nvp_header_t *);
+
+int nvlist_add_boolean_value(nvlist_t *, const char *, bool);
+int nvlist_add_byte(nvlist_t *, const char *, uint8_t);
+int nvlist_add_int8(nvlist_t *, const char *, int8_t);
+int nvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+int nvlist_add_int16(nvlist_t *, const char *, int16_t);
+int nvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+int nvlist_add_int32(nvlist_t *, const char *, int32_t);
+int nvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+int nvlist_add_int64(nvlist_t *, const char *, int64_t);
+int nvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+int nvlist_add_string(nvlist_t *, const char *, const char *);
+int nvlist_add_boolean_array(nvlist_t *, const char *, bool *, uint32_t);
+int nvlist_add_byte_array(nvlist_t *, const char *, uint8_t *, uint32_t);
+int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint32_t);
+int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint32_t);
+int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint32_t);
+int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint32_t);
+int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint32_t);
+int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint32_t);
+int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint32_t);
+int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint32_t);
+int nvlist_add_string_array(nvlist_t *, const char *, char * const *, uint32_t);
+int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint32_t);
+
+#endif /* !_NVLIST_H_ */
Index: usr.sbin/makefs/zfs/nvlist.c
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/nvlist.c
@@ -0,0 +1,1699 @@
+/*-
+ * Copyright 2020 Toomas Soome <tsoome@me.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/stdint.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "zfs/nvlist.h"
+
+enum xdr_op {
+ XDR_OP_ENCODE = 1,
+ XDR_OP_DECODE = 2
+};
+
+typedef struct xdr {
+ enum xdr_op xdr_op;
+ int (*xdr_getint)(struct xdr *, int *);
+ int (*xdr_putint)(struct xdr *, int);
+ int (*xdr_getuint)(struct xdr *, unsigned *);
+ int (*xdr_putuint)(struct xdr *, unsigned);
+ const uint8_t *xdr_buf;
+ uint8_t *xdr_idx;
+ size_t xdr_buf_size;
+} xdr_t;
+
+static int nvlist_xdr_nvlist(xdr_t *, nvlist_t *);
+static bool nvlist_size_xdr(xdr_t *, size_t *);
+static bool nvlist_size_native(xdr_t *, size_t *);
+static bool xdr_int(xdr_t *, int *);
+static bool xdr_u_int(xdr_t *, unsigned *);
+
+typedef bool (*xdrproc_t)(xdr_t *, void *);
+
+/* Basic primitives for XDR translation operations, getint and putint. */
+static int
+_getint(struct xdr *xdr, int *ip)
+{
+ *ip = be32dec(xdr->xdr_idx);
+ return (sizeof(int));
+}
+
+static int
+_putint(struct xdr *xdr, int i)
+{
+ int *ip = (int *)xdr->xdr_idx;
+
+ *ip = htobe32(i);
+ return (sizeof(int));
+}
+
+static int
+_getuint(struct xdr *xdr, unsigned *ip)
+{
+ *ip = be32dec(xdr->xdr_idx);
+ return (sizeof(unsigned));
+}
+
+static int
+_putuint(struct xdr *xdr, unsigned i)
+{
+ unsigned *up = (unsigned *)xdr->xdr_idx;
+
+ *up = htobe32(i);
+ return (sizeof(int));
+}
+
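+/*
+ * The _mem variants below read and write host-endian values and are used
+ * when operating on the native in-memory representation rather than the
+ * big-endian XDR stream.
+ */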
+static int
+_getint_mem(struct xdr *xdr, int *ip)
+{
+ *ip = *(int *)xdr->xdr_idx;
+ return (sizeof(int));
+}
+
+static int
+_putint_mem(struct xdr *xdr, int i)
+{
+ int *ip = (int *)xdr->xdr_idx;
+
+ *ip = i;
+ return (sizeof(int));
+}
+
+static int
+_getuint_mem(struct xdr *xdr, unsigned *ip)
+{
+ *ip = *(unsigned *)xdr->xdr_idx;
+ return (sizeof(unsigned));
+}
+
+static int
+_putuint_mem(struct xdr *xdr, unsigned i)
+{
+ unsigned *up = (unsigned *)xdr->xdr_idx;
+
+ *up = i;
+ return (sizeof(int));
+}
+
+/*
+ * XDR data translations.
+ */
+static bool
+xdr_short(xdr_t *xdr, short *ip)
+{
+ int i;
+ bool rv;
+
+ i = *ip;
+ if ((rv = xdr_int(xdr, &i))) {
+ if (xdr->xdr_op == XDR_OP_DECODE)
+ *ip = i;
+ }
+ return (rv);
+}
+
+static bool
+xdr_u_short(xdr_t *xdr, unsigned short *ip)
+{
+ unsigned u;
+ bool rv;
+
+ u = *ip;
+ if ((rv = xdr_u_int(xdr, &u))) {
+ if (xdr->xdr_op == XDR_OP_DECODE)
+ *ip = u;
+ }
+ return (rv);
+}
+
+/*
+ * Translate the value at xdr->xdr_idx and advance it by the size of an int.
+ */
+static bool
+xdr_int(xdr_t *xdr, int *ip)
+{
+ bool rv = false;
+ int *i = (int *)xdr->xdr_idx;
+
+ if (xdr->xdr_idx + sizeof(int) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *ip, store to buf */
+ xdr->xdr_idx += xdr->xdr_putint(xdr, *ip);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *ip */
+ xdr->xdr_idx += xdr->xdr_getint(xdr, i);
+ *ip = *i;
+ rv = true;
+ break;
+ }
+ return (rv);
+}
+
+/*
+ * Translate the value at xdr->xdr_idx and advance it by the size of an
+ * unsigned int.
+ */
+static bool
+xdr_u_int(xdr_t *xdr, unsigned *ip)
+{
+ bool rv = false;
+ unsigned *u = (unsigned *)xdr->xdr_idx;
+
+ if (xdr->xdr_idx + sizeof(unsigned) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *ip, store to buf */
+ xdr->xdr_idx += xdr->xdr_putuint(xdr, *ip);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *ip */
+ xdr->xdr_idx += xdr->xdr_getuint(xdr, u);
+ *ip = *u;
+ rv = true;
+ break;
+ }
+ return (rv);
+}
+
+static bool
+xdr_int64(xdr_t *xdr, int64_t *lp)
+{
+ bool rv = false;
+
+ if (xdr->xdr_idx + sizeof(int64_t) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *lp, store to buf */
+ if (xdr->xdr_putint == _putint)
+ *(int64_t *)xdr->xdr_idx = htobe64(*lp);
+ else
+ *(int64_t *)xdr->xdr_idx = *lp;
+ xdr->xdr_idx += sizeof(int64_t);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *lp */
+ if (xdr->xdr_getint == _getint)
+ *lp = be64toh(*(int64_t *)xdr->xdr_idx);
+ else
+ *lp = *(int64_t *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(int64_t);
+ rv = true;
+ }
+ return (rv);
+}
+
+static bool
+xdr_uint64(xdr_t *xdr, uint64_t *lp)
+{
+ bool rv = false;
+
+ if (xdr->xdr_idx + sizeof(uint64_t) > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (rv);
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ /* Encode value *lp, store to buf */
+ if (xdr->xdr_putint == _putint)
+ *(uint64_t *)xdr->xdr_idx = htobe64(*lp);
+ else
+ *(uint64_t *)xdr->xdr_idx = *lp;
+ xdr->xdr_idx += sizeof(uint64_t);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ /* Decode buf, return value to *lp */
+ if (xdr->xdr_getuint == _getuint)
+ *lp = be64toh(*(uint64_t *)xdr->xdr_idx);
+ else
+ *lp = *(uint64_t *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(uint64_t);
+ rv = true;
+ }
+ return (rv);
+}
+
+static bool
+xdr_char(xdr_t *xdr, char *cp)
+{
+ int i;
+ bool rv = false;
+
+ i = *cp;
+ if ((rv = xdr_int(xdr, &i))) {
+ if (xdr->xdr_op == XDR_OP_DECODE)
+ *cp = i;
+ }
+ return (rv);
+}
+
+static bool
+xdr_string(xdr_t *xdr, nv_string_t *s)
+{
+ int size = 0;
+ bool rv = false;
+
+ switch (xdr->xdr_op) {
+ case XDR_OP_ENCODE:
+ size = s->nv_size;
+ if (xdr->xdr_idx + sizeof(unsigned) + NV_ALIGN4(size) >
+ xdr->xdr_buf + xdr->xdr_buf_size)
+ break;
+ xdr->xdr_idx += xdr->xdr_putuint(xdr, s->nv_size);
+ xdr->xdr_idx += NV_ALIGN4(size);
+ rv = true;
+ break;
+
+ case XDR_OP_DECODE:
+ if (xdr->xdr_idx + sizeof(unsigned) >
+ xdr->xdr_buf + xdr->xdr_buf_size)
+ break;
+ size = xdr->xdr_getuint(xdr, &s->nv_size);
+ size = NV_ALIGN4(size + s->nv_size);
+ if (xdr->xdr_idx + size > xdr->xdr_buf + xdr->xdr_buf_size)
+ break;
+ xdr->xdr_idx += size;
+ rv = true;
+ break;
+ }
+ return (rv);
+}
+
+static bool
+xdr_array(xdr_t *xdr, const unsigned nelem, const xdrproc_t elproc)
+{
+ bool rv = true;
+ unsigned c = nelem;
+
+ if (!xdr_u_int(xdr, &c))
+ return (false);
+
+ for (unsigned i = 0; i < nelem; i++) {
+ if (!elproc(xdr, xdr->xdr_idx))
+ return (false);
+ }
+ return (rv);
+}
+
+/*
+ * nvlist management functions.
+ */
+void
+nvlist_destroy(nvlist_t *nvl)
+{
+ if (nvl != NULL) {
+ /* Free data if it was allocated by us. */
+ if (nvl->nv_asize > 0)
+ free(nvl->nv_data);
+ }
+ free(nvl);
+}
+
+char *
+nvstring_get(nv_string_t *nvs)
+{
+ char *s;
+
+ s = malloc(nvs->nv_size + 1);
+ if (s != NULL) {
+ bcopy(nvs->nv_data, s, nvs->nv_size);
+ s[nvs->nv_size] = '\0';
+ }
+ return (s);
+}
+
+/*
+ * Create an empty nvlist.
+ * The nvlist is terminated by two zero words (8 bytes).
+ */
+nvlist_t *
+nvlist_create(int flag)
+{
+ nvlist_t *nvl;
+ nvs_data_t *nvs;
+
+ nvl = calloc(1, sizeof(*nvl));
+ if (nvl == NULL)
+ return (nvl);
+
+ nvl->nv_header.nvh_encoding = NV_ENCODE_XDR;
+ nvl->nv_header.nvh_endian = _BYTE_ORDER == _LITTLE_ENDIAN;
+
+ nvl->nv_asize = nvl->nv_size = sizeof(*nvs);
+ nvs = calloc(1, nvl->nv_asize);
+ if (nvs == NULL) {
+ free(nvl);
+ return (NULL);
+ }
+ /* data in nvlist is byte stream */
+ nvl->nv_data = (uint8_t *)nvs;
+
+ nvs->nvl_version = NV_VERSION;
+ nvs->nvl_nvflag = flag;
+ return (nvl);
+}
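+
+/*
+ * Typical usage sketch (illustrative only):
+ *
+ * nvlist_t *nvl = nvlist_create(NV_UNIQUE_NAME);
+ * if (nvl != NULL) {
+ * nvlist_add_uint64(nvl, "ashift", 12);
+ * nvlist_export(nvl); (encodes nvl->nv_data as XDR in place)
+ * nvlist_destroy(nvl);
+ * }
+ */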
+
+static bool
+nvlist_xdr_nvp(xdr_t *xdr, nvlist_t *nvl)
+{
+ nv_string_t *nv_string;
+ nv_pair_data_t *nvp_data;
+ nvlist_t nvlist;
+ unsigned type, nelem;
+ xdr_t nv_xdr;
+
+ nv_string = (nv_string_t *)xdr->xdr_idx;
+ if (!xdr_string(xdr, nv_string)) {
+ return (false);
+ }
+ nvp_data = (nv_pair_data_t *)xdr->xdr_idx;
+
+ type = nvp_data->nv_type;
+ nelem = nvp_data->nv_nelem;
+ if (!xdr_u_int(xdr, &type) || !xdr_u_int(xdr, &nelem))
+ return (false);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_NVLIST_ARRAY:
+ bzero(&nvlist, sizeof(nvlist));
+ nvlist.nv_data = xdr->xdr_idx;
+ nvlist.nv_idx = nvlist.nv_data;
+
+ /* Set up xdr for this nvlist. */
+ nv_xdr = *xdr;
+ nv_xdr.xdr_buf = nvlist.nv_data;
+ nv_xdr.xdr_idx = nvlist.nv_data;
+ nv_xdr.xdr_buf_size =
+ nvl->nv_data + nvl->nv_size - nvlist.nv_data;
+
+ for (unsigned i = 0; i < nelem; i++) {
+ if (xdr->xdr_op == XDR_OP_ENCODE) {
+ if (!nvlist_size_native(&nv_xdr,
+ &nvlist.nv_size))
+ return (false);
+ } else {
+ if (!nvlist_size_xdr(&nv_xdr,
+ &nvlist.nv_size))
+ return (false);
+ }
+ if (nvlist_xdr_nvlist(xdr, &nvlist) != 0)
+ return (false);
+
+ nvlist.nv_data = nv_xdr.xdr_idx;
+ nvlist.nv_idx = nv_xdr.xdr_idx;
+
+ nv_xdr.xdr_buf = nv_xdr.xdr_idx;
+ nv_xdr.xdr_buf_size =
+ nvl->nv_data + nvl->nv_size - nvlist.nv_data;
+ }
+ break;
+
+ case DATA_TYPE_BOOLEAN:
+ /* BOOLEAN does not take value space */
+ break;
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ if (!xdr_char(xdr, (char *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_INT16:
+ if (!xdr_short(xdr, (short *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_UINT16:
+ if (!xdr_u_short(xdr, (unsigned short *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ if (!xdr_int(xdr, (int *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_UINT32:
+ if (!xdr_u_int(xdr, (unsigned *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_INT64:
+ if (!xdr_int64(xdr, (int64_t *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_UINT64:
+ if (!xdr_uint64(xdr, (uint64_t *)&nvp_data->nv_data[0]))
+ return (false);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_STRING:
+ nv_string = (nv_string_t *)&nvp_data->nv_data[0];
+ if (!xdr_string(xdr, nv_string))
+ return (false);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY:
+ nv_string = (nv_string_t *)&nvp_data->nv_data[0];
+ for (unsigned i = 0; i < nelem; i++) {
+ if (!xdr_string(xdr, nv_string))
+ return (false);
+ nv_string = (nv_string_t *)xdr->xdr_idx;
+ }
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ if (!xdr_array(xdr, nelem, (xdrproc_t)xdr_u_int))
+ return (false);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ if (!xdr_array(xdr, nelem, (xdrproc_t)xdr_uint64))
+ return (false);
+ break;
+ }
+ return (true);
+}
+
+static int
+nvlist_xdr_nvlist(xdr_t *xdr, nvlist_t *nvl)
+{
+ nvp_header_t *nvph;
+ nvs_data_t *nvs;
+ unsigned encoded_size, decoded_size;
+ int rv;
+
+ nvs = (nvs_data_t *)xdr->xdr_idx;
+ nvph = &nvs->nvl_pair;
+
+ if (!xdr_u_int(xdr, &nvs->nvl_version))
+ return (EINVAL);
+ if (!xdr_u_int(xdr, &nvs->nvl_nvflag))
+ return (EINVAL);
+
+ encoded_size = nvph->encoded_size;
+ decoded_size = nvph->decoded_size;
+
+ if (xdr->xdr_op == XDR_OP_ENCODE) {
+ if (!xdr_u_int(xdr, &nvph->encoded_size))
+ return (EINVAL);
+ if (!xdr_u_int(xdr, &nvph->decoded_size))
+ return (EINVAL);
+ } else {
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+ }
+
+ rv = 0;
+ while (encoded_size && decoded_size) {
+ if (!nvlist_xdr_nvp(xdr, nvl))
+ return (EINVAL);
+
+ nvph = (nvp_header_t *)(xdr->xdr_idx);
+ encoded_size = nvph->encoded_size;
+ decoded_size = nvph->decoded_size;
+ if (xdr->xdr_op == XDR_OP_ENCODE) {
+ if (!xdr_u_int(xdr, &nvph->encoded_size))
+ return (EINVAL);
+ if (!xdr_u_int(xdr, &nvph->decoded_size))
+ return (EINVAL);
+ } else {
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+ }
+ }
+ return (rv);
+}
+
+/*
+ * Calculate nvlist size, translating encoded_size and decoded_size.
+ */
+static bool
+nvlist_size_xdr(xdr_t *xdr, size_t *size)
+{
+ uint8_t *pair;
+ unsigned encoded_size, decoded_size;
+
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+
+ pair = xdr->xdr_idx;
+ if (!xdr_u_int(xdr, &encoded_size) || !xdr_u_int(xdr, &decoded_size))
+ return (false);
+
+ while (encoded_size && decoded_size) {
+ xdr->xdr_idx = pair + encoded_size;
+ pair = xdr->xdr_idx;
+ if (!xdr_u_int(xdr, &encoded_size) ||
+ !xdr_u_int(xdr, &decoded_size))
+ return (false);
+ }
+ *size = xdr->xdr_idx - xdr->xdr_buf;
+
+ return (true);
+}
+
+nvp_header_t *
+nvlist_next_nvpair(nvlist_t *nvl, nvp_header_t *nvh)
+{
+ uint8_t *pair;
+ unsigned encoded_size, decoded_size;
+ xdr_t xdr;
+
+ if (nvl == NULL)
+ return (NULL);
+
+ xdr.xdr_buf = nvl->nv_data;
+ xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_size;
+
+ xdr.xdr_idx += 2 * sizeof(unsigned);
+
+ /* Skip to the current pair. */
+ if (nvh != NULL) {
+ xdr.xdr_idx = (uint8_t *)nvh;
+ }
+
+ pair = xdr.xdr_idx;
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ encoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ decoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ while (encoded_size && decoded_size) {
+ if (nvh == NULL)
+ return ((nvp_header_t *)pair);
+
+ xdr.xdr_idx = pair + encoded_size;
+ nvh = (nvp_header_t *)xdr.xdr_idx;
+
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ encoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+ decoded_size = *(unsigned *)xdr.xdr_idx;
+ xdr.xdr_idx += sizeof(unsigned);
+ if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+ return (NULL);
+
+ if (encoded_size != 0 && decoded_size != 0) {
+ return (nvh);
+ }
+ }
+ return (NULL);
+}
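+
+/*
+ * Illustrative iteration over all pairs, as done by nvpair_find() below:
+ *
+ * nvp_header_t *nvh = NULL;
+ * while ((nvh = nvlist_next_nvpair(nvl, nvh)) != NULL)
+ * ... examine nvh ...
+ */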
+
+/*
+ * Calculate nvlist size by walking the in-memory data.
+ */
+static bool
+nvlist_size_native(xdr_t *xdr, size_t *size)
+{
+ uint8_t *pair;
+ unsigned encoded_size, decoded_size;
+
+ xdr->xdr_idx += 2 * sizeof(unsigned);
+
+ pair = xdr->xdr_idx;
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+
+ encoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+ decoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ while (encoded_size && decoded_size) {
+ xdr->xdr_idx = pair + encoded_size;
+ pair = xdr->xdr_idx;
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+ encoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+ return (false);
+ decoded_size = *(unsigned *)xdr->xdr_idx;
+ xdr->xdr_idx += sizeof(unsigned);
+ }
+ *size = xdr->xdr_idx - xdr->xdr_buf;
+
+ return (true);
+}
+
+/*
+ * Export nvlist to byte stream format.
+ */
+int
+nvlist_export(nvlist_t *nvl)
+{
+ int rv;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_ENCODE,
+ .xdr_putint = _putint,
+ .xdr_putuint = _putuint,
+ .xdr_buf = nvl->nv_data,
+ .xdr_idx = nvl->nv_data,
+ .xdr_buf_size = nvl->nv_size
+ };
+
+ if (nvl->nv_header.nvh_encoding != NV_ENCODE_XDR)
+ return (ENOTSUP);
+
+ nvl->nv_idx = nvl->nv_data;
+ rv = nvlist_xdr_nvlist(&xdr, nvl);
+
+ return (rv);
+}
+
+/*
+ * Import an nvlist from a byte stream.
+ * Determine the stream size, allocate a private copy,
+ * then translate the data.
+ */
+nvlist_t *
+nvlist_import(const char *stream, size_t size)
+{
+ nvlist_t *nvl;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_DECODE,
+ .xdr_getint = _getint,
+ .xdr_getuint = _getuint
+ };
+
+ /* Check the nvlist head. */
+ if (stream[0] != NV_ENCODE_XDR ||
+ (stream[1] != '\0' && stream[1] != '\1') ||
+ stream[2] != '\0' || stream[3] != '\0' ||
+ be32toh(*(uint32_t *)(stream + 4)) != NV_VERSION ||
+ be32toh(*(uint32_t *)(stream + 8)) != NV_UNIQUE_NAME)
+ return (NULL);
+
+ nvl = malloc(sizeof(*nvl));
+ if (nvl == NULL)
+ return (nvl);
+
+ nvl->nv_header.nvh_encoding = stream[0];
+ nvl->nv_header.nvh_endian = stream[1];
+ nvl->nv_header.nvh_reserved1 = stream[2];
+ nvl->nv_header.nvh_reserved2 = stream[3];
+
+ xdr.xdr_buf = xdr.xdr_idx = (uint8_t *)stream + 4;
+ xdr.xdr_buf_size = size - 4;
+
+ if (!nvlist_size_xdr(&xdr, &nvl->nv_asize)) {
+ free(nvl);
+ return (NULL);
+ }
+ nvl->nv_size = nvl->nv_asize;
+ nvl->nv_data = malloc(nvl->nv_asize);
+ if (nvl->nv_data == NULL) {
+ free(nvl);
+ return (NULL);
+ }
+ nvl->nv_idx = nvl->nv_data;
+ bcopy(stream + 4, nvl->nv_data, nvl->nv_asize);
+
+ xdr.xdr_buf = xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_asize;
+
+ if (nvlist_xdr_nvlist(&xdr, nvl) != 0) {
+ free(nvl->nv_data);
+ free(nvl);
+ nvl = NULL;
+ }
+
+ return (nvl);
+}
+
+/*
+ * Remove a pair from this nvlist.
+ */
+int
+nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ uint8_t *head, *tail;
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+ nv_string_t *nvp_name;
+ nv_pair_data_t *nvp_data;
+ size_t size;
+ xdr_t xdr;
+
+ if (nvl == NULL || nvl->nv_data == NULL || name == NULL)
+ return (EINVAL);
+
+ /* Make sure the nvlist size is set correctly. */
+ xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = nvl->nv_size;
+ if (!nvlist_size_native(&xdr, &nvl->nv_size))
+ return (EINVAL);
+
+ data = (nvs_data_t *)nvl->nv_data;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+ head = (uint8_t *)nvp;
+
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvp_name = (nv_string_t *)(nvp + 1);
+
+ nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
+ NV_ALIGN4(nvp_name->nv_size));
+
+ if (strlen(name) == nvp_name->nv_size &&
+ memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 &&
+ (nvp_data->nv_type == type || type == DATA_TYPE_UNKNOWN)) {
+ /*
+ * Set tail to point to the next nvpair; size is
+ * the length of the tail.
+ */
+ tail = head + nvp->encoded_size;
+ size = nvl->nv_size - (tail - nvl->nv_data);
+
+ /* adjust the size of the nvlist. */
+ nvl->nv_size -= nvp->encoded_size;
+ bcopy(tail, head, size);
+ return (0);
+ }
+ /* Not our pair, skip to next. */
+ head = head + nvp->encoded_size;
+ nvp = (nvp_header_t *)head;
+ }
+ return (ENOENT);
+}
+
+static int
+clone_nvlist(const nvlist_t *nvl, const uint8_t *ptr, unsigned size,
+ nvlist_t **nvlist)
+{
+ nvlist_t *nv;
+
+ nv = calloc(1, sizeof(*nv));
+ if (nv == NULL)
+ return (ENOMEM);
+
+ nv->nv_header = nvl->nv_header;
+ nv->nv_asize = size;
+ nv->nv_size = size;
+ nv->nv_data = malloc(nv->nv_asize);
+ if (nv->nv_data == NULL) {
+ free(nv);
+ return (ENOMEM);
+ }
+
+ bcopy(ptr, nv->nv_data, nv->nv_asize);
+ *nvlist = nv;
+ return (0);
+}
+
+/*
+ * Return the next nvlist in an nvlist array.
+ */
+static uint8_t *
+nvlist_next(const uint8_t *ptr)
+{
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+
+ data = (nvs_data_t *)ptr;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
+ }
+ return ((uint8_t *)nvp + sizeof(*nvp));
+}
+
+/*
+ * Note: any returned nvlist or nvlist array must be freed by the caller.
+ */
+int
+nvlist_find(const nvlist_t *nvl, const char *name, data_type_t type,
+ int *elementsp, void *valuep, int *sizep)
+{
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+ nv_string_t *nvp_name;
+ nv_pair_data_t *nvp_data;
+ nvlist_t **nvlist, *nv;
+ uint8_t *ptr;
+ int rv;
+
+ if (nvl == NULL || nvl->nv_data == NULL || name == NULL)
+ return (EINVAL);
+
+ data = (nvs_data_t *)nvl->nv_data;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvp_name = (nv_string_t *)((uint8_t *)nvp + sizeof(*nvp));
+ if (nvl->nv_data + nvl->nv_size <
+ nvp_name->nv_data + nvp_name->nv_size)
+ return (EIO);
+
+ nvp_data = (nv_pair_data_t *)
+ NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] +
+ nvp_name->nv_size);
+
+ if (strlen(name) == nvp_name->nv_size &&
+ memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 &&
+ (nvp_data->nv_type == type || type == DATA_TYPE_UNKNOWN)) {
+ if (elementsp != NULL)
+ *elementsp = nvp_data->nv_nelem;
+ switch (nvp_data->nv_type) {
+ case DATA_TYPE_UINT64:
+ bcopy(nvp_data->nv_data, valuep,
+ sizeof(uint64_t));
+ return (0);
+ case DATA_TYPE_STRING:
+ nvp_name = (nv_string_t *)nvp_data->nv_data;
+ if (sizep != NULL) {
+ *sizep = nvp_name->nv_size;
+ }
+ *(const uint8_t **)valuep =
+ &nvp_name->nv_data[0];
+ return (0);
+ case DATA_TYPE_NVLIST:
+ ptr = &nvp_data->nv_data[0];
+ rv = clone_nvlist(nvl, ptr,
+ nvlist_next(ptr) - ptr, &nv);
+ if (rv == 0) {
+ *(nvlist_t **)valuep = nv;
+ }
+ return (rv);
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ nvlist = calloc(nvp_data->nv_nelem,
+ sizeof(nvlist_t *));
+ if (nvlist == NULL)
+ return (ENOMEM);
+ ptr = &nvp_data->nv_data[0];
+ rv = 0;
+ for (unsigned i = 0; i < nvp_data->nv_nelem;
+ i++) {
+ rv = clone_nvlist(nvl, ptr,
+ nvlist_next(ptr) - ptr, &nvlist[i]);
+ if (rv != 0)
+ goto error;
+ ptr = nvlist_next(ptr);
+ }
+ *(nvlist_t ***)valuep = nvlist;
+ return (rv);
+ }
+ return (EIO);
+ }
+ /* Not our pair, skip to next. */
+ nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
+ if (nvl->nv_data + nvl->nv_size < (uint8_t *)nvp)
+ return (EIO);
+ }
+ return (ENOENT);
+error:
+ for (unsigned i = 0; i < nvp_data->nv_nelem; i++) {
+ free(nvlist[i]->nv_data);
+ free(nvlist[i]);
+ }
+ free(nvlist);
+ return (rv);
+}
+
+static int
+get_value_size(data_type_t type, const void *data, uint32_t nelem)
+{
+ uint64_t value_sz = 0;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ /* Our smallest data unit is 32-bit */
+ value_sz = sizeof(uint32_t);
+ break;
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_INT64:
+ value_sz = sizeof(int64_t);
+ break;
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ value_sz = 0;
+ else
+ value_sz = strlen(data) + 1;
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = nelem * sizeof(uint8_t);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(int64_t);
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint64_t);
+
+ if (data != NULL) {
+ char *const *strs = data;
+ uint32_t i;
+
+ for (i = 0; i < nelem; i++) {
+ if (strs[i] == NULL)
+ return (-1);
+ value_sz += strlen(strs[i]) + 1;
+ }
+ }
+ break;
+ case DATA_TYPE_NVLIST:
+ /*
+ * The decoded size of nvlist is constant.
+ */
+ value_sz = NV_ALIGN(6 * 4); /* sizeof nvlist_t */
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof(uint64_t) +
+ (uint64_t)nelem * NV_ALIGN(6 * 4); /* sizeof nvlist_t */
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
+
+static int
+get_nvp_data_size(data_type_t type, const void *data, uint32_t nelem)
+{
+ uint64_t value_sz = 0;
+ xdr_t xdr;
+ size_t size;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ /* Our smallest data unit is 32-bit */
+ value_sz = sizeof(uint32_t);
+ break;
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING:
+ value_sz = 4 + NV_ALIGN4(strlen(data));
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = NV_ALIGN4(nelem);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = 4 + (uint64_t)nelem * sizeof(uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = 4 + (uint64_t)nelem * sizeof(uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ if (data != NULL) {
+ char *const *strs = data;
+ uint32_t i;
+
+ for (i = 0; i < nelem; i++) {
+ value_sz += 4 + NV_ALIGN4(strlen(strs[i]));
+ }
+ }
+ break;
+ case DATA_TYPE_NVLIST:
+ xdr.xdr_idx = ((nvlist_t *)data)->nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = ((nvlist_t *)data)->nv_size;
+
+ if (!nvlist_size_native(&xdr, &size))
+ return (-1);
+
+ value_sz = size;
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = 0;
+ for (uint32_t i = 0; i < nelem; i++) {
+ xdr.xdr_idx = ((nvlist_t **)data)[i]->nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = ((nvlist_t **)data)[i]->nv_size;
+
+ if (!nvlist_size_native(&xdr, &size))
+ return (-1);
+ value_sz += size;
+ }
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
+
+#define NVPE_SIZE(name_len, data_len) \
+ (4 + 4 + 4 + NV_ALIGN4(name_len) + 4 + 4 + data_len)
+#define NVP_SIZE(name_len, data_len) \
+ (NV_ALIGN((4 * 4) + (name_len)) + NV_ALIGN(data_len))
+
+static int
+nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint32_t nelem, const void *data)
+{
+ nvs_data_t *nvs;
+ nvp_header_t head, *hp;
+ uint8_t *ptr;
+ size_t namelen;
+ int decoded_size, encoded_size;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_ENCODE,
+ .xdr_putint = _putint_mem,
+ .xdr_putuint = _putuint_mem,
+ .xdr_buf = nvl->nv_data,
+ .xdr_idx = nvl->nv_data,
+ .xdr_buf_size = nvl->nv_size
+ };
+
+ nvs = (nvs_data_t *)nvl->nv_data;
+ if (nvs->nvl_nvflag & NV_UNIQUE_NAME)
+ (void) nvlist_remove(nvl, name, type);
+
+ xdr.xdr_buf = nvl->nv_data;
+ xdr.xdr_idx = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_size;
+ if (!nvlist_size_native(&xdr, &nvl->nv_size))
+ return (EINVAL);
+
+ namelen = strlen(name);
+ if ((decoded_size = get_value_size(type, data, nelem)) < 0)
+ return (EINVAL);
+ if ((encoded_size = get_nvp_data_size(type, data, nelem)) < 0)
+ return (EINVAL);
+
+ /*
+ * The encoded size is calculated as:
+ * encode_size (4) + decode_size (4) +
+ * name string size (4 + NV_ALIGN4(namelen)) +
+ * data type (4) + nelem size (4) + datalen
+ *
+ * The decoded size is calculated as:
+ * Note: the name length here includes the terminating NUL.
+ * NV_ALIGN(sizeof(nvpair_t) (4 * 4) + namelen + 1) +
+ * NV_ALIGN(data_len)
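+ *
+ * Worked example (illustrative): for the name "ashift" (namelen 6) and
+ * a DATA_TYPE_UINT64 value, NVPE_SIZE(6, 8) = 4+4+4+8+4+4+8 = 36 and
+ * NVP_SIZE(7, 8) = NV_ALIGN(16 + 7) + NV_ALIGN(8) = 24 + 8 = 32.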
+ */
+
+ head.encoded_size = NVPE_SIZE(namelen, encoded_size);
+ head.decoded_size = NVP_SIZE(namelen + 1, decoded_size);
+
+ if (nvl->nv_asize - nvl->nv_size < head.encoded_size + 8) {
+ ptr = realloc(nvl->nv_data, nvl->nv_asize + head.encoded_size);
+ if (ptr == NULL)
+ return (ENOMEM);
+ nvl->nv_data = ptr;
+ nvl->nv_asize += head.encoded_size;
+ }
+ nvl->nv_idx = nvl->nv_data + nvl->nv_size - sizeof(*hp);
+ bzero(nvl->nv_idx, head.encoded_size + 8);
+ hp = (nvp_header_t *)nvl->nv_idx;
+ *hp = head;
+ nvl->nv_idx += sizeof(*hp);
+
+ xdr.xdr_buf = nvl->nv_data;
+ xdr.xdr_buf_size = nvl->nv_asize;
+ xdr.xdr_idx = nvl->nv_idx;
+
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, namelen);
+ strlcpy((char *)xdr.xdr_idx, name, namelen + 1);
+ xdr.xdr_idx += NV_ALIGN4(namelen);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, type);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, nelem);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size);
+ bcopy(data, xdr.xdr_idx, nelem);
+ xdr.xdr_idx += NV_ALIGN4(encoded_size);
+ break;
+
+ case DATA_TYPE_STRING:
+ encoded_size = strlen(data);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size);
+ strlcpy((char *)xdr.xdr_idx, data, encoded_size + 1);
+ xdr.xdr_idx += NV_ALIGN4(encoded_size);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY:
+ for (uint32_t i = 0; i < nelem; i++) {
+ encoded_size = strlen(((char **)data)[i]);
+ xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size);
+ strlcpy((char *)xdr.xdr_idx, ((char **)data)[i],
+ encoded_size + 1);
+ xdr.xdr_idx += NV_ALIGN4(encoded_size);
+ }
+ break;
+
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ xdr_char(&xdr, (char *)data);
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_char);
+ break;
+
+ case DATA_TYPE_INT16:
+ xdr_short(&xdr, (short *)data);
+ break;
+
+ case DATA_TYPE_UINT16:
+ xdr_u_short(&xdr, (unsigned short *)data);
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_short);
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_u_short);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ xdr_int(&xdr, (int *)data);
+ break;
+
+ case DATA_TYPE_UINT32:
+ xdr_u_int(&xdr, (unsigned int *)data);
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_int);
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_u_int);
+ break;
+
+ case DATA_TYPE_INT64:
+ xdr_int64(&xdr, (int64_t *)data);
+ break;
+
+ case DATA_TYPE_UINT64:
+ xdr_uint64(&xdr, (uint64_t *)data);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_int64);
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ xdr_array(&xdr, nelem, (xdrproc_t)xdr_uint64);
+ break;
+
+ case DATA_TYPE_NVLIST:
+ bcopy(((nvlist_t *)data)->nv_data, xdr.xdr_idx, encoded_size);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t size;
+ xdr_t xdr_nv;
+
+ for (uint32_t i = 0; i < nelem; i++) {
+ xdr_nv.xdr_idx = ((nvlist_t **)data)[i]->nv_data;
+ xdr_nv.xdr_buf = xdr_nv.xdr_idx;
+ xdr_nv.xdr_buf_size = ((nvlist_t **)data)[i]->nv_size;
+
+ if (!nvlist_size_native(&xdr_nv, &size))
+ return (EINVAL);
+
+ bcopy(((nvlist_t **)data)[i]->nv_data, xdr.xdr_idx,
+ size);
+ xdr.xdr_idx += size;
+ }
+ break;
+ }
+ default:
+ bcopy(data, xdr.xdr_idx, encoded_size);
+ }
+
+ nvl->nv_size += head.encoded_size;
+
+ return (0);
+}
+
+int
+nvlist_add_boolean_value(nvlist_t *nvl, const char *name, bool value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1,
+ &value));
+}
+
+int
+nvlist_add_byte(nvlist_t *nvl, const char *name, uint8_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &value));
+}
+
+int
+nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &value));
+}
+
+int
+nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &value));
+}
+
+int
+nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &value));
+}
+
+int
+nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &value));
+}
+
+int
+nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &value));
+}
+
+int
+nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &value));
+}
+
+int
+nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &value));
+}
+
+int
+nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &value));
+}
+
+int
+nvlist_add_string(nvlist_t *nvl, const char *name, const char *value)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, value));
+}
+
+int
+nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ bool *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_add_byte_array(nvlist_t *nvl, const char *name, uint8_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char * const *a, uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
+}
+
+int
+nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a,
+ uint32_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
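+/* Order must match the data_type_t enumeration in nvlist.h. */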
+static const char *typenames[] = {
+ "DATA_TYPE_UNKNOWN",
+ "DATA_TYPE_BOOLEAN",
+ "DATA_TYPE_BYTE",
+ "DATA_TYPE_INT16",
+ "DATA_TYPE_UINT16",
+ "DATA_TYPE_INT32",
+ "DATA_TYPE_UINT32",
+ "DATA_TYPE_INT64",
+ "DATA_TYPE_UINT64",
+ "DATA_TYPE_STRING",
+ "DATA_TYPE_BYTE_ARRAY",
+ "DATA_TYPE_INT16_ARRAY",
+ "DATA_TYPE_UINT16_ARRAY",
+ "DATA_TYPE_INT32_ARRAY",
+ "DATA_TYPE_UINT32_ARRAY",
+ "DATA_TYPE_INT64_ARRAY",
+ "DATA_TYPE_UINT64_ARRAY",
+ "DATA_TYPE_STRING_ARRAY",
+ "DATA_TYPE_HRTIME",
+ "DATA_TYPE_NVLIST",
+ "DATA_TYPE_NVLIST_ARRAY",
+ "DATA_TYPE_BOOLEAN_VALUE",
+ "DATA_TYPE_INT8",
+ "DATA_TYPE_UINT8",
+ "DATA_TYPE_BOOLEAN_ARRAY",
+ "DATA_TYPE_INT8_ARRAY",
+ "DATA_TYPE_UINT8_ARRAY"
+};
+
+int
+nvpair_type_from_name(const char *name)
+{
+ unsigned i;
+
+ for (i = 0; i < nitems(typenames); i++) {
+ if (strcmp(name, typenames[i]) == 0)
+ return (i);
+ }
+ return (0);
+}
+
+nvp_header_t *
+nvpair_find(nvlist_t *nv, const char *name)
+{
+ nvp_header_t *nvh;
+
+ nvh = NULL;
+ while ((nvh = nvlist_next_nvpair(nv, nvh)) != NULL) {
+ nv_string_t *nvp_name;
+
+ nvp_name = (nv_string_t *)(nvh + 1);
+ if (nvp_name->nv_size == strlen(name) &&
+ memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0)
+ break;
+ }
+ return (nvh);
+}
+
+void
+nvpair_print(nvp_header_t *nvp, unsigned int indent)
+{
+ nv_string_t *nvp_name;
+ nv_pair_data_t *nvp_data;
+ nvlist_t nvlist;
+ unsigned i, j;
+ xdr_t xdr = {
+ .xdr_op = XDR_OP_DECODE,
+ .xdr_getint = _getint_mem,
+ .xdr_getuint = _getuint_mem,
+ .xdr_buf = (const uint8_t *)nvp,
+ .xdr_idx = NULL,
+ .xdr_buf_size = nvp->encoded_size
+ };
+
+ nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
+ nvp_data = (nv_pair_data_t *)
+ NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] + nvp_name->nv_size);
+
+ for (i = 0; i < indent; i++)
+ printf(" ");
+
+ printf("%s [%d] %.*s", typenames[nvp_data->nv_type],
+ nvp_data->nv_nelem, nvp_name->nv_size, nvp_name->nv_data);
+
+ xdr.xdr_idx = nvp_data->nv_data;
+ switch (nvp_data->nv_type) {
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8: {
+ char c;
+
+ if (xdr_char(&xdr, &c))
+ printf(" = 0x%x\n", c);
+ break;
+ }
+
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16: {
+ unsigned short u;
+
+ if (xdr_u_short(&xdr, &u))
+ printf(" = 0x%hx\n", u);
+ break;
+ }
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32: {
+ unsigned u;
+
+ if (xdr_u_int(&xdr, &u))
+ printf(" = 0x%x\n", u);
+ break;
+ }
+
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64: {
+ uint64_t u;
+
+ if (xdr_uint64(&xdr, &u))
+ printf(" = 0x%jx\n", (uintmax_t)u);
+ break;
+ }
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *u;
+
+ if (xdr_array(&xdr, nvp_data->nv_nelem,
+ (xdrproc_t)xdr_uint64)) {
+ u = (uint64_t *)(nvp_data->nv_data + sizeof(unsigned));
+ for (i = 0; i < nvp_data->nv_nelem; i++)
+ printf(" [%u] = 0x%jx", i, (uintmax_t)u[i]);
+ printf("\n");
+ }
+
+ break;
+ }
+
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_STRING_ARRAY:
+ nvp_name = (nv_string_t *)&nvp_data->nv_data[0];
+ for (i = 0; i < nvp_data->nv_nelem; i++) {
+ printf(" = \"%.*s\"\n", nvp_name->nv_size,
+ nvp_name->nv_data);
+ /* Advance past the 4-byte size word and padded data. */
+ nvp_name = (nv_string_t *)((uint8_t *)nvp_name +
+ sizeof(unsigned) + NV_ALIGN4(nvp_name->nv_size));
+ }
+ break;
+
+ case DATA_TYPE_NVLIST:
+ printf("\n");
+ nvlist.nv_data = &nvp_data->nv_data[0];
+ nvlist_print(&nvlist, indent + 2);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ nvlist.nv_data = &nvp_data->nv_data[0];
+ for (j = 0; j < nvp_data->nv_nelem; j++) {
+ size_t size;
+
+ printf("[%d]\n", j);
+ nvlist_print(&nvlist, indent + 2);
+ if (j != nvp_data->nv_nelem - 1) {
+ for (i = 0; i < indent; i++)
+ printf(" ");
+ printf("%s %.*s",
+ typenames[nvp_data->nv_type],
+ nvp_name->nv_size,
+ nvp_name->nv_data);
+ }
+ xdr.xdr_idx = nvlist.nv_data;
+ xdr.xdr_buf = xdr.xdr_idx;
+ xdr.xdr_buf_size = nvp->encoded_size -
+ (xdr.xdr_idx - (uint8_t *)nvp);
+
+ if (!nvlist_size_native(&xdr, &size))
+ return;
+
+ nvlist.nv_data += size;
+ }
+ break;
+
+ default:
+ printf("\n");
+ }
+}
+
+void
+nvlist_print(const nvlist_t *nvl, unsigned int indent)
+{
+ nvs_data_t *data;
+ nvp_header_t *nvp;
+
+ data = (nvs_data_t *)nvl->nv_data;
+ nvp = &data->nvl_pair; /* first pair in nvlist */
+ while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
+ nvpair_print(nvp, indent);
+ nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
+ }
+ printf("%*s\n", indent + 13, "End of nvlist");
+}
Index: usr.sbin/makefs/zfs/zfsimpl.h
===================================================================
--- /dev/null
+++ usr.sbin/makefs/zfs/zfsimpl.h
@@ -0,0 +1,2119 @@
+/*-
+ * Copyright (c) 2002 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Marshall
+ * Kirk McKusick and McAfee Research, the Security Research Division of
+ * McAfee, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as
+ * part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 by Saso Kiselkov. All rights reserved.
+ */
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/queue.h>
+
+#ifndef _ZFSIMPL_H_
+#define _ZFSIMPL_H_
+
+#define MAXNAMELEN 256
+
+#define _NOTE(s)
+
+/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b) \
+ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
+typedef enum { B_FALSE, B_TRUE } boolean_t;
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42UL /* ECMA-182, reflected form */
+
+/*
+ * Macros for various sorts of alignment and rounding when the alignment
+ * is known to be a power of 2.
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
+#define P2END(x, align) (-(~(x) & -(align)))
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
+#define P2BOUNDARY(off, len, align) (((off) ^ ((off) + (len) - 1)) > (align) - 1)
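+
+/*
+ * For example (illustrative): P2ALIGN(13, 8) == 8, P2PHASE(13, 8) == 5,
+ * P2NPHASE(13, 8) == 3 and P2ROUNDUP(13, 8) == 16.
+ */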
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) \
+ ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
+#define BF64_SET(x, low, len, val) \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
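+
+/*
+ * For example (illustrative): BF64_GET(0xabcd, 4, 8) == 0xbc, and with
+ * shift 9 and bias 1 (as used for block sizes below) BF64_GET_SB()
+ * decodes a stored field value of 0 as (0 + 1) << 9 == 512.
+ */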
+
+/*
+ * Macros to reverse byte order
+ */
+#define BSWAP_8(x) ((x) & 0xff)
+#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_OLDMAXBLOCKSHIFT 17
+#define SPA_MAXBLOCKSHIFT 24
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_OLDMAXBLOCKSIZE (1ULL << SPA_OLDMAXBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
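+
+/* That is, 512 bytes, 128KB (legacy maximum) and 16MB, respectively. */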
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+ uint8_t zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | physical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * B byteorder (endianness)
+ * D dedup
+ * X encryption (on version 30, which is not supported)
+ * E blkptr_t contains embedded data (see below)
+ * lvl level of indirection
+ * type DMU object type
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block, instead they
+ * have a data payload embedded in the blkptr_t itself. See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | payload |
+ * 1 | payload |
+ * 2 | payload |
+ * 3 | payload |
+ * 4 | payload |
+ * 5 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | payload |
+ * 8 | payload |
+ * 9 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | payload |
+ * c | payload |
+ * d | payload |
+ * e | payload |
+ * f | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload contains the embedded data
+ * B (byteorder) byteorder (endianness)
+ * D (dedup) padding (set to zero)
+ * X encryption (set to zero; see above)
+ * E (embedded) set to one
+ * lvl indirection level
+ * type DMU object type
+ * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp compression function of payload
+ * PSIZE size of payload after compression, in bytes
+ * LSIZE logical size of payload, in bytes
+ * note that 25 bits is enough to store the largest
+ * "normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define BPE_GET_ETYPE(bp) \
+ (assert(BP_IS_EMBEDDED(bp)), \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BPE_SET_ETYPE(bp, t) do { \
+ assert(BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_LSIZE(bp) \
+ (assert(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define BPE_SET_LSIZE(bp, x) do { \
+ assert(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_PSIZE(bp) \
+ (assert(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define BPE_SET_PSIZE(bp, x) do { \
+ assert(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+ BP_EMBEDDED_TYPE_DATA,
+ BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define BPE_NUM_WORDS 14
+#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define BPE_IS_PAYLOADWORD(bp, wp) \
+ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
+ SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_EMBEDDED(bp) ? \
+ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) do { \
+ assert(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
+
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
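+
+/*
+ * Taken together, the macros above encode blk_prop of a non-embedded BP as
+ * follows (assuming SPA_LSIZEBITS is 16, per its definition earlier in this
+ * header; bit 61 is unused here):
+ *
+ *	bit 63:		byte order	bit 62:		dedup
+ *	bits 56-60:	level		bits 48-55:	type
+ *	bits 40-47:	checksum	bit 39:		embedded
+ *	bits 32-38:	compression	bits 16-31:	PSIZE
+ *	bits 0-15:	LSIZE
+ */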
+
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ assert(!BP_IS_EMBEDDED(bp)); \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
+
+#define BP_GET_FILL(bp) \
+ ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)
+
+#define BP_SET_FILL(bp, fill) \
+{ \
+ (bp)->blk_fill = fill; \
+}
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_GET_UCSIZE(bp) \
+ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define BP_GET_NDVAS(bp) \
+ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+ ((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
+ (dva)->dva_word[1] == 0ULL)
+#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp))
+#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_phys_birth = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
+#if BYTE_ORDER == _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#define TXG_INITIAL 4
+
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
+
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_eck_t zg_tail;
+} zio_gbh_phys_t;
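+
+/*
+ * For example, assuming SPA_MINBLOCKSIZE is 512 (SPA_MINBLOCKSHIFT of 9,
+ * defined earlier in this header): zio_eck_t is 40 bytes and blkptr_t is
+ * 128 bytes, so SPA_GBH_NBLKPTRS = (512 - 40) / 128 = 3 and
+ * SPA_GBH_FILLER = (512 - 40 - 3 * 128) / 8 = 11, making
+ * sizeof (zio_gbh_phys_t) = 3 * 128 + 11 * 8 + 40 = 512 bytes, exactly one
+ * gang block.
+ */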
+
+#define VDEV_RAIDZ_MAXPARITY 3
+
+#define VDEV_PAD_SIZE (8 << 10)
+/* 2 padding areas (vl_pad1 and vl_be) to skip */
+#define VDEV_SKIP_SIZE (VDEV_PAD_SIZE * 2)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCK_RING (128 << 10)
+
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define MMP_BLOCKS_PER_LABEL 1
+
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
+#define VDEV_UBERBLOCK_SHIFT(vd) \
+ MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
+#define VDEV_UBERBLOCK_COUNT(vd) \
+ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define VDEV_UBERBLOCK_OFFSET(vd, n) \
+ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
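+
+/*
+ * For example, with a top-level vdev ashift of 12,
+ * VDEV_UBERBLOCK_SHIFT(vd) = MIN(MAX(12, 10), 13) = 12, giving 4KB
+ * uberblocks and VDEV_UBERBLOCK_COUNT(vd) = 128K >> 12 = 32 slots in the
+ * ring; with an ashift of 9 the shift clamps to UBERBLOCK_SHIFT (10),
+ * giving 1KB uberblocks and 128 slots.
+ */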
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
+} vdev_phys_t;
+
+typedef enum vbe_vers {
+ /* The bootenv file is stored as ascii text in the envblock */
+ VB_RAW = 0,
+
+ /*
+ * The bootenv file is converted to an nvlist and then packed into the
+ * envblock.
+ */
+ VB_NVLIST = 1
+} vbe_vers_t;
+
+typedef struct vdev_boot_envblock {
+ uint64_t vbe_version;
+ char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
+ sizeof (zio_eck_t)];
+ zio_eck_t vbe_zbt;
+} vdev_boot_envblock_t;
+
+_Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE,
+ "incorrect vdev_boot_envblock size");
+
+typedef struct vdev_label {
+ char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
+ vdev_boot_envblock_t vl_be; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 112K */
+ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define VDD_METASLAB 0x01
+#define VDD_DTL 0x02
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_NOPARITY,
+ ZIO_CHECKSUM_SHA512,
+ ZIO_CHECKSUM_SKEIN,
+ ZIO_CHECKSUM_EDONR,
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
+ ZIO_COMPRESS_LZ4,
+ ZIO_COMPRESS_ZSTD,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+enum zio_zstd_levels {
+ ZIO_ZSTD_LEVEL_INHERIT = 0,
+ ZIO_ZSTD_LEVEL_1,
+#define ZIO_ZSTD_LEVEL_MIN ZIO_ZSTD_LEVEL_1
+ ZIO_ZSTD_LEVEL_2,
+ ZIO_ZSTD_LEVEL_3,
+#define ZIO_ZSTD_LEVEL_DEFAULT ZIO_ZSTD_LEVEL_3
+ ZIO_ZSTD_LEVEL_4,
+ ZIO_ZSTD_LEVEL_5,
+ ZIO_ZSTD_LEVEL_6,
+ ZIO_ZSTD_LEVEL_7,
+ ZIO_ZSTD_LEVEL_8,
+ ZIO_ZSTD_LEVEL_9,
+ ZIO_ZSTD_LEVEL_10,
+ ZIO_ZSTD_LEVEL_11,
+ ZIO_ZSTD_LEVEL_12,
+ ZIO_ZSTD_LEVEL_13,
+ ZIO_ZSTD_LEVEL_14,
+ ZIO_ZSTD_LEVEL_15,
+ ZIO_ZSTD_LEVEL_16,
+ ZIO_ZSTD_LEVEL_17,
+ ZIO_ZSTD_LEVEL_18,
+ ZIO_ZSTD_LEVEL_19,
+#define ZIO_ZSTD_LEVEL_MAX ZIO_ZSTD_LEVEL_19
+ ZIO_ZSTD_LEVEL_RESERVE = 101, /* Leave room for new positive levels */
+ ZIO_ZSTD_LEVEL_FAST, /* Fast levels are negative */
+ ZIO_ZSTD_LEVEL_FAST_1,
+#define ZIO_ZSTD_LEVEL_FAST_DEFAULT ZIO_ZSTD_LEVEL_FAST_1
+ ZIO_ZSTD_LEVEL_FAST_2,
+ ZIO_ZSTD_LEVEL_FAST_3,
+ ZIO_ZSTD_LEVEL_FAST_4,
+ ZIO_ZSTD_LEVEL_FAST_5,
+ ZIO_ZSTD_LEVEL_FAST_6,
+ ZIO_ZSTD_LEVEL_FAST_7,
+ ZIO_ZSTD_LEVEL_FAST_8,
+ ZIO_ZSTD_LEVEL_FAST_9,
+ ZIO_ZSTD_LEVEL_FAST_10,
+ ZIO_ZSTD_LEVEL_FAST_20,
+ ZIO_ZSTD_LEVEL_FAST_30,
+ ZIO_ZSTD_LEVEL_FAST_40,
+ ZIO_ZSTD_LEVEL_FAST_50,
+ ZIO_ZSTD_LEVEL_FAST_60,
+ ZIO_ZSTD_LEVEL_FAST_70,
+ ZIO_ZSTD_LEVEL_FAST_80,
+ ZIO_ZSTD_LEVEL_FAST_90,
+ ZIO_ZSTD_LEVEL_FAST_100,
+ ZIO_ZSTD_LEVEL_FAST_500,
+ ZIO_ZSTD_LEVEL_FAST_1000,
+#define ZIO_ZSTD_LEVEL_FAST_MAX ZIO_ZSTD_LEVEL_FAST_1000
+ ZIO_ZSTD_LEVEL_AUTO = 251, /* Reserved for future use */
+ ZIO_ZSTD_LEVEL_LEVELS
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+/*
+ * On-disk version number.
+ */
+#define SPA_VERSION_1 1ULL
+#define SPA_VERSION_2 2ULL
+#define SPA_VERSION_3 3ULL
+#define SPA_VERSION_4 4ULL
+#define SPA_VERSION_5 5ULL
+#define SPA_VERSION_6 6ULL
+#define SPA_VERSION_7 7ULL
+#define SPA_VERSION_8 8ULL
+#define SPA_VERSION_9 9ULL
+#define SPA_VERSION_10 10ULL
+#define SPA_VERSION_11 11ULL
+#define SPA_VERSION_12 12ULL
+#define SPA_VERSION_13 13ULL
+#define SPA_VERSION_14 14ULL
+#define SPA_VERSION_15 15ULL
+#define SPA_VERSION_16 16ULL
+#define SPA_VERSION_17 17ULL
+#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
+#define SPA_VERSION_20 20ULL
+#define SPA_VERSION_21 21ULL
+#define SPA_VERSION_22 22ULL
+#define SPA_VERSION_23 23ULL
+#define SPA_VERSION_24 24ULL
+#define SPA_VERSION_25 25ULL
+#define SPA_VERSION_26 26ULL
+#define SPA_VERSION_27 27ULL
+#define SPA_VERSION_28 28ULL
+#define SPA_VERSION_5000 5000ULL
+
+/*
+ * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
+ * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
+ * and do the appropriate changes. Also bump the version number in
+ * usr/src/grub/capability.
+ */
+#define SPA_VERSION SPA_VERSION_5000
+#define SPA_VERSION_STRING "5000"
+
+/*
+ * Symbolic names for the changes that caused a SPA_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current SPA_VERSION in your code, be sure
+ * to use spa_version() since it reports the version of the
+ * last synced uberblock. Checking the in-flight version can
+ * be dangerous in some cases.
+ */
+#define SPA_VERSION_INITIAL SPA_VERSION_1
+#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
+#define SPA_VERSION_SPARES SPA_VERSION_3
+#define SPA_VERSION_RAID6 SPA_VERSION_3
+#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3
+#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
+#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
+#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
+#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5
+#define SPA_VERSION_BOOTFS SPA_VERSION_6
+#define SPA_VERSION_SLOGS SPA_VERSION_7
+#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8
+#define SPA_VERSION_FUID SPA_VERSION_9
+#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
+#define SPA_VERSION_REFQUOTA SPA_VERSION_9
+#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
+#define SPA_VERSION_L2CACHE SPA_VERSION_10
+#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11
+#define SPA_VERSION_ORIGIN SPA_VERSION_11
+#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11
+#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
+#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
+#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
+#define SPA_VERSION_USERSPACE SPA_VERSION_15
+#define SPA_VERSION_STMF_PROP SPA_VERSION_16
+#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
+#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
+#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
+#define SPA_VERSION_DEDUP SPA_VERSION_21
+#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
+#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
+#define SPA_VERSION_SA SPA_VERSION_24
+#define SPA_VERSION_SCAN SPA_VERSION_25
+#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
+#define SPA_VERSION_DEADLISTS SPA_VERSION_26
+#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
+#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
+#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28
+#define SPA_VERSION_FEATURES SPA_VERSION_5000
+
+#define SPA_VERSION_IS_SUPPORTED(v) \
+ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
+ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
+
+/*
+ * The following are configuration names used in the nvlist describing a pool's
+ * configuration.
+ */
+#define ZPOOL_CONFIG_VERSION "version"
+#define ZPOOL_CONFIG_POOL_NAME "name"
+#define ZPOOL_CONFIG_POOL_STATE "state"
+#define ZPOOL_CONFIG_POOL_TXG "txg"
+#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
+#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
+#define ZPOOL_CONFIG_TOP_GUID "top_guid"
+#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
+#define ZPOOL_CONFIG_TYPE "type"
+#define ZPOOL_CONFIG_CHILDREN "children"
+#define ZPOOL_CONFIG_ID "id"
+#define ZPOOL_CONFIG_GUID "guid"
+#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
+#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
+#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
+#define ZPOOL_CONFIG_PATH "path"
+#define ZPOOL_CONFIG_DEVID "devid"
+#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
+#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
+#define ZPOOL_CONFIG_ASHIFT "ashift"
+#define ZPOOL_CONFIG_ASIZE "asize"
+#define ZPOOL_CONFIG_DTL "DTL"
+#define ZPOOL_CONFIG_STATS "stats"
+#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
+#define ZPOOL_CONFIG_ERRCOUNT "error_count"
+#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
+#define ZPOOL_CONFIG_SPARES "spares"
+#define ZPOOL_CONFIG_IS_SPARE "is_spare"
+#define ZPOOL_CONFIG_NPARITY "nparity"
+#define ZPOOL_CONFIG_HOSTID "hostid"
+#define ZPOOL_CONFIG_HOSTNAME "hostname"
+#define ZPOOL_CONFIG_IS_LOG "is_log"
+#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
+#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+
+/*
+ * The persistent vdev state is stored as separate values rather than a single
+ * 'vdev_state' entry. This is because a device can be in multiple states, such
+ * as offline and degraded.
+ */
+#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_FAULTED "faulted"
+#define ZPOOL_CONFIG_DEGRADED "degraded"
+#define ZPOOL_CONFIG_REMOVED "removed"
+#define ZPOOL_CONFIG_FRU "fru"
+#define ZPOOL_CONFIG_AUX_STATE "aux_state"
+
+#define VDEV_TYPE_ROOT "root"
+#define VDEV_TYPE_MIRROR "mirror"
+#define VDEV_TYPE_REPLACING "replacing"
+#define VDEV_TYPE_RAIDZ "raidz"
+#define VDEV_TYPE_DISK "disk"
+#define VDEV_TYPE_FILE "file"
+#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
+#define VDEV_TYPE_SPARE "spare"
+#define VDEV_TYPE_LOG "log"
+#define VDEV_TYPE_L2CACHE "l2cache"
+#define VDEV_TYPE_INDIRECT "indirect"
+
+/*
+ * This is needed in userland to report the minimum necessary device size.
+ */
+#define SPA_MINDEVSIZE (64ULL << 20)
+
+/*
+ * The location of the pool configuration repository, shared between kernel and
+ * userland.
+ */
+#define ZPOOL_CACHE "/boot/zfs/zpool.cache"
+
+/*
+ * vdev states are ordered from least to most healthy.
+ * A vdev that's CANT_OPEN or below is considered unusable.
+ */
+typedef enum vdev_state {
+ VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
+ VDEV_STATE_CLOSED, /* Not currently open */
+ VDEV_STATE_OFFLINE, /* Not allowed to open */
+ VDEV_STATE_REMOVED, /* Explicitly removed from system */
+ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
+ VDEV_STATE_FAULTED, /* External request to fault device */
+ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
+ VDEV_STATE_HEALTHY /* Presumed good */
+} vdev_state_t;
+
+/*
+ * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
+ * of the vdev stats structure uses these constants to distinguish why.
+ */
+typedef enum vdev_aux {
+ VDEV_AUX_NONE, /* no error */
+ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
+ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
+ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
+ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
+ VDEV_AUX_TOO_SMALL, /* vdev size is too small */
+ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
+ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
+ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
+ VDEV_AUX_SPARED /* hot spare used in another pool */
+} vdev_aux_t;
+
+/*
+ * pool state. The following states are written to disk as part of the normal
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
+ * software abstractions used at various levels to communicate pool state.
+ */
+typedef enum pool_state {
+ POOL_STATE_ACTIVE = 0, /* In active use */
+ POOL_STATE_EXPORTED, /* Explicitly exported */
+ POOL_STATE_DESTROYED, /* Explicitly destroyed */
+ POOL_STATE_SPARE, /* Reserved for hot spare use */
+ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
+ POOL_STATE_UNAVAIL, /* Internal libzfs state */
+ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
+} pool_state_t;
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+#define UBERBLOCK_SHIFT 10 /* up to 1K */
+
+#define MMP_MAGIC 0xa11cea11 /* all-see-all */
+
+#define MMP_INTERVAL_VALID_BIT 0x01
+#define MMP_SEQ_VALID_BIT 0x02
+#define MMP_FAIL_INT_VALID_BIT 0x04
+
+#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
+ ubp->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_INTERVAL_VALID_BIT))
+#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_SEQ_VALID_BIT))
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_FAIL_INT_VALID_BIT))
+
+#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+ >> 8)
+#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+ >> 32)
+#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+ >> 48)
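+
+/*
+ * For example, a ub_mmp_config of
+ * (10ULL << 48) | (5ULL << 32) | (1000ULL << 8) | 0x07 = 0x000a00050003e807
+ * decodes as MMP_FAIL_INT() = 10, MMP_SEQ() = 5, and MMP_INTERVAL() = 1000
+ * ms, with all three valid bits set.
+ */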
+
+typedef struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* SPA_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
+ /* Maybe missing in uberblocks we read, but always written */
+ uint64_t ub_mmp_magic;
+ /*
+ * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+ * Otherwise, nanosec since last MMP write.
+ */
+ uint64_t ub_mmp_delay;
+
+ /*
+ * The ub_mmp_config contains the multihost write interval, multihost
+ * fail intervals, sequence number for sub-second granularity, and
+ * valid bit mask. This layout is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+ *
+ * VALID Bits:
+ * - 0x01 - Write Interval (ms)
+ * - 0x02 - Sequence number exists
+ * - 0x04 - Fail Intervals
+ * - 0xf8 - Reserved
+ */
+ uint64_t ub_mmp_config;
+
+ /*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
+ uint64_t ub_checkpoint_txg;
+} uberblock_t;
+
+/*
+ * Flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 12 /* 4k */
+#define DN_MAX_INDBLKSHIFT 17 /* 128k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define DNODE_MIN_SIZE (1 << DNODE_SHIFT)
+#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT)
+#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT)
+#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \
+ (1 << SPA_BLKPTRSHIFT))
+#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
+#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
+#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> \
+ SPA_BLKPTRSHIFT)
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
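+
+/*
+ * For example, with SPA_MINBLOCKSHIFT of 9 and SPA_BLKPTRSHIFT of 7 (both
+ * defined earlier in this header): DNODE_MIN_SIZE = 512, so
+ * DN_OLD_MAX_BONUSLEN = 512 - 64 - 128 = 320 and
+ * DN_MAX_NBLKPTR = (512 - 64) >> 7 = 3; DNODES_PER_BLOCK =
+ * 1 << (14 - 9) = 32; and DN_MAX_LEVELS = 2 + (64 - 9) / (12 - 7) = 13.
+ */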
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
+ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
+
+#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
+
+/* Is dn_used in bytes? If not, it's in multiples of SPA_MINBLOCKSIZE. */
+#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_flags; /* DNODE_FLAG_* */
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_extra_slots; /* # of subsequent slots consumed */
+ uint8_t dn_pad2[3];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_used; /* bytes (or sectors) of disk space */
+
+ uint64_t dn_pad3[4];
+
+ /*
+ * The tail region is 448 bytes for a 512 byte dnode, and
+ * correspondingly larger for larger dnode sizes. The spill
+ * block pointer, when present, is always at the end of the tail
+ * region. There are three ways this space may be used, using
+ * a 512 byte dnode for this diagram:
+ *
+ * 0 64 128 192 256 320 384 448 (offset)
+ * +---------------+---------------+---------------+-------+
+ * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / |
+ * +---------------+---------------+---------------+-------+
+ * | dn_blkptr[0] | dn_bonus[0..319] |
+ * +---------------+-----------------------+---------------+
+ * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill |
+ * +---------------+-----------------------+---------------+
+ */
+ union {
+ blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
+ struct {
+ blkptr_t __dn_ignore1;
+ uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN];
+ };
+ struct {
+ blkptr_t __dn_ignore2;
+ uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
+ sizeof (blkptr_t)];
+ blkptr_t dn_spill;
+ };
+ };
+} dnode_phys_t;
+
+#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
+ (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
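+
+/*
+ * For a plain 512-byte dnode (dn_extra_slots == 0), DN_SPILL_BLKPTR()
+ * resolves to offset ((0 + 1) << 9) - (1 << 7) = 384, i.e. the dn_spill
+ * slot in the tail-region diagram above.
+ */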
+
+typedef enum dmu_object_byteswap {
+ DMU_BSWAP_UINT8,
+ DMU_BSWAP_UINT16,
+ DMU_BSWAP_UINT32,
+ DMU_BSWAP_UINT64,
+ DMU_BSWAP_ZAP,
+ DMU_BSWAP_DNODE,
+ DMU_BSWAP_OBJSET,
+ DMU_BSWAP_ZNODE,
+ DMU_BSWAP_OLDACL,
+ DMU_BSWAP_ACL,
+ /*
+ * Allocating a new byteswap type number makes the on-disk format
+ * incompatible with any other format that uses the same number.
+ *
+ * Data can usually be structured to work with one of the
+ * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
+ */
+ DMU_BSWAP_NUMFUNCS
+} dmu_object_byteswap_t;
+
+#define DMU_OT_NEWTYPE 0x80
+#define DMU_OT_METADATA 0x40
+#define DMU_OT_BYTESWAP_MASK 0x3f
+
+/*
+ * Defines a uint8_t object type. Object types specify if the data
+ * in the object is metadata (boolean) and how to byteswap the data
+ * (dmu_object_byteswap_t).
+ */
+#define DMU_OT(byteswap, metadata) \
+ (DMU_OT_NEWTYPE | \
+ ((metadata) ? DMU_OT_METADATA : 0) | \
+ ((byteswap) & DMU_OT_BYTESWAP_MASK))
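+
+/*
+ * For example, DMU_OT(DMU_BSWAP_ZAP, B_TRUE) = 0x80 | 0x40 | 4 = 0xc4,
+ * which is DMU_OTN_ZAP_METADATA below.
+ */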
+
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPOBJ, /* UINT64 */
+ DMU_OT_BPOBJ_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DIR, /* UINT64 */
+ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_OLDACL, /* Old ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_UNLINKED_SET, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
+ DMU_OT_SPA_HISTORY, /* UINT8 */
+ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
+ DMU_OT_POOL_PROPS, /* ZAP */
+ DMU_OT_DSL_PERMS, /* ZAP */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_SYSACL, /* SYSACL */
+ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
+ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
+ DMU_OT_NEXT_CLONES, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
+ DMU_OT_USERGROUP_USED, /* ZAP */
+ DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
+ DMU_OT_DEADLIST, /* ZAP */
+ DMU_OT_DEADLIST_HDR, /* UINT64 */
+ DMU_OT_DSL_CLONES, /* ZAP */
+ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
+ DMU_OT_NUMTYPES,
+
+ /*
+ * Names for valid types declared with DMU_OT().
+ */
+ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+ DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+ DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+ DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+ DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+ DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+ DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+ DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+ DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+ DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE)
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+#define ZAP_MAXVALUELEN (1024 * 8)
+
+/*
+ * Header for all bonus and spill buffers.
+ * The header has a fixed portion followed by a variable number
+ * of "lengths", depending on the number of variable-sized
+ * attributes, which is determined by the "layout number".
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16      10       0
+ * +--------+-------+
+ * | hdrsz  |layout |
+ * +--------+-------+
+ *
+ * Bits 0-9 are the layout number.
+ * Bits 10-15 are the size of the header.
+ * The header size is the stored value * 8.
+ *
+ * For example:
+ * hdrsz of 1 ==> 8 byte header
+ *          2 ==> 16 byte header
+ */
+
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
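+
+/*
+ * For example, encoding layout number 3 with a 16-byte header,
+ * SA_HDR_LAYOUT_INFO_ENCODE(x, 3, 16) stores 16 >> 3 = 2 in bits 10-15 and
+ * 3 in bits 0-9, so x = (2 << 10) | 3 = 0x803; SA_HDR_SIZE() then decodes
+ * 2 << 3 = 16 and SA_HDR_LAYOUT_NUM() decodes 3.
+ */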
+
+#define SA_ATTR_BSWAP(x) BF32_GET(x, 16, 8)
+#define SA_ATTR_LENGTH(x) BF32_GET(x, 24, 16)
+#define SA_ATTR_NUM(x) BF32_GET(x, 0, 16)
+#define SA_ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+ BF64_SET(x, 24, 16, length); \
+ BF64_SET(x, 16, 8, bswap); \
+ BF64_SET(x, 0, 16, attr); \
+}
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+#define SA_FLAGS_OFFSET 48
+#define SA_ATIME_OFFSET 56
+#define SA_MTIME_OFFSET 72
+#define SA_CTIME_OFFSET 88
+#define SA_CRTIME_OFFSET 104
+#define SA_LINKS_OFFSET 120
+//#define SA_PROJID_OFFSET 128
+
+#define SA_REGISTRY "REGISTRY"
+#define SA_LAYOUTS "LAYOUTS"
+
+typedef enum sa_bswap_type {
+ SA_UINT64_ARRAY,
+ SA_UINT32_ARRAY,
+ SA_UINT16_ARRAY,
+ SA_UINT8_ARRAY,
+ SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t sa_attr_type_t;
+
+#define ZIO_OBJSET_MAC_LEN 32
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_pad[5];
+} zil_header_t;
+
+#define OBJSET_PHYS_SIZE_V2 2048
+#define OBJSET_PHYS_SIZE_V3 4096
+
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ uint64_t os_flags;
+ uint8_t os_portable_mac[ZIO_OBJSET_MAC_LEN];
+ uint8_t os_local_mac[ZIO_OBJSET_MAC_LEN];
+ char os_pad0[OBJSET_PHYS_SIZE_V2 - sizeof (dnode_phys_t)*3 -
+ sizeof (zil_header_t) - sizeof (uint64_t)*2 -
+ 2*ZIO_OBJSET_MAC_LEN];
+ dnode_phys_t os_userused_dnode;
+ dnode_phys_t os_groupused_dnode;
+ dnode_phys_t os_projectused_dnode;
+ char os_pad1[OBJSET_PHYS_SIZE_V3 - OBJSET_PHYS_SIZE_V2 -
+ sizeof (dnode_phys_t)];
+} objset_phys_t;
+
+#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
+#define SPACE_MAP_HISTOGRAM_SIZE 32
+
+typedef struct space_map_phys {
+ /* object number: not needed but kept for backwards compatibility */
+ uint64_t smp_object;
+
+ /* length of the object in bytes */
+ uint64_t smp_length;
+
+ /* space allocated from the map */
+ int64_t smp_alloc;
+
+ /* reserved */
+ uint64_t smp_pad[5];
+
+ /*
+ * The smp_histogram maintains a histogram of free regions. Each
+ * bucket, smp_histogram[i], contains the number of free regions
+ * whose size is:
+ * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
+ *
+ * Note that, if log space map feature is enabled, histograms of
+ * space maps that belong to metaslabs will take into account any
+ * unflushed changes for their metaslabs, even though the actual
+ * space map doesn't have entries for these changes.
+ */
+ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
+} space_map_phys_t;
+
+typedef enum {
+ SM_ALLOC,
+ SM_FREE
+} maptype_t;
+
+typedef struct space_map_entry {
+ maptype_t sme_type;
+ uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */
+ uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */
+ uint64_t sme_run; /* max is 2^36; units of sm_shift */
+
+ /*
+ * The following fields are not part of the actual space map entry
+ * on-disk and they are populated with the values from the debug
+ * entry most recently visited starting from the beginning to the
+ * end of the space map.
+ */
+ uint64_t sme_txg;
+ uint64_t sme_sync_pass;
+} space_map_entry_t;
+
+/* one-word entry constants */
+#define SM_DEBUG_PREFIX 2
+#define SM_OFFSET_BITS 47
+#define SM_RUN_BITS 15
+
+/* two-word entry constants */
+#define SM2_PREFIX 3
+#define SM2_OFFSET_BITS 63
+#define SM2_RUN_BITS 36
+
+#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2)
+#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2)
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS)
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL)
+
+#define SM2_RUN_DECODE(x) (BF64_DECODE(x, 24, SM2_RUN_BITS) + 1)
+#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 24, SM2_RUN_BITS)
+#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, 24)
+#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, 24)
+#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL)
+#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL)
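+
+/*
+ * For example, a one-word SM_ALLOC entry for offset 100 and run length 10
+ * (both in units of sm_shift) is
+ * SM_OFFSET_ENCODE(100) | SM_TYPE_ENCODE(SM_ALLOC) | SM_RUN_ENCODE(10) =
+ * (100 << 16) | (0 << 15) | 9 = 0x640009; note that the run length is
+ * stored biased by one.
+ */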
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time; /* not actually used */
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_clone_parent_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_pad[7];
+ uint64_t dd_clones;
+ uint64_t dd_pad1[13]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj;
+ uint64_t ds_prev_snap_obj;
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj;
+ uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj;
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+ uint64_t ds_unique_bytes; /* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_flags;
+ blkptr_t ds_bp;
+ uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_deadlist_phys {
+ uint64_t dl_used;
+ uint64_t dl_comp;
+ uint64_t dl_uncomp;
+ uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))
+
+typedef struct bpobj_phys {
+ uint64_t bpo_num_blkptrs;
+ uint64_t bpo_bytes;
+ uint64_t bpo_comp;
+ uint64_t bpo_uncomp;
+ uint64_t bpo_subobjs;
+ uint64_t bpo_num_subobjs;
+ uint64_t bpo_num_freed;
+} bpobj_phys_t;
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
+#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
+#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
+#define DMU_POOL_SPARES "spares"
+#define DMU_POOL_DEFLATE "deflate"
+#define DMU_POOL_HISTORY "history"
+#define DMU_POOL_PROPS "pool_props"
+#define DMU_POOL_FREE_BPOBJ "free_bpobj"
+#define DMU_POOL_BPTREE_OBJ "bptree_obj"
+#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
+#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
+#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
+#define DMU_POOL_REMOVING "com.delphix:removing"
+#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
+#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
+#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
+
+#define ZAP_MAGIC 0x2F52AB2ABULL
+
+#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_block_shift)
+
+#define ZAP_MAXCD (uint32_t)(-1)
+#define ZAP_HASHBITS 28
+#define MZAP_ENT_LEN 64
+#define MZAP_ENT_MAX \
+ ((MZAP_MAX_BLKSZ - sizeof(mzap_phys_t)) / sizeof(mzap_ent_phys_t) + 1)
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSZ SPA_OLDMAXBLOCKSIZE
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_normflags;
+ uint64_t mz_pad[5];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block. Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+ ((uint64_t *)(zap)->zap_phys) \
+ [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
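+
+/*
+ * For example, with 16KB fat zap blocks (FZAP_BLOCK_SHIFT of 14),
+ * ZAP_EMBEDDED_PTRTBL_SHIFT(zap) = 14 - 3 - 1 = 10, i.e. 1024 pointer
+ * entries of 8 bytes each, starting at word 1024 and filling the second
+ * 8KB of the block.
+ */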
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number of source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leaf blocks */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flags_t */
+ /*
+ * This structure is followed by padding, and then the embedded
+ * pointer table. The embedded pointer table takes up second
+ * half of the block. It is accessed using the
+ * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+ */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+struct spa;
+typedef struct fat_zap {
+ int zap_block_shift; /* block size shift */
+ zap_phys_t *zap_phys;
+ const struct spa *zap_spa;
+ const dnode_phys_t *zap_dnode;
+} fat_zap_t;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define ZAP_LEAF_NUMCHUNKS(l) \
+ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+ ZAP_LEAF_CHUNKSIZE - 2)
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl. Ideally, this should be larger than a
+ * "reasonably-sized" entry. 20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) number of entries,
+ * which should be more than enough for the maximum number of entries,
+ * which is less than block size / CHUNKSIZE (24) / minimum number of
+ * chunks per entry (3).
+ */
+#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
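+
+/*
+ * For example, with a 16KB leaf (l_bs of 14):
+ * ZAP_LEAF_HASH_NUMENTRIES(l) = 1 << (14 - 5) = 512, and
+ * ZAP_LEAF_NUMCHUNKS(l) = (16384 - 2 * 512) / 24 - 2 = 638.
+ */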
+
+/*
+ * The chunks start immediately after the hash table. The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
+#define ZAP_LEAF_CHUNK(l, idx) \
+ ((zap_leaf_chunk_t *)(void *) \
+ ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+#define ZAP_LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> \
+ (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
+#define ZAP_LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[ZAP_LEAF_HASH(l, h)])
+
+typedef enum zap_chunk_type {
+ ZAP_CHUNK_FREE = 253,
+ ZAP_CHUNK_ENTRY = 252,
+ ZAP_CHUNK_ARRAY = 251,
+ ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lh_block_type; /* ZBT_LEAF */
+ uint64_t lh_pad1;
+ uint64_t lh_prefix; /* hash prefix of this leaf */
+ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lh_nfree; /* number free chunks */
+ uint16_t lh_nentries; /* number of entries */
+ uint16_t lh_prefix_len; /* num bits used to id this */
+
+/* above is accessible to zap; below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_pad2[12];
+ } l_hdr; /* 2 24-byte chunks */
+
+ /*
+ * The header is followed by a hash table with
+ * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
+ * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+ * zap_leaf_chunk structures. These structures are accessed
+ * with the ZAP_LEAF_CHUNK() macro.
+ */
+
+ uint16_t l_hash[1];
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
+ uint8_t le_value_intlen; /* size of ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_numints; /* bytes in name, incl null */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_numints; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_CHUNK_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+} zap_leaf_chunk_t;
+
+typedef struct zap_leaf {
+ int l_bs; /* block size shift */
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+#define ZAP_MAXNAMELEN 256
+#define ZAP_MAXVALUELEN (1024 * 8)
+
+#define ACE_READ_DATA 0x00000001 /* file: read data */
+#define ACE_LIST_DIRECTORY 0x00000001 /* dir: list files */
+#define ACE_WRITE_DATA 0x00000002 /* file: write data */
+#define ACE_ADD_FILE 0x00000002 /* dir: create file */
+#define ACE_APPEND_DATA 0x00000004 /* file: append data */
+#define ACE_ADD_SUBDIRECTORY 0x00000004 /* dir: create subdir */
+#define ACE_READ_NAMED_ATTRS 0x00000008 /* FILE_READ_EA */
+#define ACE_WRITE_NAMED_ATTRS 0x00000010 /* FILE_WRITE_EA */
+#define ACE_EXECUTE 0x00000020 /* file: execute */
+#define ACE_TRAVERSE 0x00000020 /* dir: lookup name */
+#define ACE_DELETE_CHILD 0x00000040 /* dir: unlink child */
+#define ACE_READ_ATTRIBUTES 0x00000080 /* (all) stat, etc. */
+#define ACE_WRITE_ATTRIBUTES 0x00000100 /* (all) utimes, etc. */
+#define ACE_DELETE 0x00010000 /* (all) unlink self */
+#define ACE_READ_ACL 0x00020000 /* (all) getsecattr */
+#define ACE_WRITE_ACL 0x00040000 /* (all) setsecattr */
+#define ACE_WRITE_OWNER 0x00080000 /* (all) chown */
+#define ACE_SYNCHRONIZE 0x00100000 /* (all) */
+
+#define ACE_FILE_INHERIT_ACE 0x0001
+#define ACE_DIRECTORY_INHERIT_ACE 0x0002
+#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004
+#define ACE_INHERIT_ONLY_ACE 0x0008
+#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010
+#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020
+#define ACE_IDENTIFIER_GROUP 0x0040
+#define ACE_INHERITED_ACE 0x0080
+#define ACE_OWNER 0x1000
+#define ACE_GROUP 0x2000
+#define ACE_EVERYONE 0x4000
+
+#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000
+#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001
+#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002
+#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003
+
+typedef struct zfs_ace_hdr {
+ uint16_t z_type;
+ uint16_t z_flags;
+ uint32_t z_access_mask;
+} zfs_ace_hdr_t;
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */
+#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */
+#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
+#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
+#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
+#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
+
+#define ZFS_READONLY 0x0000000100000000ull
+#define ZFS_HIDDEN 0x0000000200000000ull
+#define ZFS_SYSTEM 0x0000000400000000ull
+#define ZFS_ARCHIVE 0x0000000800000000ull
+#define ZFS_IMMUTABLE 0x0000001000000000ull
+#define ZFS_NOUNLINK 0x0000002000000000ull
+#define ZFS_APPENDONLY 0x0000004000000000ull
+#define ZFS_NODUMP 0x0000008000000000ull
+#define ZFS_OPAQUE 0x0000010000000000ull
+#define ZFS_AV_QUARANTINED 0x0000020000000000ull
+#define ZFS_AV_MODIFIED 0x0000040000000000ull
+#define ZFS_REPARSE 0x0000080000000000ull
+#define ZFS_OFFLINE 0x0000100000000000ull
+#define ZFS_SPARSE 0x0000200000000000ull
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * special attributes for master node.
+ */
+
+#define ZFS_FSID "FSID"
+#define ZFS_UNLINKED_SET "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZPL_VERSION_OBJ "VERSION"
+#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
+#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
+#define ZFS_SA_ATTRS "SA_ATTRS"
+
+#define ZFS_FLAG_BLOCKPERPAGE 0x1
+#define ZFS_FLAG_NOGROWBLOCKS 0x2
+
+/*
+ * ZPL version - rev'd whenever an incompatible on-disk format change
+ * occurs. Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define ZPL_VERSION 1ULL
+
+/*
+ * The directory entry has the type (currently unused on Solaris) in the
+ * top 4 bits, and the object number in the low 48 bits. The "middle"
+ * 12 bits are unused.
+ */
+#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
+#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
+#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj)
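+
+/*
+ * For example, a directory entry for a regular file (type DT_REG, 8) with
+ * object number 123 is ZFS_DIRENT_MAKE(8, 123) =
+ * (8ULL << 60) | 123 = 0x800000000000007b.
+ */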
+
+typedef struct ace {
+ uid_t a_who; /* uid or gid */
+ uint32_t a_access_mask; /* read,write,... */
+ uint16_t a_flags; /* see below */
+ uint16_t a_type; /* allow or deny */
+} ace_t;
+
+#define ACE_SLOT_CNT 6
+
+typedef struct zfs_znode_acl {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_pad; /* pad */
+ ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+/*
+ * This is the persistent portion of the znode. It is stored
+ * in the "bonus buffer" of the file. Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_pad[4]; /* 144 - future */
+ zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, e.g.:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we only use this space to store symbolic links.
+ */
+} znode_phys_t;
+
+/*
+ * In-core vdev representation.
+ */
+struct vdev;
+struct spa;
+typedef int vdev_phys_read_t(struct vdev *, void *, off_t, void *, size_t);
+typedef int vdev_phys_write_t(struct vdev *, off_t, void *, size_t);
+typedef int vdev_read_t(struct vdev *, const blkptr_t *, void *, off_t, size_t);
+
+typedef STAILQ_HEAD(vdev_list, vdev) vdev_list_t;
+
+typedef struct vdev_indirect_mapping_entry_phys {
+ /*
+ * Decode with DVA_MAPPING_* macros.
+ * Contains:
+ * the source offset (low 63 bits)
+ * the one-bit "mark", used for garbage collection (by zdb)
+ */
+ uint64_t vimep_src;
+
+ /*
+ * Note: the DVA's asize is 24 bits, and can thus store ranges
+ * up to 8GB.
+ */
+ dva_t vimep_dst;
+} vdev_indirect_mapping_entry_phys_t;
+
+#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
+ BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
+ BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#if 0
+typedef struct vdev_indirect_mapping_entry {
+ vdev_indirect_mapping_entry_phys_t vime_mapping;
+ uint32_t vime_obsolete_count;
+ list_node_t vime_node;
+} vdev_indirect_mapping_entry_t;
+#endif
+
+/*
+ * This is stored in the bonus buffer of the mapping object, see comment of
+ * vdev_indirect_config for more details.
+ */
+typedef struct vdev_indirect_mapping_phys {
+ uint64_t vimp_max_offset;
+ uint64_t vimp_bytes_mapped;
+ uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */
+
+ /*
+ * For each entry in the mapping object, this object contains an
+ * entry representing the number of bytes of that mapping entry
+ * that were no longer in use by the pool at the time this indirect
+ * vdev was last condensed.
+ */
+ uint64_t vimp_counts_object;
+} vdev_indirect_mapping_phys_t;
+
+#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
+
+typedef struct vdev_indirect_mapping {
+ uint64_t vim_object;
+ boolean_t vim_havecounts;
+
+ /* vim_entries segment offset currently in memory. */
+ uint64_t vim_entry_offset;
+ /* vim_entries segment size. */
+ size_t vim_num_entries;
+
+ /* Needed by dnode_read() */
+ const void *vim_spa;
+ dnode_phys_t *vim_dn;
+
+ /*
+ * An ordered array of mapping entries, sorted by source offset.
+ * Note that vim_entries is needed during a removal (and contains
+ * mappings that have been synced to disk so far) to handle frees
+ * from the removing device.
+ */
+ vdev_indirect_mapping_entry_phys_t *vim_entries;
+ objset_phys_t *vim_objset;
+ vdev_indirect_mapping_phys_t *vim_phys;
+} vdev_indirect_mapping_t;
+
+/*
+ * On-disk indirect vdev state.
+ *
+ * An indirect vdev is described exclusively in the MOS config of a pool.
+ * The config for an indirect vdev includes several fields, which are
+ * accessed in memory by a vdev_indirect_config_t.
+ */
+typedef struct vdev_indirect_config {
+ /*
+ * Object (in MOS) which contains the indirect mapping. This object
+ * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
+ * vimep_src. The bonus buffer for this object is a
+ * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
+ * removal is initiated.
+ *
+ * Note that this object can be empty if none of the data on the vdev
+ * has been copied yet.
+ */
+ uint64_t vic_mapping_object;
+
+ /*
+ * Object (in MOS) which contains the birth times for the mapping
+ * entries. This object contains an array of
+ * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
+ * buffer for this object is a vdev_indirect_birth_phys_t. This object
+ * is allocated when a vdev removal is initiated.
+ *
+ * Note that this object can be empty if none of the vdev has yet been
+ * copied.
+ */
+ uint64_t vic_births_object;
+
+ /*
+ * This is the vdev ID which was removed prior to this vdev, or
+ * UINT64_MAX if there are no previously removed vdevs.
+ */
+ uint64_t vic_prev_indirect_vdev;
+} vdev_indirect_config_t;
+
+typedef struct vdev {
+ STAILQ_ENTRY(vdev) v_childlink; /* link in parent's child list */
+ STAILQ_ENTRY(vdev) v_alllink; /* link in global vdev list */
+ vdev_list_t v_children; /* children of this vdev */
+ const char *v_name; /* vdev name */
+ uint64_t v_guid; /* vdev guid */
+ uint64_t v_id; /* index in parent */
+ uint64_t v_psize; /* physical device capacity */
+ int v_ashift; /* offset to block shift */
+ int v_nparity; /* # parity for raidz */
+ struct vdev *v_top; /* parent vdev */
+ size_t v_nchildren; /* # children */
+ vdev_state_t v_state; /* current state */
+ vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */
+ vdev_phys_write_t *v_phys_write; /* write to raw leaf vdev */
+ vdev_read_t *v_read; /* read from vdev */
+ void *v_priv; /* data for read/write function */
+ boolean_t v_islog;
+ struct spa *v_spa; /* link to spa */
+ /*
+ * Values stored in the config for an indirect or removing vdev.
+ */
+ vdev_indirect_config_t vdev_indirect_config;
+ vdev_indirect_mapping_t *v_mapping;
+} vdev_t;
+
+/*
+ * In-core pool representation.
+ */
+typedef STAILQ_HEAD(spa_list, spa) spa_list_t;
+
+typedef struct spa {
+ STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
+ char *spa_name; /* pool name */
+ uint64_t spa_guid; /* pool guid */
+ uint64_t spa_txg; /* most recent transaction */
+ struct uberblock *spa_uberblock; /* best uberblock so far */
+ vdev_t *spa_root_vdev; /* toplevel vdev container */
+ objset_phys_t *spa_mos; /* MOS for this pool */
+ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
+ void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
+ boolean_t spa_with_log; /* this pool has log */
+
+ struct uberblock spa_uberblock_master; /* best uberblock so far */
+ objset_phys_t spa_mos_master; /* MOS for this pool */
+ struct uberblock spa_uberblock_checkpoint; /* checkpoint uberblock */
+ objset_phys_t spa_mos_checkpoint; /* Checkpoint MOS */
+ void *spa_bootenv; /* bootenv from pool label */
+} spa_t;
+
+spa_t *spa_create(uint64_t guid, const char *name);
+
+/* IO related arguments. */
+typedef struct zio {
+ spa_t *io_spa;
+ blkptr_t *io_bp;
+ void *io_data;
+ uint64_t io_size;
+ uint64_t io_offset;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+
+ int io_error;
+} zio_t;
+
+#if 0 /* XXXMJ */
+static void decode_embedded_bp_compressed(const blkptr_t *, void *);
+#endif
+
+#endif /* _ZFSIMPL_H_ */