Page MenuHomeFreeBSD

D32697.id97787.diff
No OneTemporary

D32697.id97787.diff

diff --git a/include/Makefile b/include/Makefile
--- a/include/Makefile
+++ b/include/Makefile
@@ -50,7 +50,7 @@
fs/procfs fs/smbfs fs/udf fs/unionfs \
geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \
geom/mirror geom/mountver geom/multipath geom/nop \
- geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
+ geom/raid geom/raid3 geom/shsec geom/stripe geom/union geom/virstor \
net/altq \
net/route \
netgraph/atm netgraph/netflow \
diff --git a/lib/geom/Makefile.classes b/lib/geom/Makefile.classes
--- a/lib/geom/Makefile.classes
+++ b/lib/geom/Makefile.classes
@@ -22,4 +22,5 @@
GEOM_CLASSES+= raid3
GEOM_CLASSES+= shsec
GEOM_CLASSES+= stripe
+GEOM_CLASSES+= union
GEOM_CLASSES+= virstor
diff --git a/lib/geom/union/Makefile b/lib/geom/union/Makefile
new file mode 100644
--- /dev/null
+++ b/lib/geom/union/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+PACKAGE=runtime
+.PATH: ${.CURDIR:H:H}/misc
+
+GEOM_CLASS= union
+
+.include <bsd.lib.mk>
diff --git a/lib/geom/union/Makefile.depend b/lib/geom/union/Makefile.depend
new file mode 100644
--- /dev/null
+++ b/lib/geom/union/Makefile.depend
@@ -0,0 +1,19 @@
+# $FreeBSD$
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+ gnu/lib/csu \
+ include \
+ include/xlocale \
+ lib/${CSU_DIR} \
+ lib/libc \
+ lib/libcompiler_rt \
+ lib/libgeom \
+ sbin/geom/core \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif
diff --git a/lib/geom/union/geom_union.c b/lib/geom/union/geom_union.c
new file mode 100644
--- /dev/null
+++ b/lib/geom/union/geom_union.c
@@ -0,0 +1,82 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Marshall Kirk McKusick <mckusick@mckusick.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <libgeom.h>
+#include <geom/union/g_union.h>
+
+#include "core/geom.h"
+
+uint32_t lib_version = G_LIB_VERSION;
+uint32_t version = G_UNION_VERSION;
+
+struct g_command class_commands[] = {	/* verbs specific to gunion(8) */
+	{ "create", G_FLAG_LOADKLD, NULL,
+	    {
+		{ 'o', "offset", "0", G_TYPE_NUMBER },
+		{ 's', "size", "0", G_TYPE_NUMBER },
+		{ 'S', "secsize", "0", G_TYPE_NUMBER },
+		{ 'v', "verbose", NULL, G_TYPE_BOOL },
+		{ 'Z', "gunionname", G_VAL_OPTIONAL, G_TYPE_STRING },
+		G_OPT_SENTINEL
+	    },
+	    "[-v] [-o offset] [-s size] [-S secsize] [-Z gunionname] "
+	    "upperdev lowerdev"
+	},
+	{ "destroy", 0, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'v', "verbose", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] prov ..."
+	},
+	{ "reset", 0, NULL,
+	    {
+		{ 'v', "verbose", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-v] prov ..."
+	},
+	{ "commit", 0, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'v', "verbose", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] prov ..."
+	},
+	{ "revert", 0, NULL,
+	    {
+		{ 'v', "verbose", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-v] prov ..."
+	},
+	G_CMD_SENTINEL
+};
diff --git a/lib/geom/union/gunion.8 b/lib/geom/union/gunion.8
new file mode 100644
--- /dev/null
+++ b/lib/geom/union/gunion.8
@@ -0,0 +1,305 @@
+.\"
+.\" Copyright (c) 2021 Marshall Kirk McKusick <mckusick@mckusick.com>
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd October 30, 2021
+.Dt GUNION 8
+.Os
+.Sh NAME
+.Nm gunion
+.Nd "control utility for UNION GEOM class"
+.Sh SYNOPSIS
+.Nm
+.Cm create
+.Op Fl v
+.Op Fl o Ar offset
+.Op Fl s Ar size
+.Op Fl S Ar secsize
+.Op Fl Z Ar gunionname
+.Ar upperdev lowerdev
+.Nm
+.Cm destroy
+.Op Fl fv
+.Ar prov ...
+.Nm
+.Cm reset
+.Op Fl v
+.Ar prov ...
+.Nm
+.Cm revert
+.Op Fl v
+.Ar prov ...
+.Nm
+.Cm commit
+.Op Fl fv
+.Ar prov ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used to track changes to a read-only disk on a writable disk.
+Logically, a writable disk is placed over a read-only disk.
+Write requests are intercepted and stored on the writable disk.
+Read requests are first checked to see if they have been written
+on the top (writable disk) and if found are returned.
+If they have not been written on the top disk,
+then they are read from the lower disk.
+.Pp
+The
+.Nm
+utility can be especially useful if you have a large disk with a
+corrupted filesystem that you are unsure of how to repair.
+You can use
+.Nm
+to place another disk over the corrupted disk and then attempt
+to repair the filesystem.
+If the repair fails, you can revert all the changes in the upper disk
+and be back to the unchanged state of the lower disk thus allowing you
+to try another approach to repairing it.
+If the repair is successful you can request that all the writes recorded
+on the top disk be written to the lower disk.
+.Pp
+Another use of the
+.Nm
+utility is to try out upgrades to your system.
+Place the upper disk over the disk holding your filesystem that
+is to be upgraded and then run the upgrade on it.
+If it works, commit it;
+if it fails, revert the upgrade.
+An example is given below.
+.Pp
+The upper disk must be at least the size of the disk that it covers.
+The union metadata exists only for the
+period of time that the union is instantiated,
+so it is important to commit the updates before destroying the union.
+If the top disk is about 2.5 percent larger for 512 byte sector disks
+(or 0.5 percent larger for 4K sector disks) than the disk that it covers,
+it is possible (though not currently implemented) to save the union
+metadata between instantiations of the union device.
+.Pp
+If you do not have physical media available to use for the upper layer, the
+.Xr md 4
+disk can be used instead.
+When used in
+.Cm swap
+mode the changes are all held in buffer memory.
+Pages get pushed out to the swap when the system is under memory pressure,
+otherwise they stay in the operating memory.
+If long-term persistence is desired,
+.Cm vnode
+mode can be used in which a regular file is used as backing store.
+The disk space used by the file is based on the amount of data that
+is written to the top device.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width "destroy"
+.It Cm create
+Set up a union provider on the two given devices.
+The first device given is used as the top device and must be writable.
+The second device given is used as the bottom device and need only be readable.
+The second device may be mounted read-only but it is recommended
+that it be unmounted and accessed only through a mount of the union device.
+If the operation succeeds, the new provider should appear with name
+.Pa /dev/ Ns Ao Ar upperdev Ac Ns - Ns Ao Ar lowerdev Ac Ns Pa .union .
+An alternate name can be specified with the
+.Fl Z
+flag.
+The kernel module
+.Pa geom_union.ko
+will be loaded if it is not loaded already.
+.Pp
+Additional options include:
+.Bl -tag -width "-Z gunionname"
+.It Fl o Ar offset
+Where to begin on the original provider.
+The default is to start at the beginning of the disk (i.e., at offset 0).
+This option may be used to skip over partitioning information stored
+at the beginning of a disk.
+The offset must be a multiple of the sector size.
+.It Fl s Ar size
+Size of the union provider.
+The default is to be the same size as the lower disk.
+Any extra space at the end of the upper disk may be used to store
+union metadata.
+.It Fl S Ar secsize
+Sector size of the union provider.
+The default is to be the same sector size as the lower disk.
+.It Fl v
+Be more verbose.
+.It Fl Z Ar gunionname
+The name of the new provider.
+The suffix
+.Dq .union
+will be appended to the provider name.
+.El
+.It Cm destroy
+Turn off the given union providers.
+.Pp
+Additional options include:
+.Bl -tag -width "-f"
+.It Fl f
+Force the removal of the specified provider.
+.It Fl v
+Be more verbose.
+.El
+.It Cm revert
+Discard all the changes made in the top layer thus reverting to the
+original state of the lower device.
+.It Cm commit
+Write all the changes made in the top device to the lower device
+thus committing the lower device to have the same data as the union.
+.Pp
+Additional options include:
+.Bl -tag -width "-f"
+.It Fl f
+The
+.Cm commit
+command will not allow the lower device to be mounted while the
+.Cm commit
+operation is being done.
+However, the
+.Fl f
+flag may be specified to allow the lower device to be mounted read-only.
+To prevent a filesystem panic on the mounted lower-device filesystem,
+immediately after the
+.Cm commit
+operation finishes the lower-device filesystem should be unmounted
+and then remounted to update its metadata state.
+If the lower-device filesystem is UFS/FFS,
+it is simply necessary to upgrade from read-only to read-write as the
+filesystem will reload its in-kernel state as part of making that change.
+.It Fl v
+Be more verbose.
+.El
+.It Cm reset
+Reset statistics for the given union providers.
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, and 1 if the command fails.
+.Sh EXAMPLES
+The following example shows how to create and destroy a
+union provider with disks
+.Pa /dev/da0p1
+as the read-only disk on the bottom and
+.Pa /dev/md0
+as the writable disk on the top.
+.Bd -literal -offset indent
+gunion create -v md0 da0p1
+mount /dev/md0-da0p1.union /mnt
+.Ed
+.Pp
+Proceed to make changes in the /mnt filesystem.
+If they are successful and you want to keep them:
+.Bd -literal -offset indent
+gunion commit -v md0-da0p1.union
+.Ed
+.Pp
+If they are unsuccessful and you want to roll back:
+.Bd -literal -offset indent
+gunion revert -v md0-da0p1.union
+.Ed
+.Pp
+When done, eliminate the union:
+.Bd -literal -offset indent
+gunion destroy -v md0-da0p1.union
+.Ed
+.Pp
+All uncommitted changes will be discarded when the union is destroyed.
+.Pp
+If you use the name of the full disk, for example
+.Pa da0
+and it is labelled,
+then a union name will appear for the disk as
+.Pa md0-da0.union
+as well as for each partition on the disk as
+.Pa md0-da0p1.union ,
+.Pa md0-da0p2.union ,
+etc.
+A commit operation can be done only on
+.Pa md0-da0.union
+and will commit changes to all the partitions.
+If partition level commits are desired,
+then a union must be created for each partition.
+.Pp
+The traffic statistics for the given
+union providers can be obtained with the
+.Cm list
+command.
+The example below shows the number of bytes written with
+.Xr newfs 8 :
+.Bd -literal -offset indent
+gunion create md0 da0p1
+newfs /dev/md0-da0p1.union
+gunion list
+.Ed
+.Sh SYSCTL VARIABLES
+The following
+.Xr sysctl 8
+variables can be used to control the behavior of the
+.Nm UNION
+GEOM class.
+The default value is shown next to each variable.
+.Bl -tag -width indent
+.It Va kern.geom.union.debug : No 0
+Debug level of the
+.Nm UNION
+GEOM class.
+This can be set to a number between 0 and 3 inclusive.
+If set to 0, no debug information is printed.
+If set to 1, all the verbose messages are logged.
+If set to 2, additional error-related information is logged.
+If set to 3, the maximum amount of debug information is printed.
+.El
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr geom 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 14.0 .
+.Sh AUTHORS
+.An Marshall Kirk McKusick Aq Mt mckusick@mckusick.com
diff --git a/sbin/geom/core/geom.8 b/sbin/geom/core/geom.8
--- a/sbin/geom/core/geom.8
+++ b/sbin/geom/core/geom.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd September 14, 2018
+.Dd October 30, 2021
.Dt GEOM 8
.Os
.Sh NAME
@@ -162,6 +162,8 @@
.It
STRIPE
.It
+UNION
+.It
VIRSTOR
.El
.Sh ENVIRONMENT
@@ -210,6 +212,7 @@
.Xr gsched 8 ,
.Xr gshsec 8 ,
.Xr gstripe 8 ,
+.Xr gunion 8 ,
.Xr gvirstor 8
.Sh HISTORY
The
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3703,6 +3703,7 @@
geom/raid3/g_raid3_ctl.c optional geom_raid3
geom/shsec/g_shsec.c optional geom_shsec
geom/stripe/g_stripe.c optional geom_stripe
+geom/union/g_union.c optional geom_union
geom/uzip/g_uzip.c optional geom_uzip
geom/uzip/g_uzip_lzma.c optional geom_uzip
geom/uzip/g_uzip_wrkthr.c optional geom_uzip
diff --git a/sys/geom/union/g_union.h b/sys/geom/union/g_union.h
new file mode 100644
--- /dev/null
+++ b/sys/geom/union/g_union.h
@@ -0,0 +1,123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Marshall Kirk McKusick <mckusick@mckusick.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _G_UNION_H_
+#define _G_UNION_H_
+
+#define G_UNION_CLASS_NAME "UNION" /* name of the GEOM class */
+#define G_UNION_VERSION 1 /* userland/kernel interface version */
+#define G_UNION_SUFFIX ".union" /* appended to union provider names */
+/*
+ * Special flag instructing gunion to pass through the underlying provider's
+ * physical path.
+ */
+#define G_UNION_PHYSPATH_PASSTHROUGH "\255"
+
+#ifdef _KERNEL
+#define G_UNION_DEBUG(lvl, ...) \
+ _GEOM_DEBUG("GEOM_UNION", g_union_debug, (lvl), NULL, __VA_ARGS__) /* no bio */
+#define G_UNION_LOGREQLVL(lvl, bp, ...) \
+ _GEOM_DEBUG("GEOM_UNION", g_union_debug, (lvl), (bp), __VA_ARGS__) /* with bio */
+#define G_UNION_LOGREQ(bp, ...) G_UNION_LOGREQLVL(3, bp, __VA_ARGS__) /* level 3 */
+
+/*
+ * State maintained by each instance of a UNION GEOM.
+ */
+struct g_union_softc {
+ struct rwlock sc_rwlock; /* writemap lock */
+ uint64_t **sc_writemap_root; /* root of write map: array of leaf pointers */
+ uint64_t *sc_leafused; /* 1 => leaf has allocation */
+ uint64_t sc_map_size; /* bits in write map: sectors, rounded up to 64 */
+ long sc_root_size; /* entries in root node */
+ long sc_leaf_size; /* entries in leaf node */
+ long sc_bits_per_leaf; /* map bits held by one leaf node */
+ long sc_writemap_memory; /* bytes of memory used by writemap */
+ off_t sc_offset; /* starting offset in lower */
+ off_t sc_size; /* size of union geom */
+ off_t sc_sectorsize; /* sector size of geom */
+ struct g_consumer *sc_uppercp; /* consumer attached to upper provider */
+ struct g_consumer *sc_lowercp; /* consumer attached to lower provider */
+ long sc_flags; /* see flags below */
+ long sc_reads; /* number of reads done */
+ long sc_wrotebytes; /* number of bytes written */
+ long sc_writes; /* number of writes done */
+ long sc_readbytes; /* number of bytes read */
+ long sc_deletes; /* number of deletes done */
+ long sc_getattrs; /* number of getattrs done */
+ long sc_flushes; /* number of flushes done */
+ long sc_cmd0s; /* number of cmd0's done */
+ long sc_cmd1s; /* number of cmd1's done */
+ long sc_cmd2s; /* number of cmd2's done */
+ long sc_speedups; /* number of speedups done */
+};
+/*
+ * UNION flags (kept in sc_flags)
+ */
+#define DOING_COMMIT 0x00000001 /* a commit command is in progress */
+
+#define DOING_COMMIT_BITNUM 0 /* bit number of DOING_COMMIT, for atomic ops */
+
+#define BITS_PER_ENTRY (sizeof(uint64_t) * NBBY) /* bits per uint64_t map word */
+#define G_RLOCK(sc) rw_rlock(&(sc)->sc_rwlock) /* read-lock the writemap lock */
+#define G_RUNLOCK(sc) rw_runlock(&(sc)->sc_rwlock)
+#define G_WLOCK(sc) rw_wlock(&(sc)->sc_rwlock) /* write-lock the writemap lock */
+#define G_WUNLOCK(sc) rw_wunlock(&(sc)->sc_rwlock)
+
+/*
+ * The writelock (DOING_COMMIT in sc_flags) is held while a commit operation
+ * is in progress; the union device is not to be used while it is held.
+ * Returns 0 if the lock was obtained, non-zero if it was already held.
+ */
+static inline int
+g_union_get_writelock(struct g_union_softc *sc)
+{
+
+ return (atomic_testandset_long(&sc->sc_flags, DOING_COMMIT_BITNUM));
+}
+
+static inline void
+g_union_rel_writelock(struct g_union_softc *sc)
+{
+ long ret __diagused; /* previous bit value, examined only by KASSERT */
+
+ ret = atomic_testandclear_long(&sc->sc_flags, DOING_COMMIT_BITNUM);
+ KASSERT(ret != 0, ("UNION GEOM releasing unheld lock")); /* was it held? */
+}
+
+/*
+ * Used to track the set of read requests issued for a single BIO_READ request.
+ */
+struct g_union_iotrack {
+ long io_numios; /* number of I/O operations in progress */
+ long io_error; /* non-I/O errors */
+};
+
+#endif /* _KERNEL */
+
+#endif /* _G_UNION_H_ */
diff --git a/sys/geom/union/g_union.c b/sys/geom/union/g_union.c
new file mode 100644
--- /dev/null
+++ b/sys/geom/union/g_union.c
@@ -0,0 +1,1239 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Marshall Kirk McKusick <mckusick@mckusick.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/ctype.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <geom/geom.h>
+#include <geom/geom_dbg.h>
+#include <geom/union/g_union.h>
+
+SYSCTL_DECL(_kern_geom);
+static SYSCTL_NODE(_kern_geom, OID_AUTO, union, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "GEOM_UNION stuff");
+static u_int g_union_debug = 0; /* kern.geom.union.debug; read by G_UNION_DEBUG() */
+SYSCTL_UINT(_kern_geom_union, OID_AUTO, debug, CTLFLAG_RW, &g_union_debug, 0,
+ "Debug level");
+
+static void g_union_config(struct gctl_req *req, struct g_class *mp,
+ const char *verb);
+static g_access_t g_union_access;
+static g_start_t g_union_start;
+static g_dumpconf_t g_union_dumpconf;
+static g_orphan_t g_union_orphan;
+static int g_union_destroy_geom(struct gctl_req *req, struct g_class *mp,
+ struct g_geom *gp);
+static g_provgone_t g_union_providergone;
+static g_resize_t g_union_resize;
+
+struct g_class g_union_class = { /* methods of the UNION class */
+ .name = G_UNION_CLASS_NAME,
+ .version = G_VERSION,
+ .ctlreq = g_union_config, /* dispatches the gunion(8) verbs */
+ .access = g_union_access,
+ .start = g_union_start,
+ .dumpconf = g_union_dumpconf,
+ .orphan = g_union_orphan,
+ .destroy_geom = g_union_destroy_geom,
+ .providergone = g_union_providergone,
+ .resize = g_union_resize,
+};
+
+static void g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool);
+static intmax_t g_union_fetcharg(struct gctl_req *req, const char *name);
+static bool g_union_verify_nprefix(const char *name);
+static void g_union_ctl_destroy(struct gctl_req *req, struct g_class *mp, bool);
+static struct g_geom *g_union_find_geom(struct g_class *mp, const char *name);
+static void g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool);
+static void g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool);
+static void g_union_revert(struct g_union_softc *sc);
+static void g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool);
+static void g_union_setmap(struct bio *bp, struct g_union_softc *sc);
+static bool g_union_getmap(struct bio *bp, struct g_union_softc *sc,
+ off_t *len2read);
+static void g_union_done(struct bio *bp);
+static void g_union_kerneldump(struct bio *bp, struct g_union_softc *sc);
+static int g_union_dumper(void *, void *, vm_offset_t, off_t, size_t);
+static int g_union_destroy(struct g_geom *gp, bool force, bool verbose);
+
+/*
+ * Validate a control request and dispatch it to the handler for its verb.
+ */
+static void
+g_union_config(struct gctl_req *req, struct g_class *mp, const char *verb)
+{
+	uint32_t *uvers, *vflag;
+	bool verbose;
+
+	g_topology_assert();
+
+	/* The request must carry this module's interface version. */
+	uvers = gctl_get_paraml(req, "version", sizeof(*uvers));
+	if (uvers == NULL) {
+		gctl_error(req, "No '%s' argument.", "version");
+		return;
+	}
+	if (*uvers != G_UNION_VERSION) {
+		gctl_error(req, "Userland and kernel parts are out of sync.");
+		return;
+	}
+
+	/* Every verb handler is passed the caller's verbose setting. */
+	vflag = gctl_get_paraml(req, "verbose", sizeof(*vflag));
+	if (vflag == NULL) {
+		gctl_error(req, "No '%s' argument.", "verbose");
+		return;
+	}
+	verbose = *vflag;
+
+	if (strcmp(verb, "create") == 0)
+		g_union_ctl_create(req, mp, verbose);
+	else if (strcmp(verb, "destroy") == 0)
+		g_union_ctl_destroy(req, mp, verbose);
+	else if (strcmp(verb, "reset") == 0)
+		g_union_ctl_reset(req, mp, verbose);
+	else if (strcmp(verb, "revert") == 0)
+		g_union_ctl_revert(req, mp, verbose);
+	else if (strcmp(verb, "commit") == 0)
+		g_union_ctl_commit(req, mp, verbose);
+	else
+		gctl_error(req, "Unknown verb.");
+}
+
+/*
+ * Create a union device from the upper (arg0) and lower (arg1) providers.
+ */
+static void
+g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_provider *upperpp, *lowerpp, *newpp;
+	struct g_consumer *uppercp, *lowercp;
+	struct g_union_softc *sc;
+	struct g_geom_alias *gap;
+	struct g_geom *gp;
+	intmax_t offset, secsize, size, needed;
+	const char *gunionname;
+	int *nargs, error, i, n;
+	char name[64];
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*nargs < 2) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	if (*nargs > 2) {
+		gctl_error(req, "Extra device(s).");
+		return;
+	}
+
+	offset = g_union_fetcharg(req, "offset");
+	size = g_union_fetcharg(req, "size");
+	secsize = g_union_fetcharg(req, "secsize");
+	gunionname = gctl_get_asciiparam(req, "gunionname");
+
+	upperpp = gctl_get_provider(req, "arg0");
+	lowerpp = gctl_get_provider(req, "arg1");
+	if (upperpp == NULL || lowerpp == NULL)
+		/* error message provided by gctl_get_provider() */
+		return;
+	/* Validate the requested geometry against both providers. */
+	if (secsize == 0)
+		secsize = lowerpp->sectorsize;
+	else if ((secsize % lowerpp->sectorsize) != 0) {
+		gctl_error(req, "Sector size %jd is not a multiple of lower "
+		    "provider %s's %jd sector size.", (intmax_t)secsize,
+		    lowerpp->name, (intmax_t)lowerpp->sectorsize);
+		return;
+	}
+	if (secsize > maxphys) {
+		gctl_error(req, "Too big secsize %jd for lower provider %s.",
+		    (intmax_t)secsize, lowerpp->name);
+		return;
+	}
+	if (secsize % upperpp->sectorsize != 0) {
+		gctl_error(req, "Sector size %jd is not a multiple of upper "
+		    "provider %s's %jd sector size.", (intmax_t)secsize,
+		    upperpp->name, (intmax_t)upperpp->sectorsize);
+		return;
+	}
+	/* The check is against secsize, so that is the value reported. */
+	if ((offset % secsize) != 0) {
+		gctl_error(req, "Offset %jd is not a multiple of sector size "
+		    "%jd.", (intmax_t)offset, (intmax_t)secsize);
+		return;
+	}
+	if (size == 0)
+		size = lowerpp->mediasize - offset;
+	else
+		size -= offset;
+	if ((size % secsize) != 0) {
+		gctl_error(req, "Size %jd is not a multiple of sector size "
+		    "%jd.", (intmax_t)size, (intmax_t)secsize);
+		return;
+	}
+	if (offset + size < lowerpp->mediasize) {
+		gctl_error(req, "Size %jd is too small for lower provider %s, "
+		    "needs %jd.", (intmax_t)(offset + size), lowerpp->name,
+		    (intmax_t)lowerpp->mediasize);
+		return;
+	}
+	if (size > upperpp->mediasize) {
+		gctl_error(req, "Upper provider %s size (%jd) is too small, "
+		    "needs %jd.", upperpp->name, (intmax_t)upperpp->mediasize,
+		    (intmax_t)size);
+		return;
+	}
+	if (gunionname != NULL && !g_union_verify_nprefix(gunionname)) {
+		gctl_error(req, "Gunion name %s must be alphanumeric.",
+		    gunionname);
+		return;
+	}
+	if (gunionname != NULL) {
+		n = snprintf(name, sizeof(name), "%s%s", gunionname,
+		    G_UNION_SUFFIX);
+	} else {
+		n = snprintf(name, sizeof(name), "%s-%s%s", upperpp->name,
+		    lowerpp->name, G_UNION_SUFFIX);
+	}
+	if (n <= 0 || (size_t)n >= sizeof(name)) {
+		gctl_error(req, "Invalid provider name.");
+		return;
+	}
+
+	/* Refuse to create a second geom with the same name. */
+	if (g_union_find_geom(mp, name) != NULL) {
+		gctl_error(req, "Provider %s already exists.", name);
+		return;
+	}
+	gp = g_new_geomf(mp, "%s", name);
+	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
+	rw_init(&sc->sc_rwlock, "gunion");
+	sc->sc_offset = offset;
+	sc->sc_size = size;
+	sc->sc_sectorsize = secsize;
+	sc->sc_reads = 0;		/* statistics all start at zero */
+	sc->sc_writes = 0;
+	sc->sc_deletes = 0;
+	sc->sc_getattrs = 0;
+	sc->sc_flushes = 0;
+	sc->sc_speedups = 0;
+	sc->sc_cmd0s = 0;
+	sc->sc_cmd1s = 0;
+	sc->sc_cmd2s = 0;
+	sc->sc_readbytes = 0;
+	sc->sc_wrotebytes = 0;
+	sc->sc_writemap_memory = 0;
+	gp->softc = sc;
+
+	newpp = g_new_providerf(gp, "%s", gp->name);
+	newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
+	newpp->mediasize = size;
+	newpp->sectorsize = secsize;
+	LIST_FOREACH(gap, &upperpp->aliases, ga_next)
+		g_provider_add_alias(newpp, "%s%s", gap->ga_alias,
+		    G_UNION_SUFFIX);
+	LIST_FOREACH(gap, &lowerpp->aliases, ga_next)
+		g_provider_add_alias(newpp, "%s%s", gap->ga_alias,
+		    G_UNION_SUFFIX);
+	lowercp = g_new_consumer(gp);
+	lowercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+	if ((error = g_attach(lowercp, lowerpp)) != 0) {
+		gctl_error(req, "Error %d: cannot attach to provider %s.",
+		    error, lowerpp->name);
+		goto fail1;
+	}
+	/* request read and exclusive access for lower */
+	if ((error = g_access(lowercp, 1, 0, 1)) != 0) {
+		gctl_error(req, "Error %d: cannot obtain exclusive access to "
+		    "%s.\n\tMust be unmounted or mounted read-only.", error,
+		    lowerpp->name);
+		goto fail2;
+	}
+	uppercp = g_new_consumer(gp);
+	uppercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+	if ((error = g_attach(uppercp, upperpp)) != 0) {
+		gctl_error(req, "Error %d: cannot attach to provider %s.",
+		    error, upperpp->name);
+		goto fail3;
+	}
+	/* request read, write, and exclusive access for upper */
+	if ((error = g_access(uppercp, 1, 1, 1)) != 0) {
+		gctl_error(req, "Error %d: cannot obtain write access to %s.",
+		    error, upperpp->name);
+		goto fail4;
+	}
+	sc->sc_uppercp = uppercp;
+	sc->sc_lowercp = lowercp;
+
+	newpp->flags |= (upperpp->flags & G_PF_ACCEPT_UNMAPPED) &
+	    (lowerpp->flags & G_PF_ACCEPT_UNMAPPED);
+	g_error_provider(newpp, 0);
+	/*
+	 * Allocate the map that tracks the sectors that have been written
+	 * to the top layer. We use a 2-level hierarchy. The map takes one
+	 * bit per sector: about 32 Mb per terabyte mapped when using 4K
+	 * byte sectors (or 256 Mb per terabyte with 512 byte sectors).
+	 *
+	 * We totally populate the leaf nodes rather than allocating them
+	 * as they are first used because their usage occurs in the
+	 * g_union_start() routine that may be running in the g_down
+	 * thread which cannot sleep.
+	 */
+	sc->sc_map_size = roundup(size / secsize, BITS_PER_ENTRY);
+	needed = sc->sc_map_size / BITS_PER_ENTRY;
+	for (sc->sc_root_size = 1;
+	    sc->sc_root_size * sc->sc_root_size < needed;
+	    sc->sc_root_size++)
+		continue;	/* smallest size whose square covers needed */
+	sc->sc_writemap_root = g_malloc(sc->sc_root_size * sizeof(uint64_t),
+	    M_WAITOK | M_ZERO);
+	sc->sc_leaf_size = sc->sc_root_size;
+	sc->sc_bits_per_leaf = sc->sc_leaf_size * BITS_PER_ENTRY;
+	sc->sc_leafused = g_malloc(roundup(sc->sc_root_size, BITS_PER_ENTRY),
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < sc->sc_root_size; i++)
+		sc->sc_writemap_root[i] =
+		    g_malloc(sc->sc_leaf_size * sizeof(uint64_t),
+		    M_WAITOK | M_ZERO);
+	sc->sc_writemap_memory =
+	    (sc->sc_root_size + sc->sc_root_size * sc->sc_leaf_size) *
+	    sizeof(uint64_t) + roundup(sc->sc_root_size, BITS_PER_ENTRY);
+	if (verbose)
+		printf("Device %s created with memory map size %jd.\n",
+		    gp->name, (intmax_t)sc->sc_writemap_memory);
+	G_UNION_DEBUG(1, "Device %s created with memory map size %jd.",
+	    gp->name, (intmax_t)sc->sc_writemap_memory);
+	return;
+
+fail4:
+	g_detach(uppercp);
+fail3:
+	g_destroy_consumer(uppercp);
+	g_access(lowercp, -1, 0, -1);	/* drop the counts taken on lower */
+fail2:
+	g_detach(lowercp);
+fail1:
+	g_destroy_consumer(lowercp);
+	g_destroy_provider(newpp);
+	g_destroy_geom(gp);
+}
+
+/*
+ * Fetch named option; returns 0 if it is absent or (with an error) negative.
+ */
+static intmax_t
+g_union_fetcharg(struct gctl_req *req, const char *name)
+{
+	intmax_t *argp;
+
+	argp = gctl_get_paraml_opt(req, name, sizeof(*argp));
+	if (argp == NULL)
+		return (0);
+	if (*argp < 0)
+		gctl_error(req, "Invalid '%s': negative value, using default",
+		    name);
+	return (*argp < 0 ? 0 : *argp);
+}
+
+/*
+ * Check that a name consists solely of letters and digits.
+ *
+ * Returns true when every character of "name" satisfies isalpha() or
+ * isdigit(); the empty string is therefore accepted.  Returns false at
+ * the first character that is neither.
+ */
+static bool
+g_union_verify_nprefix(const char *name)
+{
+	const char *cp;
+
+	for (cp = name; *cp != '\0'; cp++) {
+		if (!isalpha(*cp) && !isdigit(*cp))
+			return (false);
+	}
+	return (true);
+}
+
+/*
+ * Destroy one or more union devices on behalf of a control request.
+ *
+ * The request must carry an "nargs" count, a "force" flag and one
+ * "argN" device name per device.  A leading _PATH_DEV prefix on a name
+ * is skipped before the geom is looked up by name.  Processing stops at
+ * the first device that is missing or cannot be destroyed, and the
+ * failure is recorded on the request.
+ */
+static void
+g_union_ctl_destroy(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	int *nargs, *force, error, i;
+	struct g_geom *gp;
+	const char *name;
+	char param[16];
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*nargs <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	force = gctl_get_paraml(req, "force", sizeof(*force));
+	if (force == NULL) {
+		gctl_error(req, "No 'force' argument");
+		return;
+	}
+
+	for (i = 0; i < *nargs; i++) {
+		snprintf(param, sizeof(param), "arg%d", i);
+		name = gctl_get_asciiparam(req, param);
+		if (name == NULL) {
+			gctl_error(req, "No 'arg%d' argument", i);
+			return;
+		}
+		/* Accept both "/dev/foo" and "foo". */
+		if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0)
+			name += strlen(_PATH_DEV);
+		gp = g_union_find_geom(mp, name);
+		if (gp == NULL) {
+			gctl_error(req, "Device %s is invalid.", name);
+			return;
+		}
+		/*
+		 * Keep the assignment separate from the comparison so that
+		 * error holds the value returned by g_union_destroy() and
+		 * not the 0/1 result of comparing it against zero.
+		 */
+		error = g_union_destroy(gp, *force, verbose);
+		if (error != 0) {
+			gctl_error(req, "Error %d: cannot destroy device %s.",
+			    error, gp->name);
+			return;
+		}
+	}
+}
+
+/*
+ * Look up a geom of class "mp" by exact name.
+ *
+ * Returns the first geom on the class's list whose name equals "name",
+ * or NULL when no geom matches.
+ */
+static struct g_geom *
+g_union_find_geom(struct g_class *mp, const char *name)
+{
+	struct g_geom *candidate;
+
+	LIST_FOREACH(candidate, &mp->geom, geom) {
+		if (strcmp(candidate->name, name) != 0)
+			continue;
+		return (candidate);
+	}
+	return (NULL);
+}
+
+/*
+ * Clear the statistics counters of the union devices named in a
+ * control request.
+ *
+ * The request must carry an "nargs" count and one "argN" provider per
+ * device.  Processing stops at the first argument that does not resolve
+ * to a provider or whose geom does not belong to class "mp".
+ */
+static void
+g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_union_softc *softc;
+	struct g_provider *prov;
+	struct g_geom *geom;
+	char argname[16];
+	int *countp, n;
+
+	g_topology_assert();
+
+	countp = gctl_get_paraml(req, "nargs", sizeof(*countp));
+	if (countp == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*countp <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+
+	n = 0;
+	while (n < *countp) {
+		snprintf(argname, sizeof(argname), "arg%d", n);
+		prov = gctl_get_provider(req, argname);
+		if (prov == NULL)
+			return;
+		geom = prov->geom;
+		if (geom->class != mp) {
+			gctl_error(req, "Provider %s is invalid.", prov->name);
+			return;
+		}
+		softc = geom->softc;
+		/* Per-command request counters. */
+		softc->sc_reads = 0;
+		softc->sc_writes = 0;
+		softc->sc_deletes = 0;
+		softc->sc_getattrs = 0;
+		softc->sc_flushes = 0;
+		softc->sc_speedups = 0;
+		softc->sc_cmd0s = 0;
+		softc->sc_cmd1s = 0;
+		softc->sc_cmd2s = 0;
+		/* Byte counters. */
+		softc->sc_readbytes = 0;
+		softc->sc_wrotebytes = 0;
+		if (verbose)
+			printf("Device %s has been reset.\n", geom->name);
+		G_UNION_DEBUG(1, "Device %s has been reset.", geom->name);
+		n++;
+	}
+}
+
+/*
+ * Discard the record of every write made to the top layer of the union
+ * devices named in a control request.
+ *
+ * The request must carry an "nargs" count and one "argN" provider per
+ * device.  Processing stops at the first argument that does not resolve
+ * to a provider or whose geom does not belong to class "mp".
+ */
+static void
+g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_provider *prov;
+	struct g_geom *geom;
+	char argname[16];
+	int *countp, n;
+
+	g_topology_assert();
+
+	countp = gctl_get_paraml(req, "nargs", sizeof(*countp));
+	if (countp == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*countp <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+
+	n = 0;
+	while (n < *countp) {
+		snprintf(argname, sizeof(argname), "arg%d", n);
+		prov = gctl_get_provider(req, argname);
+		if (prov == NULL)
+			return;
+		geom = prov->geom;
+		if (geom->class != mp) {
+			gctl_error(req, "Provider %s is invalid.", prov->name);
+			return;
+		}
+		g_union_revert(geom->softc);
+		if (verbose)
+			printf("Device %s has been reverted.\n", geom->name);
+		G_UNION_DEBUG(1, "Device %s has been reverted.", geom->name);
+		n++;
+	}
+}
+
+/*
+ * Forget all union writes by clearing the write map.
+ *
+ * Under the softc write lock, every leaf of the write map is zeroed and
+ * then the bitmap of leaves in use is zeroed.
+ */
+static void
+g_union_revert(struct g_union_softc *sc)
+{
+	int leafno;
+
+	G_WLOCK(sc);
+	leafno = 0;
+	while (leafno < sc->sc_root_size) {
+		memset(sc->sc_writemap_root[leafno], 0,
+		    sc->sc_leaf_size * sizeof(uint64_t));
+		leafno++;
+	}
+	memset(sc->sc_leafused, 0, roundup(sc->sc_root_size, BITS_PER_ENTRY));
+	G_WUNLOCK(sc);
+}
+
+/*
+ * Commit all the writes made in the top layer to the lower layer.
+ *
+ * The request must carry an "nargs" count, a "force" flag and one
+ * "argN" provider per device.  For each device the write map is walked;
+ * every run of written sectors is read from the upper consumer and
+ * written to the lower consumer in pieces of at most MAXBSIZE bytes.
+ * When the whole map has been copied the write map is cleared.
+ *
+ * A device that cannot be committed (bad argument, commit already in
+ * progress, device or lower provider in use, access upgrade refused) is
+ * skipped with an error recorded on the request and the loop moves on
+ * to the next argument.  An I/O error aborts the commit of that device
+ * without clearing its write map.
+ */
+static void
+g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp, *lowerpp;
+	struct g_consumer *lowercp;
+	struct g_geom *gp;
+	struct bio *bp;
+	char param[16];
+	off_t len2rd, len2wt, savelen;
+	int i, error, *nargs, *force;
+	bool committed;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*nargs <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	force = gctl_get_paraml(req, "force", sizeof(*force));
+	if (force == NULL) {
+		gctl_error(req, "No 'force' argument");
+		return;
+	}
+
+	/* Get a bio buffer to do our I/O; it is reused for every request */
+	bp = g_alloc_bio();
+	bp->bio_data = g_malloc(MAXBSIZE, M_WAITOK);
+	bp->bio_done = biodone;
+	for (i = 0; i < *nargs; i++) {
+		snprintf(param, sizeof(param), "arg%d", i);
+		pp = gctl_get_provider(req, param);
+		if (pp == NULL)
+			continue;
+		gp = pp->geom;
+		if (gp->class != mp) {
+			gctl_error(req, "Provider %s is invalid.", pp->name);
+			continue;
+		}
+		sc = gp->softc;
+		if (g_union_get_writelock(sc) != 0) {
+			gctl_error(req, "Commit already in progress for "
+			    "provider %s.", pp->name);
+			continue;
+		}
+
+		/* upgrade to write access for lower */
+		lowercp = sc->sc_lowercp;
+		lowerpp = lowercp->provider;
+		/*
+		 * No mount or other use of union is allowed, unless the
+		 * -f flag is given which allows read-only mount or usage.
+		 */
+		if ((*force == false && pp->acr > 0) || pp->acw > 0 ||
+		    pp->ace > 0) {
+			gctl_error(req, "Unable to get exclusive access for "
+			    "writing of %s.\n\tNote that %s cannot be mounted "
+			    "or otherwise\n\topen during a commit unless the "
+			    "-f flag is used.", pp->name, pp->name);
+			g_union_rel_writelock(sc);
+			continue;
+		}
+		/*
+		 * No mount or other use of lower media is allowed, unless the
+		 * -f flag is given which allows read-only mount or usage.
+		 */
+		if ((*force == false && lowerpp->acr > lowercp->acr) ||
+		    lowerpp->acw > lowercp->acw ||
+		    lowerpp->ace > lowercp->ace) {
+			gctl_error(req, "provider %s is unable to get "
+			    "exclusive access to %s\n\tfor writing. Note that "
+			    "%s cannot be mounted or otherwise open\n\tduring "
+			    "a commit unless the -f flag is used.", pp->name,
+			    lowerpp->name, lowerpp->name);
+			g_union_rel_writelock(sc);
+			continue;
+		}
+		if ((error = g_access(lowercp, 0, 1, 0)) != 0) {
+			gctl_error(req, "Error %d: provider %s is unable to "
+			    "access %s for writing.", error, pp->name,
+			    lowerpp->name);
+			g_union_rel_writelock(sc);
+			continue;
+		}
+		/* Loop over write map copying across written blocks */
+		committed = false;
+		bp->bio_offset = 0;
+		bp->bio_length = sc->sc_map_size * sc->sc_sectorsize;
+		G_RLOCK(sc);
+		while (bp->bio_length > 0) {
+			if (!g_union_getmap(bp, sc, &len2rd)) {
+				/* not written, so skip */
+				bp->bio_offset += len2rd;
+				bp->bio_length -= len2rd;
+				continue;
+			}
+			G_RUNLOCK(sc);
+			/* need to read then write len2rd sectors */
+			for ( ; len2rd > 0; len2rd -= len2wt) {
+				/* limit ourselves to MAXBSIZE size I/Os */
+				len2wt = len2rd;
+				if (len2wt > MAXBSIZE)
+					len2wt = MAXBSIZE;
+				/* bio_length doubles as "bytes left to scan" */
+				savelen = bp->bio_length;
+				bp->bio_length = len2wt;
+				/*
+				 * Clear BIO_DONE before each request rather
+				 * than after each success, so that a request
+				 * that failed for a previous device cannot
+				 * make biowait() return before this request
+				 * has completed.
+				 */
+				bp->bio_flags &= ~BIO_DONE;
+				bp->bio_cmd = BIO_READ;
+				g_io_request(bp, sc->sc_uppercp);
+				if ((error = biowait(bp, "rdunion")) != 0) {
+					gctl_error(req, "Commit read error %d "
+					    "in provider %s, commit aborted.",
+					    error, pp->name);
+					goto cleanup;
+				}
+				bp->bio_flags &= ~BIO_DONE;
+				bp->bio_cmd = BIO_WRITE;
+				g_io_request(bp, lowercp);
+				if ((error = biowait(bp, "wtunion")) != 0) {
+					gctl_error(req, "Commit write error %d "
+					    "in provider %s, commit aborted.",
+					    error, pp->name);
+					goto cleanup;
+				}
+				bp->bio_offset += len2wt;
+				bp->bio_length = savelen - len2wt;
+			}
+			G_RLOCK(sc);
+		}
+		G_RUNLOCK(sc);
+		/* clear the write map */
+		g_union_revert(sc);
+		committed = true;
+cleanup:
+		/* return lower to previous access */
+		if ((error = g_access(lowercp, 0, -1, 0)) != 0) {
+			G_UNION_DEBUG(2, "Error %d: device %s could not reset "
+			    "access to %s (r=0 w=-1 e=0).", error, pp->name,
+			    lowerpp->name);
+		}
+		g_union_rel_writelock(sc);
+		/* Only claim success when the commit ran to completion */
+		if (committed) {
+			if (verbose)
+				printf("Device %s has been committed.\n",
+				    gp->name);
+			G_UNION_DEBUG(1, "Device %s has been committed.",
+			    gp->name);
+		}
+	}
+	g_free(bp->bio_data);
+	g_destroy_bio(bp);
+}
+
+/*
+ * Access method for the union provider.
+ *
+ * With no softc, only requests that do not raise any count are allowed;
+ * anything else fails with ENXIO.  Otherwise access is granted whenever
+ * the commit write lock can be taken (it is released again at once).
+ * While the lock is held elsewhere, the only change accepted is one
+ * that brings an open provider's summed counts to zero; every other
+ * request fails with EBUSY.
+ */
+static int
+g_union_access(struct g_provider *pp, int r, int w, int e)
+{
+	struct g_union_softc *sc;
+	int newr, neww, newe;
+
+	sc = pp->geom->softc;
+	if (sc == NULL)
+		return ((r <= 0 && w <= 0 && e <= 0) ? 0 : ENXIO);
+
+	/* Counts the provider would have if this request were granted. */
+	newr = r + pp->acr;
+	neww = w + pp->acw;
+	newe = e + pp->ace;
+
+	if (g_union_get_writelock(sc) == 0) {
+		g_union_rel_writelock(sc);
+		return (0);
+	}
+	if ((pp->acr + pp->acw + pp->ace) > 0 && (newr + neww + newe) == 0)
+		return (0);
+	return (EBUSY);
+}
+
+/*
+ * Initiate an I/O operation on the union device.
+ *
+ * The request is cloned and the clone's offset is shifted by sc_offset.
+ * Reads are routed according to the write map: sectors recorded as
+ * written are read from the upper consumer and all others from the
+ * lower consumer, splitting the request into several child requests
+ * when it covers both kinds.  GEOM::kerneldump GETATTR requests are
+ * answered locally.  Every other command is counted and passed to the
+ * upper consumer; writes are first recorded in the write map.
+ */
+static void
+g_union_start(struct bio *bp)
+{
+	struct g_union_softc *sc;
+	struct g_consumer *cp, *firstcp;
+	struct bio *cbp, *firstbp;
+	struct g_union_iotrack *iotrackp;
+	off_t rdlen, len2rd, offset, bufoff;
+	char *level;
+	int iocnt;
+
+	cbp = g_clone_bio(bp);
+	if (cbp == NULL) {
+		g_io_deliver(bp, ENOMEM);
+		return;
+	}
+	sc = bp->bio_to->geom->softc;
+	cbp->bio_offset += sc->sc_offset;
+	cbp->bio_done = g_std_done;
+
+	switch (bp->bio_cmd) {
+	case BIO_READ:
+		/*
+		 * The usual read case is that we either read the top layer
+		 * if the block has been previously written or the bottom layer
+		 * if it has not been written. However, it is possible that
+		 * only part of the block has been written. For example we may
+		 * have written a UFS/FFS file fragment comprising several
+		 * sectors out of an 8-sector block. Here, if the entire
+		 * 8-sector block is read for example by a snapshot needing
+		 * to copy the full block, then we need to read the written
+		 * sectors from the upper level and the unwritten sectors from
+		 * the lower level. We do this by alternately reading from the
+		 * top and bottom layers until we complete the read. As
+		 * requests for partially written blocks are uncommon, we
+		 * make no attempt to optimize the code.
+		 */
+		atomic_add_long(&sc->sc_reads, 1);
+		atomic_add_long(&sc->sc_readbytes, bp->bio_length);
+		rdlen = cbp->bio_length;
+		/* bytes of the caller's buffer already handed to a child */
+		bufoff = 0;
+		iotrackp = NULL;
+		firstbp = NULL;
+		firstcp = NULL;
+		G_RLOCK(sc);
+		for (iocnt = 0; ; iocnt++) {
+			if (g_union_getmap(cbp, sc, &len2rd)) {
+				/* read top */
+				cp = sc->sc_uppercp;
+				level = "upper";
+			} else {
+				/* read bottom */
+				cp = sc->sc_lowercp;
+				level = "lower";
+			}
+			/* Check if only a single read is required */
+			if (iocnt == 0 && rdlen == len2rd) {
+				G_UNION_LOGREQ(cbp, "Sending %jd byte read "
+				    "request to %s level.", len2rd, level);
+				g_io_request(cbp, cp);
+				G_RUNLOCK(sc);
+				return;
+			}
+			/*
+			 * io_numios counts the child requests that have
+			 * not yet passed through g_union_done(); the child
+			 * that drops it to zero frees the tracker.
+			 */
+			if (iocnt == 0) {
+				iotrackp = g_malloc(sizeof(*iotrackp),
+				    M_NOWAIT | M_ZERO);
+				if (iotrackp == NULL) {
+					cbp->bio_error = ENOMEM;
+					g_std_done(cbp);
+					G_RUNLOCK(sc);
+					return;
+				}
+				iotrackp->io_numios = 1;
+			} else {
+				atomic_add_long(&iotrackp->io_numios, 1);
+			}
+			cbp->bio_done = g_union_done;
+			cbp->bio_caller1 = iotrackp;
+			cbp->bio_length = len2rd;
+			/*
+			 * A clone starts out describing the whole of the
+			 * caller's buffer, so advance it past the bytes
+			 * already assigned to earlier children; otherwise
+			 * every piece would land at the start of the buffer.
+			 */
+			if ((cbp->bio_flags & BIO_UNMAPPED) != 0) {
+				cbp->bio_ma_offset += bufoff;
+				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
+				cbp->bio_ma_offset %= PAGE_SIZE;
+				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
+				    cbp->bio_length) / PAGE_SIZE;
+			} else {
+				cbp->bio_data += bufoff;
+			}
+			bufoff += len2rd;
+			offset = cbp->bio_offset + len2rd;
+			rdlen -= len2rd;
+			G_UNION_LOGREQ(cbp, "Sending %jd byte read "
+			    "request to %s level.", len2rd, level);
+			/*
+			 * To avoid prematurely notifying our consumer
+			 * that their I/O has completed, we have to delay
+			 * issuing our first I/O request until we have
+			 * issued all the additional I/O requests.
+			 */
+			if (iocnt > 0) {
+				g_io_request(cbp, cp);
+			} else {
+				firstbp = cbp;
+				firstcp = cp;
+			}
+			/* Stop once the whole request has been covered */
+			if (rdlen == 0)
+				break;
+			/* set up for next read */
+			cbp = g_clone_bio(bp);
+			if (cbp == NULL) {
+				/*
+				 * The failed clone was never counted in
+				 * io_numios, so only record the error; it
+				 * is picked up by g_union_done().
+				 */
+				iotrackp->io_error = ENOMEM;
+				break;
+			}
+			cbp->bio_offset = offset;
+			cbp->bio_length = rdlen;
+			atomic_add_long(&sc->sc_reads, 1);
+		}
+		/* We have issued all our I/O, so start the first one */
+		g_io_request(firstbp, firstcp);
+		G_RUNLOCK(sc);
+		return;
+	case BIO_WRITE:
+		/*
+		 * Writes are always done to the top level. Here, we
+		 * record the blocks that we are writing.
+		 */
+		G_UNION_LOGREQ(cbp, "Sending %jd byte write request to upper "
+		    "level.", cbp->bio_length);
+		atomic_add_long(&sc->sc_writes, 1);
+		atomic_add_long(&sc->sc_wrotebytes, bp->bio_length);
+		g_union_setmap(cbp, sc);
+		break;
+	case BIO_DELETE:
+		G_UNION_LOGREQ(bp, "Delete request received.");
+		atomic_add_long(&sc->sc_deletes, 1);
+		break;
+	case BIO_GETATTR:
+		G_UNION_LOGREQ(bp, "Getattr request received.");
+		atomic_add_long(&sc->sc_getattrs, 1);
+		if (strcmp(bp->bio_attribute, "GEOM::kerneldump") != 0)
+			/* forward the GETATTR to the upper-level device */
+			break;
+		g_union_kerneldump(bp, sc);
+		return;
+	case BIO_FLUSH:
+		G_UNION_LOGREQ(bp, "Flush request received.");
+		atomic_add_long(&sc->sc_flushes, 1);
+		break;
+	case BIO_SPEEDUP:
+		G_UNION_LOGREQ(bp, "Speedup request received.");
+		atomic_add_long(&sc->sc_speedups, 1);
+		break;
+	case BIO_CMD0:
+		G_UNION_LOGREQ(bp, "Cmd0 request received.");
+		atomic_add_long(&sc->sc_cmd0s, 1);
+		break;
+	case BIO_CMD1:
+		G_UNION_LOGREQ(bp, "Cmd1 request received.");
+		atomic_add_long(&sc->sc_cmd1s, 1);
+		break;
+	case BIO_CMD2:
+		G_UNION_LOGREQ(bp, "Cmd2 request received.");
+		atomic_add_long(&sc->sc_cmd2s, 1);
+		break;
+	}
+	/*
+	 * All commands other than read are passed through to the
+	 * upper-level device since it is writable and thus able to
+	 * respond to write, delete, flush, and speedup requests.
+	 */
+	g_io_request(cbp, sc->sc_uppercp);
+}
+
+/*
+ * Completion routine for the child requests of a split BIO_READ.
+ *
+ * Transfers an error recorded in the shared tracker (set when a clone
+ * could not be allocated) to this child unless it already failed, drops
+ * this child's reference on the tracker, frees the tracker when this
+ * was the last reference, and finishes through g_std_done().
+ */
+static void
+g_union_done(struct bio *bp)
+{
+	struct g_union_iotrack *iotrackp;
+
+	iotrackp = bp->bio_caller1;
+	if (iotrackp->io_error != 0 && bp->bio_error == 0)
+		bp->bio_error = iotrackp->io_error;
+	iotrackp->io_error = 0;
+	/*
+	 * atomic_fetchadd_long() returns the value held before the
+	 * addition, so the last outstanding child observes 1.
+	 */
+	if (atomic_fetchadd_long(&iotrackp->io_numios, -1) == 1)
+		g_free(iotrackp);
+	g_std_done(bp);
+}
+
+/*
+ * Record blocks that have been written in the map.
+ *
+ * The request's offset and length must be multiples of the sector
+ * size.  Under the softc write lock, one bit is set in the owning leaf
+ * for every sector covered by the request and the leaf is flagged in
+ * the sc_leafused bitmap, which g_union_getmap() consults to skip
+ * leaves that hold no written sectors.
+ */
+static void
+g_union_setmap(struct bio *bp, struct g_union_softc *sc)
+{
+	size_t root_idx;
+	uint64_t **leaf;
+	uint64_t *wordp;
+	off_t start, numsec;
+
+	KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
+	    ("g_union_setmap: offset not on sector boundry"));
+	KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
+	    ("g_union_setmap: length not a multiple of sectors"));
+	start = bp->bio_offset / sc->sc_sectorsize;
+	numsec = bp->bio_length / sc->sc_sectorsize;
+	/* start + numsec is one past the last sector, so equality is fine */
+	KASSERT(start + numsec <= sc->sc_map_size,
+	    ("g_union_setmap: block %jd is out of range", start + numsec));
+	G_WLOCK(sc);
+	for ( ; numsec > 0; numsec--, start++) {
+		root_idx = start / sc->sc_bits_per_leaf;
+		leaf = &sc->sc_writemap_root[root_idx];
+		wordp = &(*leaf)
+		    [(start % sc->sc_bits_per_leaf) / BITS_PER_ENTRY];
+		*wordp |= 1ULL << (start % BITS_PER_ENTRY);
+		/*
+		 * Use a 64-bit constant: the shift count can reach
+		 * BITS_PER_ENTRY - 1, which overflows a plain int, and
+		 * g_union_getmap() tests this bit with 1ULL.
+		 */
+		sc->sc_leafused[root_idx / BITS_PER_ENTRY] |=
+		    1ULL << (root_idx % BITS_PER_ENTRY);
+	}
+	G_WUNLOCK(sc);
+}
+
+/*
+ * Check map to determine whether blocks have been written.
+ *
+ * Return true if they have been written so should be read from the top
+ * layer. Return false if they have not been written so should be read
+ * from the bottom layer. Return in len2read the bytes to be read. See
+ * the comment above the BIO_READ implementation in g_union_start() for
+ * an explanation of why len2read may be shorter than the buffer length.
+ *
+ * The state of the first sector decides the return value; the scan
+ * then continues until a sector in the other state, or the end of the
+ * request, is reached.  For a zero-length request the function returns
+ * false with *len2read set to 0.
+ */
+static bool
+g_union_getmap(struct bio *bp, struct g_union_softc *sc, off_t *len2read)
+{
+	off_t start, numsec, leafresid, bitloc;
+	bool first, maptype, retval;
+	uint64_t *leaf, word;
+	size_t root_idx;
+
+	KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
+	    ("g_union_getmap: offset not on sector boundry"));
+	KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
+	    ("g_union_getmap: length not a multiple of sectors"));
+	start = bp->bio_offset / sc->sc_sectorsize;
+	numsec = bp->bio_length / sc->sc_sectorsize;
+	G_UNION_DEBUG(3, "g_union_getmap: check %jd sectors starting at %jd\n",
+	    numsec, start);
+	KASSERT(start + numsec <= sc->sc_map_size,
+	    ("g_union_getmap: block %jd is out of range", start + numsec));
+	/* Defined result when the loop below never runs (numsec == 0) */
+	maptype = false;
+	first = true;
+	while (numsec > 0) {
+		/* Check first if the leaf records any written sectors */
+		root_idx = start / sc->sc_bits_per_leaf;
+		leafresid = sc->sc_bits_per_leaf -
+		    (start % sc->sc_bits_per_leaf);
+		if (((sc->sc_leafused[root_idx / BITS_PER_ENTRY]) &
+		    (1ULL << (root_idx % BITS_PER_ENTRY))) == 0) {
+			if (first) {
+				maptype = false;
+				first = false;
+			}
+			if (maptype)
+				break;
+			/* Skip the rest of this leaf; may overshoot */
+			numsec -= leafresid;
+			start += leafresid;
+			continue;
+		}
+		/* Check up to a word boundary, then check word by word */
+		leaf = sc->sc_writemap_root[root_idx];
+		word = leaf[(start % sc->sc_bits_per_leaf) / BITS_PER_ENTRY];
+		bitloc = start % BITS_PER_ENTRY;
+		if (bitloc == 0 && (word == 0 || word == ~0)) {
+			if (first) {
+				if (word == 0)
+					maptype = false;
+				else
+					maptype = true;
+				first = false;
+			}
+			if ((word == 0 && maptype) ||
+			    (word == ~0 && !maptype))
+				break;
+			/* Whole word matches; may overshoot */
+			numsec -= BITS_PER_ENTRY;
+			start += BITS_PER_ENTRY;
+			continue;
+		}
+		for ( ; bitloc < BITS_PER_ENTRY; bitloc ++) {
+			retval = (word & (1ULL << bitloc)) != 0;
+			if (first) {
+				maptype = retval;
+				first = false;
+			}
+			if (maptype == retval) {
+				numsec--;
+				start++;
+				continue;
+			}
+			goto out;
+		}
+	}
+out:
+	/* Skipping by leaf or word can run past the end of the request */
+	if (numsec < 0)
+		numsec = 0;
+	*len2read = bp->bio_length - (numsec * sc->sc_sectorsize);
+	G_UNION_DEBUG(3, "g_union_getmap: return maptype %swritten for %jd "
+	    "sectors ending at %jd\n", maptype ? "" : "NOT ",
+	    *len2read / sc->sc_sectorsize, start - 1);
+	return (maptype);
+}
+
+/*
+ * Answer a GEOM::kerneldump BIO_GETATTR request.
+ *
+ * Fills in the dumper description carried in the request's data buffer
+ * and completes the request.  The request fails with ENODEV when the
+ * requested offset lies beyond sc_size; a length that would run past
+ * sc_size is trimmed to fit.
+ */
+static void
+g_union_kerneldump(struct bio *bp, struct g_union_softc *sc)
+{
+	struct g_kerneldump *dumpreq;
+	struct g_geom *geom;
+	struct g_provider *prov;
+
+	dumpreq = (struct g_kerneldump *)bp->bio_data;
+	geom = bp->bio_to->geom;
+	g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, geom->name,
+	    (intmax_t)dumpreq->offset, (intmax_t)dumpreq->length);
+
+	prov = LIST_FIRST(&geom->provider);
+
+	/* These fields are filled in even when the request is rejected. */
+	dumpreq->di.dumper = g_union_dumper;
+	dumpreq->di.priv = sc;
+	dumpreq->di.blocksize = prov->sectorsize;
+	dumpreq->di.maxiosize = DFLTPHYS;
+	dumpreq->di.mediaoffset = sc->sc_offset + dumpreq->offset;
+
+	if (dumpreq->offset <= sc->sc_size) {
+		if (dumpreq->offset + dumpreq->length > sc->sc_size)
+			dumpreq->length = sc->sc_size - dumpreq->offset;
+		dumpreq->di.mediasize = dumpreq->length;
+		g_io_deliver(bp, 0);
+	} else {
+		g_io_deliver(bp, ENODEV);
+	}
+}
+
+/*
+ * Dump routine installed as di.dumper by g_union_kerneldump().
+ *
+ * This is a stub: all five arguments are ignored, nothing is written,
+ * and 0 is always returned.
+ * NOTE(review): presumably 0 is reported as success to the dump code,
+ * in which case dump data sent here is silently discarded - confirm
+ * that this is intended.
+ */
+static int
+g_union_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset,
+ size_t length)
+{
+
+ return (0);
+}
+
+/*
+ * List union statistics.
+ *
+ * Output is produced only for the geom itself (pp and cp both NULL) and
+ * only while the geom still has a softc.  Each counter is written to
+ * the sbuf once, as an XML element prefixed by "indent", followed by
+ * the configured offset.
+ */
+static void
+g_union_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+	struct g_union_softc *sc;
+
+	if (pp != NULL || cp != NULL || gp->softc == NULL)
+		return;
+	sc = gp->softc;
+	/* %ju expects uintmax_t, so widen each counter explicitly */
+	sbuf_printf(sb, "%s<Reads>%ju</Reads>\n", indent,
+	    (uintmax_t)sc->sc_reads);
+	sbuf_printf(sb, "%s<Writes>%ju</Writes>\n", indent,
+	    (uintmax_t)sc->sc_writes);
+	sbuf_printf(sb, "%s<Deletes>%ju</Deletes>\n", indent,
+	    (uintmax_t)sc->sc_deletes);
+	sbuf_printf(sb, "%s<Getattrs>%ju</Getattrs>\n", indent,
+	    (uintmax_t)sc->sc_getattrs);
+	sbuf_printf(sb, "%s<Flushes>%ju</Flushes>\n", indent,
+	    (uintmax_t)sc->sc_flushes);
+	sbuf_printf(sb, "%s<Speedups>%ju</Speedups>\n", indent,
+	    (uintmax_t)sc->sc_speedups);
+	sbuf_printf(sb, "%s<Cmd0s>%ju</Cmd0s>\n", indent,
+	    (uintmax_t)sc->sc_cmd0s);
+	sbuf_printf(sb, "%s<Cmd1s>%ju</Cmd1s>\n", indent,
+	    (uintmax_t)sc->sc_cmd1s);
+	sbuf_printf(sb, "%s<Cmd2s>%ju</Cmd2s>\n", indent,
+	    (uintmax_t)sc->sc_cmd2s);
+	sbuf_printf(sb, "%s<ReadBytes>%ju</ReadBytes>\n", indent,
+	    (uintmax_t)sc->sc_readbytes);
+	sbuf_printf(sb, "%s<WroteBytes>%ju</WroteBytes>\n", indent,
+	    (uintmax_t)sc->sc_wrotebytes);
+	sbuf_printf(sb, "%s<Offset>%jd</Offset>\n", indent,
+	    (intmax_t)sc->sc_offset);
+}
+
+/*
+ * Clean up an orphaned geom.
+ *
+ * Asserts that the topology lock is held, then destroys the geom that
+ * owns consumer cp with force set (1) and verbose off.  The result of
+ * g_union_destroy() is ignored.
+ */
+static void
+g_union_orphan(struct g_consumer *cp)
+{
+
+ g_topology_assert();
+ g_union_destroy(cp->geom, 1, false);
+}
+
+/*
+ * Clean up a union geom.
+ *
+ * Requests a non-forced, non-verbose destroy of gp and returns the
+ * result of g_union_destroy() (0 on success, otherwise an errno value
+ * such as EBUSY when the device is still in use).  The req and mp
+ * arguments are not used.
+ */
+static int
+g_union_destroy_geom(struct gctl_req *req, struct g_class *mp,
+ struct g_geom *gp)
+{
+
+ return (g_union_destroy(gp, 0, false));
+}
+
+/*
+ * Clean up a union device.
+ *
+ * Returns ENXIO when the geom has no softc.  When a commit is in
+ * progress or the provider is open, the device is destroyed only if
+ * "force" is set; otherwise EBUSY is returned.  Destruction drops the
+ * access counts held on the lower and upper consumers and withers the
+ * geom; 0 is returned.  With "verbose" set the outcome is also printed.
+ */
+static int
+g_union_destroy(struct g_geom *gp, bool force, bool verbose)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp;
+	const char *name;
+	int acr, acw, ace, error;
+
+	g_topology_assert();
+	sc = gp->softc;
+	if (sc == NULL)
+		return (ENXIO);
+	/*
+	 * The provider may be absent, so take what the messages below
+	 * need from it once, falling back to the geom name and zero
+	 * counts, instead of dereferencing it unconditionally.
+	 */
+	pp = LIST_FIRST(&gp->provider);
+	if (pp != NULL) {
+		name = pp->name;
+		acr = pp->acr;
+		acw = pp->acw;
+		ace = pp->ace;
+	} else {
+		name = gp->name;
+		acr = acw = ace = 0;
+	}
+	if ((sc->sc_flags & DOING_COMMIT) != 0 ||
+	    acr != 0 || acw != 0 || ace != 0) {
+		if (force) {
+			if (verbose)
+				printf("Device %s is still in use, so "
+				    "is being forcibly removed.\n", name);
+			G_UNION_DEBUG(1, "Device %s is still in use, so "
+			    "is being forcibly removed.", name);
+		} else {
+			if (verbose)
+				printf("Device %s is still open "
+				    "(r=%d w=%d e=%d).\n", name, acr,
+				    acw, ace);
+			G_UNION_DEBUG(1, "Device %s is still open "
+			    "(r=%d w=%d e=%d).", name, acr,
+			    acw, ace);
+			return (EBUSY);
+		}
+	} else {
+		if (verbose)
+			printf("Device %s removed.\n", gp->name);
+		G_UNION_DEBUG(1, "Device %s removed.", gp->name);
+	}
+	/* Close consumers */
+	if ((error = g_access(sc->sc_lowercp, -1, 0, -1)) != 0)
+		G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
+		    "to %s.", error, name, sc->sc_lowercp->provider->name);
+	if ((error = g_access(sc->sc_uppercp, -1, -1, -1)) != 0)
+		G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
+		    "to %s.", error, name, sc->sc_uppercp->provider->name);
+
+	g_wither_geom(gp, ENXIO);
+
+	return (0);
+}
+
+/*
+ * Clean up a union provider.
+ *
+ * Detaches the softc from the geom and releases everything it owns:
+ * each leaf of the write map, the root array, the bitmap of leaves in
+ * use, the lock and finally the softc itself.
+ */
+static void
+g_union_providergone(struct g_provider *pp)
+{
+	struct g_geom *gp;
+	struct g_union_softc *sc;
+	size_t i;
+
+	gp = pp->geom;
+	sc = gp->softc;
+	gp->softc = NULL;
+	for (i = 0; i < sc->sc_root_size; i++)
+		g_free(sc->sc_writemap_root[i]);
+	g_free(sc->sc_writemap_root);
+	/* Allocated together with the write map; free it with it */
+	g_free(sc->sc_leafused);
+	rw_destroy(&sc->sc_rwlock);
+	g_free(sc);
+}
+
+/*
+ * Respond to a resized provider.
+ *
+ * The union keeps using the sc_size bytes that start at sc_offset, so a
+ * provider that can still hold that range needs no action.  When the
+ * provider has become too small for it, the union is forcibly
+ * destroyed.
+ */
+static void
+g_union_resize(struct g_consumer *cp)
+{
+	struct g_union_softc *sc;
+	struct g_geom *gp;
+
+	g_topology_assert();
+
+	gp = cp->geom;
+	sc = gp->softc;
+
+	/*
+	 * If the provider is still at least as large as the space we
+	 * use, ignore the change and just keep using the space we
+	 * already had.  An exact fit is still a fit.
+	 */
+	if (sc->sc_size <= cp->provider->mediasize - sc->sc_offset)
+		return;
+	g_union_destroy(gp, 1, false);
+}
+
+/*
+ * Module glue: declare the union GEOM class and the version (0) of the
+ * geom_union module.
+ * NOTE(review): g_union_class itself is defined outside this part of
+ * the file - presumably it wires up the methods defined above; confirm.
+ */
+DECLARE_GEOM_CLASS(g_union_class, g_union);
+MODULE_VERSION(geom_union, 0);
diff --git a/sys/modules/geom/Makefile b/sys/modules/geom/Makefile
--- a/sys/modules/geom/Makefile
+++ b/sys/modules/geom/Makefile
@@ -21,6 +21,7 @@
geom_raid3 \
geom_shsec \
geom_stripe \
+ geom_union \
geom_uzip \
geom_vinum \
geom_virstor \
diff --git a/sys/modules/geom/geom_union/Makefile b/sys/modules/geom/geom_union/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/geom/geom_union/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/geom/union
+
+KMOD= geom_union
+SRCS= g_union.c
+
+.include <bsd.kmod.mk>

File Metadata

Mime Type
text/plain
Expires
Tue, Apr 29, 8:32 AM (16 h, 31 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
17838951
Default Alt Text
D32697.id97787.diff (55 KB)

Event Timeline