D41483.id.diff
diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile
--- a/cddl/lib/libzpool/Makefile
+++ b/cddl/lib/libzpool/Makefile
@@ -135,6 +135,7 @@
uberblock.c \
unique.c \
vdev.c \
+ vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_file.c \
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -326,6 +326,7 @@
contrib/openzfs/module/zfs/uberblock.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/unique.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_draid.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_draid_rand.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}"
diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk
--- a/sys/conf/kern.pre.mk
+++ b/sys/conf/kern.pre.mk
@@ -252,7 +252,8 @@
# Special flags for managing the compat compiles for ZFS
ZFS_CFLAGS+= -I$S/contrib/openzfs/module/icp/include \
${CDDL_CFLAGS} -DBUILDING_ZFS -DHAVE_UIO_ZEROCOPY \
- -DWITH_NETDUMP -D__KERNEL__ -D_SYS_CONDVAR_H_ -DSMP
+ -DWITH_NETDUMP -D__KERNEL__ -D_SYS_CONDVAR_H_ -DSMP \
+ -DIN_FREEBSD_BASE
.if ${MACHINE_ARCH} == "amd64"
ZFS_CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \
diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META
--- a/sys/contrib/openzfs/META
+++ b/sys/contrib/openzfs/META
@@ -1,10 +1,10 @@
Meta: 1
Name: zfs
Branch: 1.0
-Version: 2.2.0
-Release: rc1
+Version: 2.1.99
+Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
-Linux-Maximum: 6.3
+Linux-Maximum: 6.2
Linux-Minimum: 3.10
diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary
--- a/sys/contrib/openzfs/cmd/arc_summary
+++ b/sys/contrib/openzfs/cmd/arc_summary
@@ -64,6 +64,7 @@
SECTION_PATHS = {'arc': 'arcstats',
'dmu': 'dmu_tx',
'l2arc': 'arcstats', # L2ARC stuff lives in arcstats
+ 'vdev': 'vdev_cache_stats',
'zfetch': 'zfetchstats',
'zil': 'zil'}
@@ -89,6 +90,8 @@
# Requires py36-sysctl on FreeBSD
import sysctl
+ VDEV_CACHE_SIZE = 'vdev.cache_size'
+
def is_value(ctl):
return ctl.type != sysctl.CTLTYPE_NODE
@@ -132,6 +135,8 @@
SPL_PATH = '/sys/module/spl/parameters'
TUNABLES_PATH = '/sys/module/zfs/parameters'
+ VDEV_CACHE_SIZE = 'zfs_vdev_cache_size'
+
def load_kstats(section):
path = os.path.join(KSTAT_PATH, section)
with open(path) as f:
@@ -837,8 +842,7 @@
('Free on write:', 'l2_free_on_write'),
('R/W clashes:', 'l2_rw_clash'),
('Bad checksums:', 'l2_cksum_bad'),
- ('Read errors:', 'l2_io_error'),
- ('Write errors:', 'l2_writes_error'))
+ ('I/O errors:', 'l2_io_error'))
for title, value in l2_todo:
prt_i1(title, f_hits(arc_stats[value]))
@@ -874,20 +878,28 @@
prt_i2('Miss ratio:',
f_perc(arc_stats['l2_misses'], l2_access_total),
f_hits(arc_stats['l2_misses']))
+ prt_i1('Feeds:', f_hits(arc_stats['l2_feeds']))
print()
- print('L2ARC I/O:')
- prt_i2('Reads:',
- f_bytes(arc_stats['l2_read_bytes']),
- f_hits(arc_stats['l2_hits']))
- prt_i2('Writes:',
- f_bytes(arc_stats['l2_write_bytes']),
- f_hits(arc_stats['l2_writes_sent']))
+ print('L2ARC writes:')
+
+ if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']:
+ prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent']))
+ prt_i2('Done ratio:',
+ f_perc(arc_stats['l2_writes_done'],
+ arc_stats['l2_writes_sent']),
+ f_hits(arc_stats['l2_writes_done']))
+ prt_i2('Error ratio:',
+ f_perc(arc_stats['l2_writes_error'],
+ arc_stats['l2_writes_sent']),
+ f_hits(arc_stats['l2_writes_error']))
+ else:
+ prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent']))
print()
print('L2ARC evicts:')
- prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached']))
- prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading']))
+ prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry']))
+ prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading']))
print()
@@ -947,6 +959,35 @@
print()
+def section_vdev(kstats_dict):
+ """Collect information on VDEV caches"""
+
+ # Currently [Nov 2017] the VDEV cache is disabled, because it is actually
+ # harmful. When this is the case, we just skip the whole entry. See
+ # https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c
+ # for details
+ tunables = get_vdev_params()
+
+ if tunables[VDEV_CACHE_SIZE] == '0':
+ print('VDEV cache disabled, skipping section\n')
+ return
+
+ vdev_stats = isolate_section('vdev_cache_stats', kstats_dict)
+
+ vdev_cache_total = int(vdev_stats['hits']) +\
+ int(vdev_stats['misses']) +\
+ int(vdev_stats['delegations'])
+
+ prt_1('VDEV cache summary:', f_hits(vdev_cache_total))
+ prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total),
+ f_hits(vdev_stats['hits']))
+ prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total),
+ f_hits(vdev_stats['misses']))
+ prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total),
+ f_hits(vdev_stats['delegations']))
+ print()
+
+
def section_zil(kstats_dict):
"""Collect information on the ZFS Intent Log. Some of the information
taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h
@@ -974,6 +1015,7 @@
'l2arc': section_l2arc,
'spl': section_spl,
'tunables': section_tunables,
+ 'vdev': section_vdev,
'zil': section_zil}
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c
--- a/sys/contrib/openzfs/cmd/zdb/zdb.c
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.c
@@ -33,7 +33,6 @@
* under sponsorship from the FreeBSD Foundation.
* Copyright (c) 2021 Allan Jude
* Copyright (c) 2021 Toomas Soome <tsoome@me.com>
- * Copyright (c) 2023, Klara Inc.
*/
#include <stdio.h>
@@ -327,7 +326,7 @@
int err;
struct sublivelist_verify *sv = args;
- zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
+ zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
sizeof (sublivelist_verify_block_refcnt_t));
err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
@@ -391,7 +390,7 @@
{
(void) args;
sublivelist_verify_t sv;
- zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
sizeof (sublivelist_verify_block_t));
int err = sublivelist_verify_func(&sv, dle);
zfs_btree_clear(&sv.sv_leftover);
@@ -683,7 +682,7 @@
(void) printf("Verifying deleted livelist entries\n");
sublivelist_verify_t sv;
- zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
sizeof (sublivelist_verify_block_t));
iterate_deleted_livelists(spa, livelist_verify, &sv);
@@ -717,7 +716,7 @@
mv.mv_start = m->ms_start;
mv.mv_end = m->ms_start + m->ms_size;
zfs_btree_create(&mv.mv_livelist_allocs,
- livelist_block_compare, NULL,
+ livelist_block_compare,
sizeof (sublivelist_verify_block_t));
mv_populate_livelist_allocs(&mv, &sv);
@@ -790,11 +789,8 @@
"\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
"\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
"\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
- "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
- "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
- "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
"\t%s [-v] <bookmark>\n"
- "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
+ "\t%s -C [-A] [-U <cache>]\n"
"\t%s -l [-Aqu] <device>\n"
"\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
"[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
@@ -806,7 +802,7 @@
"\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
"<poolname>\n\n",
cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
- cmdname, cmdname, cmdname, cmdname, cmdname);
+ cmdname, cmdname, cmdname, cmdname);
(void) fprintf(stderr, " Dataset name must include at least one "
"separator character '/' or '@'\n");
@@ -829,8 +825,6 @@
(void) fprintf(stderr, " Options to control amount of output:\n");
(void) fprintf(stderr, " -b --block-stats "
"block statistics\n");
- (void) fprintf(stderr, " -B --backup "
- "backup stream\n");
(void) fprintf(stderr, " -c --checksum "
"checksum all metadata (twice for all data) blocks\n");
(void) fprintf(stderr, " -C --config "
@@ -4881,81 +4875,6 @@
return (err);
}
-static int
-dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
-{
- const char *p = (const char *)buf;
- ssize_t nwritten;
-
- (void) os;
- (void) arg;
-
- /* Write the data out, handling short writes and signals. */
- while ((nwritten = write(STDOUT_FILENO, p, len)) < len) {
- if (nwritten < 0) {
- if (errno == EINTR)
- continue;
- return (errno);
- }
- p += nwritten;
- len -= nwritten;
- }
-
- return (0);
-}
-
-static void
-dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
-{
- boolean_t embed = B_FALSE;
- boolean_t large_block = B_FALSE;
- boolean_t compress = B_FALSE;
- boolean_t raw = B_FALSE;
-
- const char *c;
- for (c = flagstr; c != NULL && *c != '\0'; c++) {
- switch (*c) {
- case 'e':
- embed = B_TRUE;
- break;
- case 'L':
- large_block = B_TRUE;
- break;
- case 'c':
- compress = B_TRUE;
- break;
- case 'w':
- raw = B_TRUE;
- break;
- default:
- fprintf(stderr, "dump_backup: invalid flag "
- "'%c'\n", *c);
- return;
- }
- }
-
- if (isatty(STDOUT_FILENO)) {
- fprintf(stderr, "dump_backup: stream cannot be written "
- "to a terminal\n");
- return;
- }
-
- offset_t off = 0;
- dmu_send_outparams_t out = {
- .dso_outfunc = dump_backup_bytes,
- .dso_dryrun = B_FALSE,
- };
-
- int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
- large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
- &off, &out);
- if (err != 0) {
- fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
- strerror(err));
- return;
- }
-}
-
static int
zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
{
@@ -8546,9 +8465,9 @@
*/
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
- NULL, NULL));
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+ ZIO_FLAG_OPTIONAL, NULL, NULL));
}
error = zio_wait(zio);
@@ -8642,6 +8561,7 @@
zio_nowait(zio_vdev_child_io(czio, bp, vd,
offset, pabd, psize, ZIO_TYPE_READ,
ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
@@ -8775,7 +8695,6 @@
struct option long_options[] = {
{"ignore-assertions", no_argument, NULL, 'A'},
{"block-stats", no_argument, NULL, 'b'},
- {"backup", no_argument, NULL, 'B'},
{"checksum", no_argument, NULL, 'c'},
{"config", no_argument, NULL, 'C'},
{"datasets", no_argument, NULL, 'd'},
@@ -8817,11 +8736,10 @@
};
while ((c = getopt_long(argc, argv,
- "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
+ "AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
long_options, NULL)) != -1) {
switch (c) {
case 'b':
- case 'B':
case 'c':
case 'C':
case 'd':
@@ -8969,7 +8887,7 @@
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
- if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
+ if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL)
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
@@ -9155,8 +9073,7 @@
checkpoint_pool, error);
}
- } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
- objset_id == 0) {
+ } else if (target_is_spa || dump_opt['R'] || objset_id == 0) {
zdb_set_skip_mmp(target);
error = spa_open_rewind(target, &spa, FTAG, policy,
NULL);
@@ -9292,10 +9209,7 @@
strerror(errno));
}
}
- if (dump_opt['B']) {
- dump_backup(target, objset_id,
- argc > 0 ? argv[0] : NULL);
- } else if (os != NULL) {
+ if (os != NULL) {
dump_objset(os);
} else if (zopt_object_args > 0 && !dump_opt['m']) {
dump_objset(spa->spa_meta_objset);
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c
--- a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c
@@ -369,7 +369,9 @@
return (NULL);
}
- if ((event = list_remove_head(&agent_events)) != NULL) {
+ if ((event = (list_head(&agent_events))) != NULL) {
+ list_remove(&agent_events, event);
+
(void) pthread_mutex_unlock(&agent_lock);
/* dispatch to all event subscribers */
@@ -432,7 +434,8 @@
(void) pthread_join(g_agents_tid, NULL);
/* drain any pending events */
- while ((event = list_remove_head(&agent_events)) != NULL) {
+ while ((event = (list_head(&agent_events))) != NULL) {
+ list_remove(&agent_events, event);
nvlist_free(event->ae_nvl);
free(event);
}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c
--- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c
@@ -1288,14 +1288,17 @@
tpool_destroy(g_tpool);
}
- while ((pool = list_remove_head(&g_pool_list)) != NULL) {
+ while ((pool = (list_head(&g_pool_list))) != NULL) {
+ list_remove(&g_pool_list, pool);
zpool_close(pool->uap_zhp);
free(pool);
}
list_destroy(&g_pool_list);
- while ((device = list_remove_head(&g_device_list)) != NULL)
+ while ((device = (list_head(&g_device_list))) != NULL) {
+ list_remove(&g_device_list, device);
free(device);
+ }
list_destroy(&g_device_list);
libzfs_fini(g_zfshdl);
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
--- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
@@ -6057,8 +6057,8 @@
if (p != NULL)
rid = p->pw_uid;
else if (*endch != '\0') {
- (void) snprintf(errbuf, sizeof (errbuf),
- gettext("invalid user %s\n"), curr);
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid user %s\n"), curr);
allow_usage(un, B_TRUE, errbuf);
}
} else if (opts->group) {
@@ -6071,9 +6071,8 @@
if (g != NULL)
rid = g->gr_gid;
else if (*endch != '\0') {
- (void) snprintf(errbuf, sizeof (errbuf),
- gettext("invalid group %s\n"),
- curr);
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid group %s\n"), curr);
allow_usage(un, B_TRUE, errbuf);
}
} else {
@@ -6098,9 +6097,8 @@
who_type = ZFS_DELEG_GROUP;
rid = g->gr_gid;
} else {
- (void) snprintf(errbuf, sizeof (errbuf),
- gettext("invalid user/group %s\n"),
- curr);
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid user/group %s\n"), curr);
allow_usage(un, B_TRUE, errbuf);
}
}
diff --git a/sys/contrib/openzfs/cmd/zilstat.in b/sys/contrib/openzfs/cmd/zilstat.in
--- a/sys/contrib/openzfs/cmd/zilstat.in
+++ b/sys/contrib/openzfs/cmd/zilstat.in
@@ -36,49 +36,31 @@
from argparse import RawTextHelpFormatter
cols = {
- # hdr: [size, scale, kstat name]
+ # hdr: [size, scale, kstat name]
"time": [8, -1, "time"],
"pool": [12, -1, "pool"],
"ds": [12, -1, "dataset_name"],
"obj": [12, -1, "objset"],
- "cc": [5, 1000, "zil_commit_count"],
- "cwc": [5, 1000, "zil_commit_writer_count"],
- "ic": [5, 1000, "zil_itx_count"],
- "iic": [5, 1000, "zil_itx_indirect_count"],
- "iib": [5, 1024, "zil_itx_indirect_bytes"],
- "icc": [5, 1000, "zil_itx_copied_count"],
- "icb": [5, 1024, "zil_itx_copied_bytes"],
- "inc": [5, 1000, "zil_itx_needcopy_count"],
- "inb": [5, 1024, "zil_itx_needcopy_bytes"],
- "idc": [5, 1000, "icc+inc"],
- "idb": [5, 1024, "icb+inb"],
- "iwc": [5, 1000, "iic+idc"],
- "iwb": [5, 1024, "iib+idb"],
- "imnc": [6, 1000, "zil_itx_metaslab_normal_count"],
- "imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"],
- "imnw": [6, 1024, "zil_itx_metaslab_normal_write"],
- "imna": [6, 1024, "zil_itx_metaslab_normal_alloc"],
- "imsc": [6, 1000, "zil_itx_metaslab_slog_count"],
- "imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"],
- "imsw": [6, 1024, "zil_itx_metaslab_slog_write"],
- "imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"],
- "imc": [5, 1000, "imnc+imsc"],
- "imb": [5, 1024, "imnb+imsb"],
- "imw": [5, 1024, "imnw+imsw"],
- "ima": [5, 1024, "imna+imsa"],
- "se%": [3, 100, "imb/ima"],
- "sen%": [4, 100, "imnb/imna"],
- "ses%": [4, 100, "imsb/imsa"],
- "te%": [3, 100, "imb/imw"],
- "ten%": [4, 100, "imnb/imnw"],
- "tes%": [4, 100, "imsb/imsw"],
+ "zcc": [10, 1000, "zil_commit_count"],
+ "zcwc": [10, 1000, "zil_commit_writer_count"],
+ "ziic": [10, 1000, "zil_itx_indirect_count"],
+ "zic": [10, 1000, "zil_itx_count"],
+ "ziib": [10, 1024, "zil_itx_indirect_bytes"],
+ "zicc": [10, 1000, "zil_itx_copied_count"],
+ "zicb": [10, 1024, "zil_itx_copied_bytes"],
+ "zinc": [10, 1000, "zil_itx_needcopy_count"],
+ "zinb": [10, 1024, "zil_itx_needcopy_bytes"],
+ "zimnc": [10, 1000, "zil_itx_metaslab_normal_count"],
+ "zimnb": [10, 1024, "zil_itx_metaslab_normal_bytes"],
+ "zimsc": [10, 1000, "zil_itx_metaslab_slog_count"],
+ "zimsb": [10, 1024, "zil_itx_metaslab_slog_bytes"],
}
-hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib",
- "imnc", "imnw", "imsc", "imsw"]
+hdr = ["time", "pool", "ds", "obj", "zcc", "zcwc", "ziic", "zic", "ziib", \
+ "zicc", "zicb", "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"]
-ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib",
- "imnc", "imnw", "imsc", "imsw"]
+ghdr = ["time", "zcc", "zcwc", "ziic", "zic", "ziib", "zicc", "zicb",
+ "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"]
cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]")
@@ -123,7 +105,7 @@
global sep
for col in hdr:
new_col = col
- if interval > 0 and cols[col][1] > 100:
+ if interval > 0 and col not in ['time', 'pool', 'ds', 'obj']:
new_col += "/s"
sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep))
sys.stdout.write("\n")
@@ -133,7 +115,7 @@
global sep
for col in hdr:
val = v[cols[col][2]]
- if interval > 0 and cols[col][1] > 100:
+ if col not in ['time', 'pool', 'ds', 'obj'] and interval > 0:
val = v[cols[col][2]] // interval
sys.stdout.write("%s%s" % (
prettynum(cols[col][0], cols[col][1], val), sep))
@@ -255,7 +237,9 @@
invalid = []
for ele in hdr:
- if ele not in cols:
+ if gFlag and ele not in ghdr:
+ invalid.append(ele)
+ elif ele not in cols:
invalid.append(ele)
if len(invalid) > 0:
@@ -419,17 +403,17 @@
diff = copy.deepcopy(curr)
for pool in curr:
for objset in curr[pool]:
- for key in curr[pool][objset]:
- if not isinstance(diff[pool][objset][key], int):
- continue
- # If prev is NULL, this is the
- # first time we are here
- if not prev:
- diff[pool][objset][key] = 0
- else:
- diff[pool][objset][key] \
- = curr[pool][objset][key] \
- - prev[pool][objset][key]
+ for col in hdr:
+ if col not in ['time', 'pool', 'ds', 'obj']:
+ key = cols[col][2]
+ # If prev is NULL, this is the
+ # first time we are here
+ if not prev:
+ diff[pool][objset][key] = 0
+ else:
+ diff[pool][objset][key] \
+ = curr[pool][objset][key] \
+ - prev[pool][objset][key]
def zil_build_dict(pool = "GLOBAL"):
global kstat
@@ -441,77 +425,10 @@
if objset not in curr[pool]:
curr[pool][objset] = dict()
curr[pool][objset][key] = val
-
-def zil_extend_dict():
- global diff
- for pool in diff:
- for objset in diff[pool]:
- diff[pool][objset]["pool"] = pool
- diff[pool][objset]["objset"] = objset
- diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \
- time.localtime())
- diff[pool][objset]["icc+inc"] = \
- diff[pool][objset]["zil_itx_copied_count"] + \
- diff[pool][objset]["zil_itx_needcopy_count"]
- diff[pool][objset]["icb+inb"] = \
- diff[pool][objset]["zil_itx_copied_bytes"] + \
- diff[pool][objset]["zil_itx_needcopy_bytes"]
- diff[pool][objset]["iic+idc"] = \
- diff[pool][objset]["zil_itx_indirect_count"] + \
- diff[pool][objset]["zil_itx_copied_count"] + \
- diff[pool][objset]["zil_itx_needcopy_count"]
- diff[pool][objset]["iib+idb"] = \
- diff[pool][objset]["zil_itx_indirect_bytes"] + \
- diff[pool][objset]["zil_itx_copied_bytes"] + \
- diff[pool][objset]["zil_itx_needcopy_bytes"]
- diff[pool][objset]["imnc+imsc"] = \
- diff[pool][objset]["zil_itx_metaslab_normal_count"] + \
- diff[pool][objset]["zil_itx_metaslab_slog_count"]
- diff[pool][objset]["imnb+imsb"] = \
- diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \
- diff[pool][objset]["zil_itx_metaslab_slog_bytes"]
- diff[pool][objset]["imnw+imsw"] = \
- diff[pool][objset]["zil_itx_metaslab_normal_write"] + \
- diff[pool][objset]["zil_itx_metaslab_slog_write"]
- diff[pool][objset]["imna+imsa"] = \
- diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \
- diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
- if diff[pool][objset]["imna+imsa"] > 0:
- diff[pool][objset]["imb/ima"] = 100 * \
- diff[pool][objset]["imnb+imsb"] // \
- diff[pool][objset]["imna+imsa"]
- else:
- diff[pool][objset]["imb/ima"] = 100
- if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
- diff[pool][objset]["imnb/imna"] = 100 * \
- diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
- diff[pool][objset]["zil_itx_metaslab_normal_alloc"]
- else:
- diff[pool][objset]["imnb/imna"] = 100
- if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
- diff[pool][objset]["imsb/imsa"] = 100 * \
- diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
- diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
- else:
- diff[pool][objset]["imsb/imsa"] = 100
- if diff[pool][objset]["imnw+imsw"] > 0:
- diff[pool][objset]["imb/imw"] = 100 * \
- diff[pool][objset]["imnb+imsb"] // \
- diff[pool][objset]["imnw+imsw"]
- else:
- diff[pool][objset]["imb/imw"] = 100
- if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
- diff[pool][objset]["imnb/imnw"] = 100 * \
- diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
- diff[pool][objset]["zil_itx_metaslab_normal_write"]
- else:
- diff[pool][objset]["imnb/imnw"] = 100
- if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
- diff[pool][objset]["imsb/imsw"] = 100 * \
- diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
- diff[pool][objset]["zil_itx_metaslab_slog_write"]
- else:
- diff[pool][objset]["imsb/imsw"] = 100
+ curr[pool][objset]["pool"] = pool
+ curr[pool][objset]["objset"] = objset
+ curr[pool][objset]["time"] = time.strftime("%H:%M:%S", \
+ time.localtime())
def sign_handler_epipe(sig, frame):
print("Caught EPIPE signal: " + str(frame))
@@ -520,31 +437,30 @@
def main():
global interval
- global curr, diff
+ global curr
hprint = False
init()
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGPIPE, sign_handler_epipe)
- zil_process_kstat()
- if not curr:
- print ("Error: No stats to show")
- sys.exit(0)
- print_header()
if interval > 0:
- time.sleep(interval)
while True:
calculate_diff()
if not diff:
print ("Error: No stats to show")
sys.exit(0)
- zil_extend_dict()
+ if hprint == False:
+ print_header()
+ hprint = True
print_dict(diff)
time.sleep(interval)
else:
- diff = curr
- zil_extend_dict()
- print_dict(diff)
+ zil_process_kstat()
+ if not curr:
+ print ("Error: No stats to show")
+ sys.exit(0)
+ print_header()
+ print_dict(curr)
if __name__ == '__main__':
main()
diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am
--- a/sys/contrib/openzfs/cmd/zpool/Makefile.am
+++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am
@@ -145,7 +145,6 @@
%D%/compatibility.d/openzfs-2.0-linux \
%D%/compatibility.d/openzfs-2.1-freebsd \
%D%/compatibility.d/openzfs-2.1-linux \
- %D%/compatibility.d/openzfs-2.2 \
%D%/compatibility.d/openzfsonosx-1.7.0 \
%D%/compatibility.d/openzfsonosx-1.8.1 \
%D%/compatibility.d/openzfsonosx-1.9.3 \
@@ -169,20 +168,12 @@
"freebsd-11.3 freebsd-12.0" \
"freebsd-11.3 freebsd-12.1" \
"freebsd-11.3 freebsd-12.2" \
- "freebsd-11.3 freebsd-12.3" \
- "freebsd-11.3 freebsd-12.4" \
- "openzfs-2.1-freebsd freebsd-13.0" \
- "openzfs-2.1-freebsd freebsd-13.1" \
- "openzfs-2.1-freebsd freebsd-13.2" \
"freebsd-11.3 freenas-11.3" \
"freenas-11.0 freenas-11.1" \
"openzfsonosx-1.9.3 openzfsonosx-1.9.4" \
"openzfs-2.0-freebsd truenas-12.0" \
"zol-0.7 ubuntu-18.04" \
- "zol-0.8 ubuntu-20.04" \
- "openzfs-2.1-linux ubuntu-22.04" \
- "openzfs-2.2 openzfs-2.2-linux" \
- "openzfs-2.2 openzfs-2.2-freebsd"
+ "zol-0.8 ubuntu-20.04"
zpoolconfdir = $(sysconfdir)/zfs/zpool.d
INSTALL_DATA_HOOKS += zpool-install-data-hook
diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2
--- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2
+++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2
@@ -8,7 +8,5 @@
filesystem_limits
hole_birth
large_blocks
-livelist
lz4_compress
spacemap_histogram
-zpool_checkpoint
diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2
deleted file mode 100644
--- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2
+++ /dev/null
@@ -1,40 +0,0 @@
-# Features supported by OpenZFS 2.2 on Linux and FreeBSD
-allocation_classes
-async_destroy
-blake3
-block_cloning
-bookmark_v2
-bookmark_written
-bookmarks
-device_rebuild
-device_removal
-draid
-edonr
-embedded_data
-empty_bpobj
-enabled_txg
-encryption
-extensible_dataset
-filesystem_limits
-head_errlog
-hole_birth
-large_blocks
-large_dnode
-livelist
-log_spacemap
-lz4_compress
-multi_vdev_crash_dump
-obsolete_counts
-project_quota
-redacted_datasets
-redaction_bookmarks
-resilver_defer
-sha512
-skein
-spacemap_histogram
-spacemap_v2
-userobj_accounting
-vdev_zaps_v2
-zilsaxattr
-zpool_checkpoint
-zstd_compress
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
--- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@@ -7662,11 +7662,11 @@
print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
- uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i;
+ uint64_t pass_scanned, scanned, pass_issued, issued, total;
uint64_t elapsed, scan_rate, issue_rate;
double fraction_done;
- char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7];
- char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32];
+ char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
+ char srate_buf[7], irate_buf[7], time_buf[32];
printf(" ");
printf_color(ANSI_BOLD, gettext("scan:"));
@@ -7738,11 +7738,10 @@
pass_scanned = ps->pss_pass_exam;
issued = ps->pss_issued;
pass_issued = ps->pss_pass_issued;
- total_s = ps->pss_to_examine;
- total_i = ps->pss_to_examine - ps->pss_skipped;
+ total = ps->pss_to_examine;
/* we are only done with a block once we have issued the IO for it */
- fraction_done = (double)issued / total_i;
+ fraction_done = (double)issued / total;
/* elapsed time for this pass, rounding up to 1 if it's 0 */
elapsed = time(NULL) - ps->pss_pass_start;
@@ -7751,25 +7750,26 @@
scan_rate = pass_scanned / elapsed;
issue_rate = pass_issued / elapsed;
+ uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
+ ((total - issued) / issue_rate) : UINT64_MAX;
+ secs_to_dhms(total_secs_left, time_buf);
/* format all of the numbers we will be reporting */
zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
- zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf));
- zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf));
+ zfs_nicebytes(total, total_buf, sizeof (total_buf));
+ zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
+ zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
/* do not print estimated time if we have a paused scrub */
- (void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf);
- if (pause == 0 && scan_rate > 0) {
- zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
- (void) printf(gettext(" at %s/s"), srate_buf);
- }
- (void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf);
- if (pause == 0 && issue_rate > 0) {
- zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
- (void) printf(gettext(" at %s/s"), irate_buf);
+ if (pause == 0) {
+ (void) printf(gettext("\t%s scanned at %s/s, "
+ "%s issued at %s/s, %s total\n"),
+ scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
+ } else {
+ (void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
+ scanned_buf, issued_buf, total_buf);
}
- (void) printf(gettext("\n"));
if (is_resilver) {
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
@@ -7782,16 +7782,16 @@
if (pause == 0) {
/*
* Only provide an estimate iff:
- * 1) we haven't yet issued all we expected, and
+ * 1) the time remaining is valid, and
* 2) the issue rate exceeds 10 MB/s, and
* 3) it's either:
* a) a resilver which has started repairs, or
* b) a scrub which has entered the issue phase.
*/
- if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 &&
+ if (total_secs_left != UINT64_MAX &&
+ issue_rate >= 10 * 1024 * 1024 &&
((is_resilver && ps->pss_processed > 0) ||
(is_scrub && issued > 0))) {
- secs_to_dhms((total_i - issued) / issue_rate, time_buf);
(void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "
@@ -7803,7 +7803,7 @@
}
static void
-print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name)
+print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
{
if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
return;
@@ -7815,20 +7815,17 @@
uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
uint64_t bytes_issued = vrs->vrs_bytes_issued;
uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
- uint64_t bytes_est_s = vrs->vrs_bytes_est;
- uint64_t bytes_est_i = vrs->vrs_bytes_est;
- if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8)
- bytes_est_i -= vrs->vrs_pass_bytes_skipped;
+ uint64_t bytes_est = vrs->vrs_bytes_est;
uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
(vrs->vrs_pass_time_ms + 1)) * 1000;
uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
(vrs->vrs_pass_time_ms + 1)) * 1000;
double scan_pct = MIN((double)bytes_scanned * 100 /
- (bytes_est_s + 1), 100);
+ (bytes_est + 1), 100);
/* Format all of the numbers we will be reporting */
char bytes_scanned_buf[7], bytes_issued_buf[7];
- char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7];
+ char bytes_rebuilt_buf[7], bytes_est_buf[7];
char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
sizeof (bytes_scanned_buf));
@@ -7836,8 +7833,9 @@
sizeof (bytes_issued_buf));
zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
sizeof (bytes_rebuilt_buf));
- zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf));
- zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf));
+ zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
+ zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
+ zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
time_t start = vrs->vrs_start_time;
time_t end = vrs->vrs_end_time;
@@ -7860,29 +7858,17 @@
assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
- (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf,
- bytes_est_s_buf);
- if (scan_rate > 0) {
- zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
- (void) printf(gettext(" at %s/s"), scan_rate_buf);
- }
- (void) printf(gettext(", %s / %s issued"), bytes_issued_buf,
- bytes_est_i_buf);
- if (issue_rate > 0) {
- zfs_nicebytes(issue_rate, issue_rate_buf,
- sizeof (issue_rate_buf));
- (void) printf(gettext(" at %s/s"), issue_rate_buf);
- }
- (void) printf(gettext("\n"));
+ secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
+ MAX(scan_rate, 1), time_buf);
+ (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
+ "%s total\n"), bytes_scanned_buf, scan_rate_buf,
+ bytes_issued_buf, issue_rate_buf, bytes_est_buf);
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
bytes_rebuilt_buf, scan_pct);
if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
- if (bytes_est_s >= bytes_scanned &&
- scan_rate >= 10 * 1024 * 1024) {
- secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate,
- time_buf);
+ if (scan_rate >= 10 * 1024 * 1024) {
(void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "
@@ -7914,7 +7900,7 @@
ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
char *name = zpool_vdev_name(g_zfs, zhp,
child[c], VDEV_NAME_TYPE_ID);
- print_rebuild_status_impl(vrs, i, name);
+ print_rebuild_status_impl(vrs, name);
free(name);
}
}
@@ -8019,15 +8005,13 @@
active_resilver = (ps->pss_state == DSS_SCANNING);
}
+
have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
scrub_start = ps->pss_start_time;
- if (c > offsetof(pool_scan_stat_t,
- pss_pass_error_scrub_pause) / 8) {
- have_errorscrub = (ps->pss_error_scrub_func ==
- POOL_SCAN_ERRORSCRUB);
- errorscrub_start = ps->pss_error_scrub_start;
- }
+ have_errorscrub = (ps->pss_error_scrub_func ==
+ POOL_SCAN_ERRORSCRUB);
+ errorscrub_start = ps->pss_error_scrub_start;
}
boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c
--- a/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c
+++ b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c
@@ -238,7 +238,6 @@
print_kv("end_ts", ps->pss_end_time);
print_kv(",errors", ps->pss_errors);
print_kv(",examined", examined);
- print_kv(",skipped", ps->pss_skipped);
print_kv(",issued", ps->pss_issued);
print_kv(",pass_examined", pass_exam);
print_kv(",pass_issued", ps->pss_pass_issued);
@@ -250,6 +249,7 @@
print_kv(",remaining_t", remaining_time);
print_kv(",start_ts", ps->pss_start_time);
print_kv(",to_examine", ps->pss_to_examine);
+ print_kv(",to_process", ps->pss_to_process);
printf(" %llu\n", (u_longlong_t)timestamp);
return (0);
}
diff --git a/sys/contrib/openzfs/config/kernel-reclaim_state.m4 b/sys/contrib/openzfs/config/kernel-reclaim_state.m4
deleted file mode 100644
--- a/sys/contrib/openzfs/config/kernel-reclaim_state.m4
+++ /dev/null
@@ -1,26 +0,0 @@
-AC_DEFUN([ZFS_AC_KERNEL_SRC_RECLAIMED], [
- dnl #
- dnl # 6.4 API change
- dnl # The reclaimed_slab of struct reclaim_state
- dnl # is renamed to reclaimed
- dnl #
- ZFS_LINUX_TEST_SRC([reclaim_state_reclaimed], [
- #include <linux/swap.h>
- static const struct reclaim_state
- rs __attribute__ ((unused)) = {
- .reclaimed = 100,
- };
- ],[])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_RECLAIMED], [
- AC_MSG_CHECKING([whether struct reclaim_state has reclaimed field])
- ZFS_LINUX_TEST_RESULT([reclaim_state_reclaimed], [
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_RECLAIM_STATE_RECLAIMED, 1,
- [struct reclaim_state has reclaimed])
- ],[
- AC_MSG_RESULT(no)
- ])
-])
-
diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4
--- a/sys/contrib/openzfs/config/kernel.m4
+++ b/sys/contrib/openzfs/config/kernel.m4
@@ -153,7 +153,6 @@
ZFS_AC_KERNEL_SRC_IATTR_VFSID
ZFS_AC_KERNEL_SRC_FILEMAP
ZFS_AC_KERNEL_SRC_WRITEPAGE_T
- ZFS_AC_KERNEL_SRC_RECLAIMED
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -286,7 +285,6 @@
ZFS_AC_KERNEL_IATTR_VFSID
ZFS_AC_KERNEL_FILEMAP
ZFS_AC_KERNEL_WRITEPAGE_T
- ZFS_AC_KERNEL_RECLAIMED
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE
diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.zfs-zed.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.zfs-zed.init
new file mode 120000
--- /dev/null
+++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.zfs-zed.init
@@ -0,0 +1 @@
+../etc/init.d/zfs-zed
\ No newline at end of file
diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-import.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-import.init
new file mode 120000
--- /dev/null
+++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-import.init
@@ -0,0 +1 @@
+../etc/init.d/zfs-import
\ No newline at end of file
diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-load-key.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-load-key.init
new file mode 120000
--- /dev/null
+++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-load-key.init
@@ -0,0 +1 @@
+../etc/init.d/zfs-load-key
\ No newline at end of file
diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-mount.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-mount.init
new file mode 120000
--- /dev/null
+++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-mount.init
@@ -0,0 +1 @@
+../etc/init.d/zfs-mount
\ No newline at end of file
diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-share.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-share.init
new file mode 120000
--- /dev/null
+++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-share.init
@@ -0,0 +1 @@
+../etc/init.d/zfs-share
\ No newline at end of file
diff --git a/sys/contrib/openzfs/contrib/debian/rules.in b/sys/contrib/openzfs/contrib/debian/rules.in
--- a/sys/contrib/openzfs/contrib/debian/rules.in
+++ b/sys/contrib/openzfs/contrib/debian/rules.in
@@ -7,8 +7,8 @@
LINUX_MIN := $(shell awk '/Linux-Minimum:/{print $$2}' META)
LINUX_NEXT := $(shell awk -F'[ .]' '/Linux-Maximum:/{print $$2 "." $$3+1}' META)
-DKMSFILES := module include config zfs.release.in autogen.sh copy-builtin META AUTHORS \
- COPYRIGHT LICENSE README.md CODE_OF_CONDUCT.md NEWS NOTICE RELEASES.md
+DKMSFILES := module include config zfs.release.in autogen.sh META AUTHORS \
+ COPYRIGHT LICENSE README.md
ifndef KVERS
KVERS=$(shell uname -r)
diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in
--- a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in
+++ b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in
@@ -36,7 +36,7 @@
{ dfatal "Failed to install essential binaries"; exit 1; }
# Adapted from https://github.com/zbm-dev/zfsbootmenu
- if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so' && ldconfig -p 2> /dev/null | grep -qF 'libc.so.6' ; then
+ if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so'; then
# On systems with gcc-config (Gentoo, Funtoo, etc.), use it to find libgcc_s
if command -v gcc-config >/dev/null; then
inst_simple "/usr/lib/gcc/$(s=$(gcc-config -c); echo "${s%-*}/${s##*-}")/libgcc_s.so.1" ||
diff --git a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs
--- a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs
+++ b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs
@@ -344,7 +344,7 @@
# Need the _original_ datasets mountpoint!
mountpoint=$(get_fs_value "$fs" mountpoint)
- ZFS_CMD="mount -o zfsutil -t zfs"
+ ZFS_CMD="mount.zfs -o zfsutil"
if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then
# Can't use the mountpoint property. Might be one of our
# clones. Check the 'org.zol:mountpoint' property set in
@@ -361,7 +361,7 @@
fi
# Don't use mount.zfs -o zfsutils for legacy mountpoint
if [ "$mountpoint" = "legacy" ]; then
- ZFS_CMD="mount -t zfs"
+ ZFS_CMD="mount.zfs"
fi
# Last hail-mary: Hope 'rootmnt' is set!
mountpoint=""
@@ -944,7 +944,7 @@
echo " not specified on the kernel command line."
echo ""
echo "Manually mount the root filesystem on $rootmnt and then exit."
- echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt"
+ echo "Hint: Try: mount.zfs -o zfsutil ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt"
shell
fi
diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c
--- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c
+++ b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c
@@ -67,7 +67,6 @@
#include <sys/mman.h>
static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok";
-static const char OLD_PASSWORD_VAR_NAME[] = "pam_zfs_key_oldauthtok";
static libzfs_handle_t *g_zfs;
@@ -161,10 +160,10 @@
}
static pw_password_t *
-pw_fetch(pam_handle_t *pamh, int tok)
+pw_fetch(pam_handle_t *pamh)
{
const char *token;
- if (pam_get_authtok(pamh, tok, &token, NULL) != PAM_SUCCESS) {
+ if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) {
pam_syslog(pamh, LOG_ERR,
"couldn't get password from PAM stack");
return (NULL);
@@ -178,13 +177,13 @@
}
static const pw_password_t *
-pw_fetch_lazy(pam_handle_t *pamh, int tok, const char *var_name)
+pw_fetch_lazy(pam_handle_t *pamh)
{
- pw_password_t *pw = pw_fetch(pamh, tok);
+ pw_password_t *pw = pw_fetch(pamh);
if (pw == NULL) {
return (NULL);
}
- int ret = pam_set_data(pamh, var_name, pw, destroy_pw);
+ int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw);
if (ret != PAM_SUCCESS) {
pw_free(pw);
pam_syslog(pamh, LOG_ERR, "pam_set_data failed");
@@ -194,23 +193,23 @@
}
static const pw_password_t *
-pw_get(pam_handle_t *pamh, int tok, const char *var_name)
+pw_get(pam_handle_t *pamh)
{
const pw_password_t *authtok = NULL;
- int ret = pam_get_data(pamh, var_name,
+ int ret = pam_get_data(pamh, PASSWORD_VAR_NAME,
(const void**)(&authtok));
if (ret == PAM_SUCCESS)
return (authtok);
if (ret == PAM_NO_MODULE_DATA)
- return (pw_fetch_lazy(pamh, tok, var_name));
+ return (pw_fetch_lazy(pamh));
pam_syslog(pamh, LOG_ERR, "password not available");
return (NULL);
}
static int
-pw_clear(pam_handle_t *pamh, const char *var_name)
+pw_clear(pam_handle_t *pamh)
{
- int ret = pam_set_data(pamh, var_name, NULL, NULL);
+ int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL);
if (ret != PAM_SUCCESS) {
pam_syslog(pamh, LOG_ERR, "clearing password failed");
return (-1);
@@ -387,7 +386,7 @@
int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value,
WRAPPING_KEY_LEN);
pw_free(key);
- if (ret && ret != EEXIST) {
+ if (ret) {
pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret);
zfs_close(ds);
return (-1);
@@ -407,14 +406,14 @@
}
static int
-unmount_unload(pam_handle_t *pamh, const char *ds_name, boolean_t force)
+unmount_unload(pam_handle_t *pamh, const char *ds_name)
{
zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM);
if (ds == NULL) {
pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name);
return (-1);
}
- int ret = zfs_unmount(ds, NULL, force ? MS_FORCE : 0);
+ int ret = zfs_unmount(ds, NULL, 0);
if (ret) {
pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret);
zfs_close(ds);
@@ -436,13 +435,9 @@
char *runstatedir;
char *homedir;
char *dsname;
- uid_t uid_min;
- uid_t uid_max;
uid_t uid;
const char *username;
- boolean_t unmount_and_unload;
- boolean_t force_unmount;
- boolean_t recursive_homes;
+ int unmount_and_unload;
} zfs_key_config_t;
static int
@@ -474,13 +469,9 @@
free(config->homes_prefix);
return (PAM_USER_UNKNOWN);
}
- config->uid_min = 1000;
- config->uid_max = MAXUID;
config->uid = entry->pw_uid;
config->username = name;
- config->unmount_and_unload = B_TRUE;
- config->force_unmount = B_FALSE;
- config->recursive_homes = B_FALSE;
+ config->unmount_and_unload = 1;
config->dsname = NULL;
config->homedir = NULL;
for (int c = 0; c < argc; c++) {
@@ -490,16 +481,8 @@
} else if (strncmp(argv[c], "runstatedir=", 12) == 0) {
free(config->runstatedir);
config->runstatedir = strdup(argv[c] + 12);
- } else if (strncmp(argv[c], "uid_min=", 8) == 0) {
- sscanf(argv[c] + 8, "%u", &config->uid_min);
- } else if (strncmp(argv[c], "uid_max=", 8) == 0) {
- sscanf(argv[c] + 8, "%u", &config->uid_max);
} else if (strcmp(argv[c], "nounmount") == 0) {
- config->unmount_and_unload = B_FALSE;
- } else if (strcmp(argv[c], "forceunmount") == 0) {
- config->force_unmount = B_TRUE;
- } else if (strcmp(argv[c], "recursive_homes") == 0) {
- config->recursive_homes = B_TRUE;
+ config->unmount_and_unload = 0;
} else if (strcmp(argv[c], "prop_mountpoint") == 0) {
if (config->homedir == NULL)
config->homedir = strdup(entry->pw_dir);
@@ -534,12 +517,8 @@
(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
if (strcmp(target->homedir, mountpoint) != 0) {
- if (target->recursive_homes) {
- (void) zfs_iter_filesystems_v2(zhp, 0,
- find_dsname_by_prop_value, target);
- }
zfs_close(zhp);
- return (target->dsname != NULL);
+ return (0);
}
target->dsname = strdup(zfs_get_name(zhp));
@@ -552,23 +531,17 @@
{
if (config->homedir != NULL &&
config->homes_prefix != NULL) {
- if (strcmp(config->homes_prefix, "*") == 0) {
- (void) zfs_iter_root(g_zfs,
- find_dsname_by_prop_value, config);
- } else {
- zfs_handle_t *zhp = zfs_open(g_zfs,
- config->homes_prefix, ZFS_TYPE_FILESYSTEM);
- if (zhp == NULL) {
- pam_syslog(NULL, LOG_ERR,
- "dataset %s not found",
- config->homes_prefix);
- return (NULL);
- }
-
- (void) zfs_iter_filesystems_v2(zhp, 0,
- find_dsname_by_prop_value, config);
- zfs_close(zhp);
+ zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix,
+ ZFS_TYPE_FILESYSTEM);
+ if (zhp == NULL) {
+ pam_syslog(NULL, LOG_ERR, "dataset %s not found",
+ config->homes_prefix);
+ return (NULL);
}
+
+ (void) zfs_iter_filesystems_v2(zhp, 0,
+ find_dsname_by_prop_value, config);
+ zfs_close(zhp);
char *dsname = config->dsname;
config->dsname = NULL;
return (dsname);
@@ -682,13 +655,8 @@
if (config_err != PAM_SUCCESS) {
return (config_err);
}
- if (config.uid < config.uid_min || config.uid > config.uid_max) {
- zfs_key_config_free(&config);
- return (PAM_SERVICE_ERR);
- }
- const pw_password_t *token = pw_fetch_lazy(pamh,
- PAM_AUTHTOK, PASSWORD_VAR_NAME);
+ const pw_password_t *token = pw_fetch_lazy(pamh);
if (token == NULL) {
zfs_key_config_free(&config);
return (PAM_AUTH_ERR);
@@ -738,12 +706,10 @@
if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) {
return (PAM_SERVICE_ERR);
}
- if (config.uid < config.uid_min || config.uid > config.uid_max) {
+ if (config.uid < 1000) {
zfs_key_config_free(&config);
- return (PAM_SERVICE_ERR);
+ return (PAM_SUCCESS);
}
- const pw_password_t *old_token = pw_get(pamh,
- PAM_OLDAUTHTOK, OLD_PASSWORD_VAR_NAME);
{
if (pam_zfs_init(pamh) != 0) {
zfs_key_config_free(&config);
@@ -755,62 +721,49 @@
zfs_key_config_free(&config);
return (PAM_SERVICE_ERR);
}
- if (!old_token) {
- pam_syslog(pamh, LOG_ERR,
- "old password from PAM stack is null");
+ int key_loaded = is_key_loaded(pamh, dataset);
+ if (key_loaded == -1) {
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
return (PAM_SERVICE_ERR);
}
- if (decrypt_mount(pamh, dataset,
- old_token->value, B_TRUE) == -1) {
+ free(dataset);
+ pam_zfs_free();
+ if (! key_loaded) {
pam_syslog(pamh, LOG_ERR,
- "old token mismatch");
- free(dataset);
- pam_zfs_free();
+ "key not loaded, returning try_again");
zfs_key_config_free(&config);
return (PAM_PERM_DENIED);
}
}
if ((flags & PAM_UPDATE_AUTHTOK) != 0) {
- const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK,
- PASSWORD_VAR_NAME);
+ const pw_password_t *token = pw_get(pamh);
if (token == NULL) {
- pam_syslog(pamh, LOG_ERR, "new password unavailable");
- pam_zfs_free();
zfs_key_config_free(&config);
- pw_clear(pamh, OLD_PASSWORD_VAR_NAME);
+ return (PAM_SERVICE_ERR);
+ }
+ if (pam_zfs_init(pamh) != 0) {
+ zfs_key_config_free(&config);
return (PAM_SERVICE_ERR);
}
char *dataset = zfs_key_config_get_dataset(&config);
if (!dataset) {
pam_zfs_free();
zfs_key_config_free(&config);
- pw_clear(pamh, OLD_PASSWORD_VAR_NAME);
- pw_clear(pamh, PASSWORD_VAR_NAME);
return (PAM_SERVICE_ERR);
}
- int was_loaded = is_key_loaded(pamh, dataset);
- if (!was_loaded && decrypt_mount(pamh, dataset,
- old_token->value, B_FALSE) == -1) {
+ if (change_key(pamh, dataset, token->value) == -1) {
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
- pw_clear(pamh, OLD_PASSWORD_VAR_NAME);
- pw_clear(pamh, PASSWORD_VAR_NAME);
return (PAM_SERVICE_ERR);
}
- int changed = change_key(pamh, dataset, token->value);
- if (!was_loaded) {
- unmount_unload(pamh, dataset, config.force_unmount);
- }
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
- if (pw_clear(pamh, OLD_PASSWORD_VAR_NAME) == -1 ||
- pw_clear(pamh, PASSWORD_VAR_NAME) == -1 || changed == -1) {
+ if (pw_clear(pamh) == -1) {
return (PAM_SERVICE_ERR);
}
} else {
@@ -835,7 +788,7 @@
return (PAM_SESSION_ERR);
}
- if (config.uid < config.uid_min || config.uid > config.uid_max) {
+ if (config.uid < 1000) {
zfs_key_config_free(&config);
return (PAM_SUCCESS);
}
@@ -846,8 +799,7 @@
return (PAM_SUCCESS);
}
- const pw_password_t *token = pw_get(pamh,
- PAM_AUTHTOK, PASSWORD_VAR_NAME);
+ const pw_password_t *token = pw_get(pamh);
if (token == NULL) {
zfs_key_config_free(&config);
return (PAM_SESSION_ERR);
@@ -871,7 +823,7 @@
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
- if (pw_clear(pamh, PASSWORD_VAR_NAME) == -1) {
+ if (pw_clear(pamh) == -1) {
return (PAM_SERVICE_ERR);
}
return (PAM_SUCCESS);
@@ -894,7 +846,7 @@
if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) {
return (PAM_SESSION_ERR);
}
- if (config.uid < config.uid_min || config.uid > config.uid_max) {
+ if (config.uid < 1000) {
zfs_key_config_free(&config);
return (PAM_SUCCESS);
}
@@ -916,7 +868,7 @@
zfs_key_config_free(&config);
return (PAM_SESSION_ERR);
}
- if (unmount_unload(pamh, dataset, config.force_unmount) == -1) {
+ if (unmount_unload(pamh, dataset) == -1) {
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h
--- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h
+++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h
@@ -75,7 +75,7 @@
extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache);
extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);
-__attribute__((malloc, alloc_size(1)))
+__attribute__((alloc_size(1)))
void *zfs_kmem_alloc(size_t size, int kmflags);
void zfs_kmem_free(void *buf, size_t size);
uint64_t kmem_size(void);
@@ -83,7 +83,6 @@
int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
void kmem_cache_destroy(kmem_cache_t *cache);
-__attribute__((malloc))
void *kmem_cache_alloc(kmem_cache_t *cache, int flags);
void kmem_cache_free(kmem_cache_t *cache, void *buf);
boolean_t kmem_cache_reap_active(void);
diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h
--- a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h
+++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h
@@ -68,6 +68,7 @@
zfs_trim,
zfs_txg,
zfs_vdev,
+ zfs_vdev_cache,
zfs_vdev_file,
zfs_vdev_mirror,
zfs_vnops,
diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h
--- a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h
@@ -31,10 +31,10 @@
#include <linux/vmalloc.h>
extern int kmem_debugging(void);
-__attribute__((format(printf, 1, 0)))
-extern char *kmem_vasprintf(const char *fmt, va_list ap);
-__attribute__((format(printf, 1, 2)))
-extern char *kmem_asprintf(const char *fmt, ...);
+extern char *kmem_vasprintf(const char *fmt, va_list ap)
+ __attribute__((format(printf, 1, 0)));
+extern char *kmem_asprintf(const char *fmt, ...)
+ __attribute__((format(printf, 1, 2)));
extern char *kmem_strdup(const char *str);
extern void kmem_strfree(char *str);
@@ -186,10 +186,10 @@
#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))
#define kmem_cache_reap_active spl_kmem_cache_reap_active
-__attribute__((malloc, alloc_size(1)))
-extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
-__attribute__((malloc, alloc_size(1)))
-extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line)
+ __attribute__((alloc_size(1)));
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line)
+ __attribute__((alloc_size(1)));
extern void spl_kmem_free(const void *ptr, size_t sz);
/*
diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h
--- a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h
@@ -104,7 +104,6 @@
/* list node for the cpu hotplug callback */
struct hlist_node tq_hp_cb_node;
boolean_t tq_hp_support;
- unsigned long lastshouldstop; /* when to purge dynamic */
} taskq_t;
typedef struct taskq_ent {
diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h
--- a/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h
@@ -91,10 +91,8 @@
#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz))
-extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line)
- __attribute__((malloc, alloc_size(1)));
-extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line)
- __attribute__((malloc, alloc_size(1)));
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
extern void spl_vmem_free(const void *ptr, size_t sz);
int spl_vmem_init(void);
diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h
--- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h
+++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h
@@ -215,39 +215,6 @@
TP_ARGS(zilog, zcw))
DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error);
-/*
- * Generic support for three argument tracepoints of the form:
- *
- * DTRACE_PROBE3(...,
- * zilog_t *, ...,
- * uint64_t, ...,
- * uint64_t, ...);
- */
-/* BEGIN CSTYLED */
-DECLARE_EVENT_CLASS(zfs_zil_block_size_class,
- TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1),
- TP_ARGS(zilog, res, s1),
- TP_STRUCT__entry(
- ZILOG_TP_STRUCT_ENTRY
- __field(uint64_t, res)
- __field(uint64_t, s1)
- ),
- TP_fast_assign(
- ZILOG_TP_FAST_ASSIGN
- __entry->res = res;
- __entry->s1 = s1;
- ),
- TP_printk(
- ZILOG_TP_PRINTK_FMT " res %llu s1 %llu",
- ZILOG_TP_PRINTK_ARGS, __entry->res, __entry->s1)
-);
-
-#define DEFINE_ZIL_BLOCK_SIZE_EVENT(name) \
-DEFINE_EVENT(zfs_zil_block_size_class, name, \
- TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), \
- TP_ARGS(zilog, res, s1))
-DEFINE_ZIL_BLOCK_SIZE_EVENT(zfs_zil__block__size);
-
#endif /* _TRACE_ZIL_H */
#undef TRACE_INCLUDE_PATH
@@ -261,7 +228,6 @@
DEFINE_DTRACE_PROBE2(zil__process__commit__itx);
DEFINE_DTRACE_PROBE2(zil__process__normal__itx);
DEFINE_DTRACE_PROBE2(zil__commit__io__error);
-DEFINE_DTRACE_PROBE3(zil__block__size);
#endif /* HAVE_DECLARE_EVENT_CLASS */
#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/include/sys/abd.h b/sys/contrib/openzfs/include/sys/abd.h
--- a/sys/contrib/openzfs/include/sys/abd.h
+++ b/sys/contrib/openzfs/include/sys/abd.h
@@ -86,15 +86,10 @@
* Allocations and deallocations
*/
-__attribute__((malloc))
abd_t *abd_alloc(size_t, boolean_t);
-__attribute__((malloc))
abd_t *abd_alloc_linear(size_t, boolean_t);
-__attribute__((malloc))
abd_t *abd_alloc_gang(void);
-__attribute__((malloc))
abd_t *abd_alloc_for_io(size_t, boolean_t);
-__attribute__((malloc))
abd_t *abd_alloc_sametype(abd_t *, size_t);
boolean_t abd_size_alloc_linear(size_t);
void abd_gang_add(abd_t *, abd_t *, boolean_t);
diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h
--- a/sys/contrib/openzfs/include/sys/arc.h
+++ b/sys/contrib/openzfs/include/sys/arc.h
@@ -304,8 +304,9 @@
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
- arc_write_done_func_t *done, void *priv, zio_priority_t priority,
- int zio_flags, const zbookmark_phys_t *zb);
+ arc_write_done_func_t *physdone, arc_write_done_func_t *done,
+ void *priv, zio_priority_t priority, int zio_flags,
+ const zbookmark_phys_t *zb);
arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
void arc_remove_prune_callback(arc_prune_t *p);
diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h
--- a/sys/contrib/openzfs/include/sys/arc_impl.h
+++ b/sys/contrib/openzfs/include/sys/arc_impl.h
@@ -123,6 +123,7 @@
void *awcb_private;
arc_write_done_func_t *awcb_ready;
arc_write_done_func_t *awcb_children_ready;
+ arc_write_done_func_t *awcb_physdone;
arc_write_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
diff --git a/sys/contrib/openzfs/include/sys/btree.h b/sys/contrib/openzfs/include/sys/btree.h
--- a/sys/contrib/openzfs/include/sys/btree.h
+++ b/sys/contrib/openzfs/include/sys/btree.h
@@ -105,13 +105,8 @@
boolean_t bti_before;
} zfs_btree_index_t;
-typedef struct btree zfs_btree_t;
-typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
- const void *, zfs_btree_index_t *);
-
-struct btree {
+typedef struct btree {
int (*bt_compar) (const void *, const void *);
- bt_find_in_buf_f bt_find_in_buf;
size_t bt_elem_size;
size_t bt_leaf_size;
uint32_t bt_leaf_cap;
@@ -120,54 +115,7 @@
uint64_t bt_num_nodes;
zfs_btree_hdr_t *bt_root;
zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
-};
-
-/*
- * Implementation of Shar's algorithm designed to accelerate binary search by
- * eliminating impossible to predict branches.
- *
- * For optimality, this should be used to generate the search function in the
- * same file as the comparator and the comparator should be marked
- * `__attribute__((always_inline)) inline` so that the compiler will inline it.
- *
- * Arguments are:
- *
- * NAME - The function name for this instance of the search function. Use it
- * in a subsequent call to zfs_btree_create().
- * T - The element type stored inside the B-Tree.
- * COMP - A comparator to compare two nodes; it must return exactly -1, 0,
- *  or +1: -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
- * TREE_CMP() from avl.h can be used in a boilerplate function.
- */
-/* BEGIN CSTYLED */
-#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \
-_Pragma("GCC diagnostic push") \
-_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \
-static void * \
-NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \
- const void *value, zfs_btree_index_t *where) \
-{ \
- T *i = (T *)buf; \
- (void) tree; \
- _Pragma("GCC unroll 9") \
- while (nelems > 1) { \
- uint32_t half = nelems / 2; \
- nelems -= half; \
- i += (COMP(&i[half - 1], value) < 0) * half; \
- } \
- \
- int comp = COMP(i, value); \
- where->bti_offset = (i - (T *)buf) + (comp < 0); \
- where->bti_before = (comp != 0); \
- \
- if (comp == 0) { \
- return (i); \
- } \
- \
- return (NULL); \
-} \
-_Pragma("GCC diagnostic pop")
-/* END CSTYLED */
+} zfs_btree_t;
/*
* Allocate and deallocate caches for btree nodes.
@@ -181,19 +129,13 @@
* tree - the tree to be initialized
 * compar - function to compare two nodes; it must return exactly -1, 0, or +1:
 * -1 for <, 0 for ==, and +1 for >
- * find - optional function to accelerate searches inside B-Tree nodes
- * through Shar's algorithm and comparator inlining. Setting this to
- * NULL will use a generic function. The function should be created
- * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
- * compar should be marked `__attribute__((always_inline)) inline` or
- * performance is unlikely to improve very much.
* size - the value of sizeof(struct my_type)
* lsize - custom leaf size
*/
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
- bt_find_in_buf_f, size_t);
+ size_t);
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
- bt_find_in_buf_f, size_t, size_t);
+ size_t, size_t);
/*
* Find a node with a matching value in the tree. Returns the matching node
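The removed ZFS_BTREE_FIND_IN_BUF_FUNC macro above is easier to follow
de-macroized. A stand-alone sketch for a sorted int array (find_int and its
inline comparison stand in for the macro's NAME/T/COMP parameters):

#include <stdint.h>
#include <stddef.h>

/*
 * Branchless (Shar's) binary search, mirroring the removed macro's loop:
 * each step halves the candidate range and advances the base pointer with
 * a multiply instead of an unpredictable branch.
 */
static int *
find_int(int *buf, uint32_t nelems, int value, uint32_t *where)
{
	int *i = buf;

	while (nelems > 1) {
		uint32_t half = nelems / 2;
		nelems -= half;
		i += (i[half - 1] < value) * half;
	}

	int comp = (*i > value) - (*i < value);
	*where = (uint32_t)(i - buf) + (comp < 0);
	return (comp == 0 ? i : NULL);
}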
diff --git a/sys/contrib/openzfs/include/sys/dsl_scan.h b/sys/contrib/openzfs/include/sys/dsl_scan.h
--- a/sys/contrib/openzfs/include/sys/dsl_scan.h
+++ b/sys/contrib/openzfs/include/sys/dsl_scan.h
@@ -61,7 +61,7 @@
uint64_t scn_end_time;
uint64_t scn_to_examine; /* total bytes to be scanned */
uint64_t scn_examined; /* bytes scanned so far */
- uint64_t scn_skipped; /* bytes skipped by scanner */
+ uint64_t scn_to_process; /* total bytes to process */
uint64_t scn_processed;
uint64_t scn_errors; /* scan I/O error count */
uint64_t scn_ddt_class_max;
diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h
--- a/sys/contrib/openzfs/include/sys/fs/zfs.h
+++ b/sys/contrib/openzfs/include/sys/fs/zfs.h
@@ -1088,7 +1088,7 @@
uint64_t pss_end_time; /* scan end time */
uint64_t pss_to_examine; /* total bytes to scan */
uint64_t pss_examined; /* total bytes located by scanner */
- uint64_t pss_skipped; /* total bytes skipped by scanner */
+ uint64_t pss_to_process; /* total bytes to process */
uint64_t pss_processed; /* total processed bytes */
uint64_t pss_errors; /* scan errors */
@@ -1152,7 +1152,6 @@
uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */
uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */
uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */
- uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */
} vdev_rebuild_stat_t;
/*
diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h
--- a/sys/contrib/openzfs/include/sys/spa.h
+++ b/sys/contrib/openzfs/include/sys/spa.h
@@ -723,10 +723,16 @@
* Send TRIM commands in-line during normal pool operation while deleting.
* OFF: no
* ON: yes
+ * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
*/
typedef enum {
SPA_AUTOTRIM_OFF = 0, /* default */
SPA_AUTOTRIM_ON,
+#ifdef IN_FREEBSD_BASE
+ SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON,
+#else
+ SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF,
+#endif
} spa_autotrim_t;
/*
@@ -1168,6 +1174,10 @@
zbookmark_phys_t *zb);
extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep);
+/* vdev cache */
+extern void vdev_cache_stat_init(void);
+extern void vdev_cache_stat_fini(void);
+
/* vdev mirror */
extern void vdev_mirror_stat_init(void);
extern void vdev_mirror_stat_fini(void);
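The aliasing enum member above selects the platform default at compile time;
a zpool_prop.c hunk later in this diff registers the autotrim property with
SPA_AUTOTRIM_DEFAULT. A stand-alone demo of the pattern:

#include <stdio.h>

typedef enum {
	SPA_AUTOTRIM_OFF = 0,
	SPA_AUTOTRIM_ON,
#ifdef IN_FREEBSD_BASE
	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON,
#else
	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF,
#endif
} spa_autotrim_t;

int
main(void)
{
	/* Prints 1 (on) when compiled with -DIN_FREEBSD_BASE, else 0 (off). */
	printf("autotrim default = %d\n", (int)SPA_AUTOTRIM_DEFAULT);
	return (0);
}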
diff --git a/sys/contrib/openzfs/include/sys/vdev.h b/sys/contrib/openzfs/include/sys/vdev.h
--- a/sys/contrib/openzfs/include/sys/vdev.h
+++ b/sys/contrib/openzfs/include/sys/vdev.h
@@ -158,15 +158,20 @@
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern boolean_t vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+extern void vdev_cache_purge(vdev_t *vd);
+
extern void vdev_queue_init(vdev_t *vd);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
-extern uint32_t vdev_queue_length(vdev_t *vd);
+extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
-extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
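The restored vdev_cache_read()/vdev_cache_write() hooks are consulted from
the leaf-vdev I/O path. A simplified sketch of the read side (kernel context
assumed; the helper name is illustrative and the real wiring lives in zio.c):

/*
 * Sketch: a leaf-vdev read may be satisfied from the small per-disk cache.
 * ZIO_FLAG_DONT_CACHE (also restored by this diff) lets callers such as
 * the L2ARC opt out of polluting the cache.
 */
static boolean_t
try_vdev_cache_read(zio_t *zio)
{
	if (zio->io_type != ZIO_TYPE_READ ||
	    (zio->io_flags & ZIO_FLAG_DONT_CACHE))
		return (B_FALSE);
	/* On a hit the zio completes without touching the disk. */
	return (vdev_cache_read(zio));
}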
diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h
--- a/sys/contrib/openzfs/include/sys/vdev_impl.h
+++ b/sys/contrib/openzfs/include/sys/vdev_impl.h
@@ -57,6 +57,8 @@
* Forward declarations that lots of things need.
*/
typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;
extern uint_t zfs_vdev_queue_depth_pct;
@@ -130,24 +132,44 @@
/*
* Virtual device properties
*/
-typedef union vdev_queue_class {
- list_t vqc_list;
- avl_tree_t vqc_tree;
+struct vdev_cache_entry {
+ struct abd *ve_abd;
+ uint64_t ve_offset;
+ clock_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+typedef struct vdev_queue_class {
+ uint32_t vqc_active;
+
+ /*
+ * Sorted by offset or timestamp, depending on whether the queue is
+ * LBA-ordered or FIFO.
+ */
+ avl_tree_t vqc_queued_tree;
} vdev_queue_class_t;
struct vdev_queue {
vdev_t *vq_vdev;
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
+ avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
+ avl_tree_t vq_trim_offset_tree;
uint64_t vq_last_offset;
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
- uint32_t vq_cqueued; /* Classes with queued I/Os. */
- uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
- uint32_t vq_active; /* Number of active I/Os. */
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
- list_t vq_active_list; /* List of active I/Os. */
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
zio_t vq_io_search; /* used as local for stack reduction */
@@ -421,6 +443,7 @@
boolean_t vdev_resilver_deferred; /* resilver deferred */
boolean_t vdev_kobj_flag; /* kobj event record */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */
diff --git a/sys/contrib/openzfs/include/sys/vdev_rebuild.h b/sys/contrib/openzfs/include/sys/vdev_rebuild.h
--- a/sys/contrib/openzfs/include/sys/vdev_rebuild.h
+++ b/sys/contrib/openzfs/include/sys/vdev_rebuild.h
@@ -79,7 +79,6 @@
uint64_t vr_pass_start_time;
uint64_t vr_pass_bytes_scanned;
uint64_t vr_pass_bytes_issued;
- uint64_t vr_pass_bytes_skipped;
/* On-disk state updated by vdev_rebuild_zap_update_sync() */
vdev_rebuild_phys_t vr_rebuild_phys;
diff --git a/sys/contrib/openzfs/include/sys/zfs_refcount.h b/sys/contrib/openzfs/include/sys/zfs_refcount.h
--- a/sys/contrib/openzfs/include/sys/zfs_refcount.h
+++ b/sys/contrib/openzfs/include/sys/zfs_refcount.h
@@ -27,7 +27,6 @@
#define _SYS_ZFS_REFCOUNT_H
#include <sys/inttypes.h>
-#include <sys/avl.h>
#include <sys/list.h>
#include <sys/zfs_context.h>
@@ -44,22 +43,19 @@
#ifdef ZFS_DEBUG
typedef struct reference {
- union {
- avl_node_t a;
- list_node_t l;
- } ref_link;
+ list_node_t ref_link;
const void *ref_holder;
uint64_t ref_number;
- boolean_t ref_search;
+ uint8_t *ref_removed;
} reference_t;
typedef struct refcount {
- uint64_t rc_count;
kmutex_t rc_mtx;
- avl_tree_t rc_tree;
- list_t rc_removed;
- uint_t rc_removed_count;
boolean_t rc_tracked;
+ list_t rc_list;
+ list_t rc_removed;
+ uint64_t rc_count;
+ uint64_t rc_removed_count;
} zfs_refcount_t;
/*
@@ -77,15 +73,13 @@
int64_t zfs_refcount_add(zfs_refcount_t *, const void *);
int64_t zfs_refcount_remove(zfs_refcount_t *, const void *);
/*
- * Note that (add|remove)_many adds/removes one reference with "number" N,
- * _not_ N references with "number" 1, which is what (add|remove)_few does,
- * or what vanilla zfs_refcount_(add|remove) called N times would do.
+ * Note that (add|remove)_many add/remove one reference with "number" N,
+ * _not_ make N references with "number" 1, which is what vanilla
+ * zfs_refcount_(add|remove) would do if called N times.
*
* Attempting to remove a reference with number N when none exists is a
* panic on debug kernels with reference_tracking enabled.
*/
-void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *);
-void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *);
int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *);
int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *);
void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *);
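A concrete reading of the (add|remove)_many note above, as a sketch
(refcount_example and tag are illustrative; the holder may be any pointer):

#include <sys/zfs_refcount.h>

/* One reference carrying number 3 vs. three references of number 1. */
static void
refcount_example(void)
{
	zfs_refcount_t rc;
	void *tag = &rc;			/* any holder pointer */

	zfs_refcount_create(&rc);

	zfs_refcount_add_many(&rc, 3, tag);	/* ONE reference, number 3 */
	zfs_refcount_remove_many(&rc, 3, tag);	/* remove it as 3, not 1+1+1 */

	zfs_refcount_add(&rc, tag);		/* three references of 1 ... */
	zfs_refcount_add(&rc, tag);
	zfs_refcount_add(&rc, tag);
	zfs_refcount_remove(&rc, tag);		/* ... removed one at a time */
	zfs_refcount_remove(&rc, tag);
	zfs_refcount_remove(&rc, tag);

	zfs_refcount_destroy(&rc);
}

zfs_refcount_count() reports 3 at the midpoint in both cases; the difference
is only visible to the debug tracking, which records holders per reference.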
@@ -114,10 +108,6 @@
#define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count)
#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
#define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
-#define zfs_refcount_add_few(rc, number, holder) \
- atomic_add_64(&(rc)->rc_count, number)
-#define zfs_refcount_remove_few(rc, number, holder) \
- atomic_add_64(&(rc)->rc_count, -number)
#define zfs_refcount_add_many(rc, number, holder) \
atomic_add_64_nv(&(rc)->rc_count, number)
#define zfs_refcount_remove_many(rc, number, holder) \
diff --git a/sys/contrib/openzfs/include/sys/zfs_znode.h b/sys/contrib/openzfs/include/sys/zfs_znode.h
--- a/sys/contrib/openzfs/include/sys/zfs_znode.h
+++ b/sys/contrib/openzfs/include/sys/zfs_znode.h
@@ -158,7 +158,6 @@
#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
-extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
#ifdef _KERNEL
#include <sys/zfs_znode_impl.h>
@@ -281,6 +280,7 @@
extern void zfs_remove_op_tables(void);
extern int zfs_create_op_tables(void);
extern dev_t zfs_cmpldev(uint64_t);
+extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
extern int zfs_get_stats(objset_t *os, nvlist_t *nv);
extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os);
extern void zfs_znode_dmu_fini(znode_t *);
diff --git a/sys/contrib/openzfs/include/sys/zil.h b/sys/contrib/openzfs/include/sys/zil.h
--- a/sys/contrib/openzfs/include/sys/zil.h
+++ b/sys/contrib/openzfs/include/sys/zil.h
@@ -489,22 +489,18 @@
* Transactions which have been allocated to the "normal"
* (i.e. not slog) storage pool. Note that "bytes" accumulate
* the actual log record sizes - which do not include the actual
- * data in case of indirect writes. bytes <= write <= alloc.
+ * data in case of indirect writes.
*/
kstat_named_t zil_itx_metaslab_normal_count;
kstat_named_t zil_itx_metaslab_normal_bytes;
- kstat_named_t zil_itx_metaslab_normal_write;
- kstat_named_t zil_itx_metaslab_normal_alloc;
/*
* Transactions which have been allocated to the "slog" storage pool.
* If there are no separate log devices, this is the same as the
- * "normal" pool. bytes <= write <= alloc.
+ * "normal" pool.
*/
kstat_named_t zil_itx_metaslab_slog_count;
kstat_named_t zil_itx_metaslab_slog_bytes;
- kstat_named_t zil_itx_metaslab_slog_write;
- kstat_named_t zil_itx_metaslab_slog_alloc;
} zil_kstat_values_t;
typedef struct zil_sums {
@@ -519,12 +515,8 @@
wmsum_t zil_itx_needcopy_bytes;
wmsum_t zil_itx_metaslab_normal_count;
wmsum_t zil_itx_metaslab_normal_bytes;
- wmsum_t zil_itx_metaslab_normal_write;
- wmsum_t zil_itx_metaslab_normal_alloc;
wmsum_t zil_itx_metaslab_slog_count;
wmsum_t zil_itx_metaslab_slog_bytes;
- wmsum_t zil_itx_metaslab_slog_write;
- wmsum_t zil_itx_metaslab_slog_alloc;
} zil_sums_t;
#define ZIL_STAT_INCR(zil, stat, val) \
diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h
--- a/sys/contrib/openzfs/include/sys/zil_impl.h
+++ b/sys/contrib/openzfs/include/sys/zil_impl.h
@@ -44,7 +44,7 @@
* must be held.
*
* After the lwb is "opened", it can transition into the "issued" state
- * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
+ * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
* be held when making this transition.
*
* After the lwb's write zio completes, it transitions into the "write
@@ -93,23 +93,20 @@
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
- boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */
int lwb_nused; /* # used bytes in buffer */
- int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */
lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */
zio_t *lwb_write_zio; /* zio for the lwb buffer */
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
- hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
uint64_t lwb_issued_txg; /* the txg when the write is issued */
uint64_t lwb_max_txg; /* highest txg in this lwb */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
- list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */
list_t lwb_itxs; /* list of itx's */
list_t lwb_waiters; /* list of zil_commit_waiter's */
avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
+ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
} lwb_t;
/*
diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h
--- a/sys/contrib/openzfs/include/sys/zio.h
+++ b/sys/contrib/openzfs/include/sys/zio.h
@@ -190,6 +190,7 @@
#define ZIO_FLAG_SPECULATIVE (1ULL << 8)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9)
#define ZIO_FLAG_DONT_RETRY (1ULL << 10)
+#define ZIO_FLAG_DONT_CACHE (1ULL << 11)
#define ZIO_FLAG_NODATA (1ULL << 12)
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
@@ -341,9 +342,9 @@
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
uint8_t zp_complevel;
+ dmu_object_type_t zp_type;
uint8_t zp_level;
uint8_t zp_copies;
- dmu_object_type_t zp_type;
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
boolean_t zp_nopwrite;
@@ -436,12 +437,6 @@
list_node_t zl_child_node;
} zio_link_t;
-enum zio_qstate {
- ZIO_QS_NONE = 0,
- ZIO_QS_QUEUED,
- ZIO_QS_ACTIVE,
-};
-
struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
@@ -466,6 +461,7 @@
/* Callback info */
zio_done_func_t *io_ready;
zio_done_func_t *io_children_ready;
+ zio_done_func_t *io_physdone;
zio_done_func_t *io_done;
void *io_private;
int64_t io_prev_space_delta; /* DMU private */
@@ -485,12 +481,6 @@
const zio_vsd_ops_t *io_vsd_ops;
metaslab_class_t *io_metaslab_class; /* dva throttle class */
- enum zio_qstate io_queue_state; /* vdev queue state */
- union {
- list_node_t l;
- avl_node_t a;
- } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */
- avl_node_t io_offset_node; /* vdev offset queues */
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
hrtime_t io_queued_timestamp;
@@ -498,6 +488,9 @@
hrtime_t io_delta; /* vdev queue service delta */
hrtime_t io_delay; /* Device access time (disk or */
/* file). */
+ avl_node_t io_queue_node;
+ avl_node_t io_offset_node;
+ avl_node_t io_alloc_node;
zio_alloc_list_t io_alloc_list;
/* Internal pipeline state */
@@ -511,6 +504,9 @@
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+ uint64_t io_child_count;
+ uint64_t io_phys_children;
+ uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
@@ -558,8 +554,9 @@
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
- zio_done_func_t *done, void *priv, zio_priority_t priority,
- zio_flag_t flags, const zbookmark_phys_t *zb);
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *priv, zio_priority_t priority, zio_flag_t flags,
+ const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
@@ -611,7 +608,6 @@
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
-extern void zio_add_child_first(zio_t *pio, zio_t *cio);
extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
diff --git a/sys/contrib/openzfs/lib/libspl/include/umem.h b/sys/contrib/openzfs/lib/libspl/include/umem.h
--- a/sys/contrib/openzfs/lib/libspl/include/umem.h
+++ b/sys/contrib/openzfs/lib/libspl/include/umem.h
@@ -83,7 +83,7 @@
const char *_umem_options_init(void);
const char *_umem_logging_init(void);
-__attribute__((malloc, alloc_size(1)))
+__attribute__((alloc_size(1)))
static inline void *
umem_alloc(size_t size, int flags)
{
@@ -96,7 +96,7 @@
return (ptr);
}
-__attribute__((malloc, alloc_size(1)))
+__attribute__((alloc_size(1)))
static inline void *
umem_alloc_aligned(size_t size, size_t align, int flags)
{
@@ -118,7 +118,7 @@
return (ptr);
}
-__attribute__((malloc, alloc_size(1)))
+__attribute__((alloc_size(1)))
static inline void *
umem_zalloc(size_t size, int flags)
{
@@ -188,7 +188,6 @@
umem_free(cp, sizeof (umem_cache_t));
}
-__attribute__((malloc))
static inline void *
umem_cache_alloc(umem_cache_t *cp, int flags)
{
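Dropping `malloc` while keeping `alloc_size(1)` above preserves the size
information. A stand-alone sketch of what alloc_size buys (xalloc is a
hypothetical name):

#include <stdlib.h>
#include <string.h>

/*
 * alloc_size(1): argument 1 is the byte size of the returned object.
 * With _FORTIFY_SOURCE/__builtin_object_size the compiler can then flag
 * out-of-bounds accesses against the stated size at compile time.
 */
__attribute__((alloc_size(1)))
static void *
xalloc(size_t size)
{
	return (malloc(size));
}

int
main(void)
{
	char *p = xalloc(8);
	if (p != NULL) {
		memset(p, 0, 8);	/* within the stated size */
		free(p);
	}
	return (0);
}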
diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c
--- a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c
+++ b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c
@@ -1789,8 +1789,7 @@
nvlist_t *nvl;
int nvl_len = 0;
int added_resv = 0;
- zfs_prop_t prop;
- boolean_t nsprop = B_FALSE;
+ zfs_prop_t prop = 0;
nvpair_t *elem;
(void) snprintf(errbuf, sizeof (errbuf),
@@ -1837,7 +1836,6 @@
elem = nvlist_next_nvpair(nvl, elem)) {
prop = zfs_name_to_prop(nvpair_name(elem));
- nsprop |= zfs_is_namespace_prop(prop);
assert(cl_idx < nvl_len);
/*
@@ -1936,7 +1934,8 @@
* if one of the options handled by the generic
* Linux namespace layer has been modified.
*/
- if (nsprop && zfs_is_mounted(zhp, NULL))
+ if (zfs_is_namespace_prop(prop) &&
+ zfs_is_mounted(zhp, NULL))
ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0);
}
}
diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am
--- a/sys/contrib/openzfs/lib/libzpool/Makefile.am
+++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am
@@ -135,6 +135,7 @@
module/zfs/uberblock.c \
module/zfs/unique.c \
module/zfs/vdev.c \
+ module/zfs/vdev_cache.c \
module/zfs/vdev_draid.c \
module/zfs/vdev_draid_rand.c \
module/zfs/vdev_indirect.c \
diff --git a/sys/contrib/openzfs/man/man4/spl.4 b/sys/contrib/openzfs/man/man4/spl.4
--- a/sys/contrib/openzfs/man/man4/spl.4
+++ b/sys/contrib/openzfs/man/man4/spl.4
@@ -193,19 +193,4 @@
reading it could cause a lock-up if the list grows too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
-.
-.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint
-(Linux-only)
-How long a taskq has to have had no work before we tear it down.
-Previously, we would tear down a dynamic taskq worker as soon
-as we noticed it had no work, but it was observed that this led
-to a lot of churn in tearing down things we then immediately
-spawned anew.
-In practice, it seems any nonzero value will remove the vast
-majority of this churn, while the nontrivially larger value
-was chosen to help filter out the little remaining churn on
-a mostly idle system.
-Setting this value to
-.Sy 0
-will revert to the previous behavior.
.El
diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4
--- a/sys/contrib/openzfs/man/man4/zfs.4
+++ b/sys/contrib/openzfs/man/man4/zfs.4
@@ -239,16 +239,6 @@
Make some blocks above a certain size be gang blocks.
This option is used by the test suite to facilitate testing.
.
-.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
-Default DDT ZAP data block size as a power of 2. Note that changing this after
-creating a DDT on the pool will not affect existing DDTs, only newly created
-ones.
-.
-.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
-Default DDT ZAP indirect block size as a power of 2. Note that changing this
-after creating a DDT on the pool will not affect existing DDTs, only newly
-created ones.
-.
.It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int
Default dnode block size as a power of 2.
.
@@ -2026,12 +2016,33 @@
Flush dirty data to disk at least every this many seconds (maximum TXG
duration).
.
+.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+Allow TRIM I/O operations to be aggregated.
+This is normally not helpful because the extents to be trimmed
+will already have been aggregated by the metaslab.
+This option is provided for debugging and performance analysis.
+.
.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
Max vdev I/O aggregation size.
.
.It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
Max vdev I/O aggregation size for non-rotating media.
.
+.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64 KiB Pc Pq uint
+Shift size to inflate reads to.
+.
+.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq uint
+Inflate reads smaller than this value to meet the
+.Sy zfs_vdev_cache_bshift
+size
+.Pq default Sy 64 KiB .
+.
+.It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq uint
+Total size of the per-disk cache in bytes.
+.Pp
+Currently this feature is disabled, as it has been found to be unhelpful
+for performance and in some cases harmful.
+.
.It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int
A number by which the balancing algorithm increments the load calculation for
the purpose of selecting the least busy mirror member when an I/O operation
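The three zfs_vdev_cache_* entries above interact: a read smaller than
zfs_vdev_cache_max is inflated to a 1 << zfs_vdev_cache_bshift region
aligned on that boundary. A stand-alone sketch of the address math (values
illustrative; the in-kernel code uses P2ALIGN for the same computation):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t cache_bshift = 16;	/* 64 KiB cache blocks */
	const uint64_t cache_max = 16384;	/* inflate reads < 16 KiB */
	uint64_t offset = 200000, size = 4096;	/* a small read */

	if (size < cache_max) {
		uint64_t start = offset & ~((1ULL << cache_bshift) - 1);
		/* Prints: read [200000, 204096) inflated to [196608, 262144) */
		printf("read [%llu, %llu) inflated to [%llu, %llu)\n",
		    (unsigned long long)offset,
		    (unsigned long long)(offset + size),
		    (unsigned long long)start,
		    (unsigned long long)(start + (1ULL << cache_bshift)));
	}
	return (0);
}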
diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7
--- a/sys/contrib/openzfs/man/man7/zpool-features.7
+++ b/sys/contrib/openzfs/man/man7/zpool-features.7
@@ -228,10 +228,8 @@
filesystem_limits
hole_birth
large_blocks
-livelist
lz4_compress
spacemap_histogram
-zpool_checkpoint
.No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev
.Ed
diff --git a/sys/contrib/openzfs/man/man8/zdb.8 b/sys/contrib/openzfs/man/man8/zdb.8
--- a/sys/contrib/openzfs/man/man8/zdb.8
+++ b/sys/contrib/openzfs/man/man8/zdb.8
@@ -14,7 +14,7 @@
.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
.\" Copyright (c) 2017 Intel Corporation.
.\"
-.Dd June 27, 2023
+.Dd October 7, 2020
.Dt ZDB 8
.Os
.
@@ -41,17 +41,9 @@
.Ar poolname Ns Op Ar / Ns Ar dataset Ns | Ns Ar objset-ID
.Op Ar object Ns | Ns Ar range Ns …
.Nm
-.Fl B
-.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
-.Op Fl U Ar cache
-.Op Fl K Ar key
-.Ar poolname Ns Ar / Ns Ar objset-ID
-.Op Ar backup-flags
-.Nm
.Fl C
.Op Fl A
.Op Fl U Ar cache
-.Op Ar poolname
.Nm
.Fl E
.Op Fl A
@@ -131,22 +123,6 @@
Display statistics regarding the number, size
.Pq logical, physical and allocated
and deduplication of blocks.
-.It Fl B , -backup
-Generate a backup stream, similar to
-.Nm zfs Cm send ,
-but for the numeric objset ID, and without opening the dataset.
-This can be useful in recovery scenarios if dataset metadata has become
-corrupted but the dataset itself is readable.
-The optional
-.Ar flags
-argument is a string of one or more of the letters
-.Sy e ,
-.Sy L ,
-.Sy c ,
-and
-.Sy w ,
-which correspond to the same flags in
-.Xr zfs-send 8 .
.It Fl c , -checksum
Verify the checksum of all metadata blocks while printing block statistics
.Po see
diff --git a/sys/contrib/openzfs/man/man8/zfs-create.8 b/sys/contrib/openzfs/man/man8/zfs-create.8
--- a/sys/contrib/openzfs/man/man8/zfs-create.8
+++ b/sys/contrib/openzfs/man/man8/zfs-create.8
@@ -234,11 +234,14 @@
Print verbose information about the created dataset.
.El
.El
-.Ss ZFS for Swap
-Swapping to a ZFS volume is prone to deadlock and not recommended.
-See OpenZFS FAQ.
-.Pp
-Swapping to a file on a ZFS filesystem is not supported.
+.Ss ZFS Volumes as Swap
+ZFS volumes may be used as swap devices.
+After creating the volume with the
+.Nm zfs Cm create Fl V
+command, enable the swap area using the
+.Xr swapon 8
+command.
+Swapping to files on ZFS filesystems is not supported.
.
.Sh EXAMPLES
.\" These are, respectively, examples 1, 10 from zfs.8
diff --git a/sys/contrib/openzfs/man/man8/zpool-create.8 b/sys/contrib/openzfs/man/man8/zpool-create.8
--- a/sys/contrib/openzfs/man/man8/zpool-create.8
+++ b/sys/contrib/openzfs/man/man8/zpool-create.8
@@ -87,13 +87,13 @@
However this check is not robust enough
to detect simultaneous attempts to use a new device in different pools, even if
.Sy multihost Ns = Sy enabled .
-The administrator must ensure that simultaneous invocations of any combination
+The administrator must ensure that simultaneous invocations of any combination
of
.Nm zpool Cm replace ,
.Nm zpool Cm create ,
.Nm zpool Cm add ,
or
-.Nm zpool Cm labelclear
+.Nm zpool Cm labelclear
do not refer to the same device.
Using the same device in two pools will result in pool corruption.
.Pp
diff --git a/sys/contrib/openzfs/man/man8/zpool-events.8 b/sys/contrib/openzfs/man/man8/zpool-events.8
--- a/sys/contrib/openzfs/man/man8/zpool-events.8
+++ b/sys/contrib/openzfs/man/man8/zpool-events.8
@@ -456,6 +456,7 @@
ZIO_FLAG_SPECULATIVE:0x00000100
ZIO_FLAG_CONFIG_WRITER:0x00000200
ZIO_FLAG_DONT_RETRY:0x00000400
+ZIO_FLAG_DONT_CACHE:0x00000800
ZIO_FLAG_NODATA:0x00001000
ZIO_FLAG_INDUCE_DAMAGE:0x00002000
diff --git a/sys/contrib/openzfs/man/man8/zpool-scrub.8 b/sys/contrib/openzfs/man/man8/zpool-scrub.8
--- a/sys/contrib/openzfs/man/man8/zpool-scrub.8
+++ b/sys/contrib/openzfs/man/man8/zpool-scrub.8
@@ -26,7 +26,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
-.Dd June 22, 2023
+.Dd July 25, 2021
.Dt ZPOOL-SCRUB 8
.Os
.
@@ -123,7 +123,7 @@
.No # Nm zpool Cm status
...
scan: scrub in progress since Sun Jul 25 16:07:49 2021
- 403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s
+ 403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total
0B repaired, 16.91% done, 00:00:04 to go
...
.Ed
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in
--- a/sys/contrib/openzfs/module/Kbuild.in
+++ b/sys/contrib/openzfs/module/Kbuild.in
@@ -34,20 +34,6 @@
ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than=
endif
-# Generated binary search code is particularly bad with this optimization.
-# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
-# is not affected when unrolling is done.
-# Disable it until the following upstream issue is resolved:
-# https://github.com/llvm/llvm-project/issues/62790
-ifeq ($(CONFIG_X86),y)
-ifeq ($(CONFIG_CC_IS_CLANG),y)
-CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false
-CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false
-CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false
-CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false
-endif
-endif
-
ifneq ($(KBUILD_EXTMOD),)
@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@
@@ -382,6 +368,7 @@
uberblock.o \
unique.o \
vdev.o \
+ vdev_cache.o \
vdev_draid.o \
vdev_draid_rand.o \
vdev_indirect.o \
diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd
--- a/sys/contrib/openzfs/module/Makefile.bsd
+++ b/sys/contrib/openzfs/module/Makefile.bsd
@@ -308,6 +308,7 @@
uberblock.c \
unique.c \
vdev.c \
+ vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_indirect.c \
@@ -399,20 +400,6 @@
.include <bsd.kmod.mk>
-# Generated binary search code is particularly bad with this optimization.
-# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
-# is not affected when unrolling is done.
-# Disable it until the following upstream issue is resolved:
-# https://github.com/llvm/llvm-project/issues/62790
-.if ${CC} == "clang"
-.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64"
-CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false
-CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false
-CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false
-CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false
-.endif
-.endif
-
CFLAGS.sysctl_os.c= -include ../zfs_config.h
CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -872,6 +872,8 @@
"Enable to bypass vdev_validate().");
/* END CSTYLED */
+/* vdev_cache.c */
+
/* vdev_mirror.c */
/* vdev_queue.c */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -495,8 +495,10 @@
{
zfs_acl_node_t *aclnode;
- while ((aclnode = list_remove_head(&aclp->z_acl)))
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
zfs_acl_node_free(aclnode);
+ }
aclp->z_acl_count = 0;
aclp->z_acl_bytes = 0;
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -2220,6 +2220,92 @@
return (0);
}
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
+ pname = ZPL_VERSION_STR;
+ } else {
+ pname = zfs_prop_to_name(prop);
+ }
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_NFSV4;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
/*
* Return true if the corresponding vfs's unmounted flag is set.
* Otherwise return false.
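A hedged usage sketch of the function added above (caller context and the
surrounding error handling are illustrative):

	uint64_t zplver, acltype;

	/* Falls back to ZPL_VERSION if the master node has no entry. */
	if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver) != 0)
		return (SET_ERROR(EIO));

	/* Subsequent calls for the same property hit the objset cache. */
	(void) zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &acltype);

Note the platform-specific default: this FreeBSD copy falls back to
ZFS_ACLTYPE_NFSV4, while the Linux copy later in this diff uses
ZFS_ACLTYPE_OFF.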
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -2069,93 +2069,6 @@
return (error);
}
-/*
- * Read a property stored within the master node.
- */
-int
-zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
-{
- uint64_t *cached_copy = NULL;
-
- /*
- * Figure out where in the objset_t the cached copy would live, if it
- * is available for the requested property.
- */
- if (os != NULL) {
- switch (prop) {
- case ZFS_PROP_VERSION:
- cached_copy = &os->os_version;
- break;
- case ZFS_PROP_NORMALIZE:
- cached_copy = &os->os_normalization;
- break;
- case ZFS_PROP_UTF8ONLY:
- cached_copy = &os->os_utf8only;
- break;
- case ZFS_PROP_CASE:
- cached_copy = &os->os_casesensitivity;
- break;
- default:
- break;
- }
- }
- if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
- *value = *cached_copy;
- return (0);
- }
-
- /*
- * If the property wasn't cached, look up the file system's value for
- * the property. For the version property, we look up a slightly
- * different string.
- */
- const char *pname;
- int error = ENOENT;
- if (prop == ZFS_PROP_VERSION) {
- pname = ZPL_VERSION_STR;
- } else {
- pname = zfs_prop_to_name(prop);
- }
-
- if (os != NULL) {
- ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
- error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
- }
-
- if (error == ENOENT) {
- /* No value set, use the default value */
- switch (prop) {
- case ZFS_PROP_VERSION:
- *value = ZPL_VERSION;
- break;
- case ZFS_PROP_NORMALIZE:
- case ZFS_PROP_UTF8ONLY:
- *value = 0;
- break;
- case ZFS_PROP_CASE:
- *value = ZFS_CASE_SENSITIVE;
- break;
- case ZFS_PROP_ACLTYPE:
- *value = ZFS_ACLTYPE_NFSV4;
- break;
- default:
- return (error);
- }
- error = 0;
- }
-
- /*
- * If one of the methods for getting the property value above worked,
- * copy it into the objset_t's cache.
- */
- if (error == 0 && cached_copy != NULL) {
- *cached_copy = *value;
- }
-
- return (error);
-}
-
-
void
zfs_znode_update_vfs(znode_t *zp)
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -182,11 +182,8 @@
* of that infrastructure we are responsible for incrementing it.
*/
if (current->reclaim_state)
-#ifdef HAVE_RECLAIM_STATE_RECLAIMED
- current->reclaim_state->reclaimed += size >> PAGE_SHIFT;
-#else
current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
-#endif
+
vfree(ptr);
}
@@ -1015,18 +1012,8 @@
ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT((skc->skc_flags & KMC_SLAB) == 0);
-
- *obj = NULL;
-
- /*
- * Since we can't sleep, attempt an emergency allocation to satisfy
- * the request. The only alternative is to fail the allocation, but
- * it's preferable to try. The use of KM_NOSLEEP is expected to be rare.
- */
- if (flags & KM_NOSLEEP)
- return (spl_emergency_alloc(skc, flags, obj));
-
might_sleep();
+ *obj = NULL;
/*
* Before allocating a new slab wait for any reaping to complete and
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
@@ -36,12 +36,6 @@
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
-static uint_t spl_taskq_thread_timeout_ms = 10000;
-/* BEGIN CSTYLED */
-module_param(spl_taskq_thread_timeout_ms, uint, 0644);
-/* END CSTYLED */
-MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
- "Time to require a dynamic thread be idle before it gets cleaned up");
static int spl_taskq_thread_dynamic = 1;
module_param(spl_taskq_thread_dynamic, int, 0444);
@@ -854,37 +848,12 @@
tqt_thread_list) == tqt)
return (0);
- int no_work =
+ return
((tq->tq_nspawn == 0) && /* No threads are being spawned */
(tq->tq_nactive == 0) && /* No threads are handling tasks */
(tq->tq_nthreads > 1) && /* More than 1 thread is running */
(!taskq_next_ent(tq)) && /* There are no pending tasks */
(spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
-
- /*
- * If we would have said stop before, let's instead wait a bit, maybe
- * we'll see more work come our way soon...
- */
- if (no_work) {
- /* if it's 0, we want the old behavior. */
- /* if the taskq is being torn down, we also want to go away. */
- if (spl_taskq_thread_timeout_ms == 0 ||
- !(tq->tq_flags & TASKQ_ACTIVE))
- return (1);
- unsigned long lasttime = tq->lastshouldstop;
- if (lasttime > 0) {
- if (time_after(jiffies, lasttime +
- msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
- return (1);
- else
- return (0);
- } else {
- tq->lastshouldstop = jiffies;
- }
- } else {
- tq->lastshouldstop = 0;
- }
- return (0);
}
static int
@@ -1122,7 +1091,6 @@
tq->tq_flags = (flags | TASKQ_ACTIVE);
tq->tq_next_id = TASKQID_INITIAL;
tq->tq_lowest_id = TASKQID_INITIAL;
- tq->lastshouldstop = 0;
INIT_LIST_HEAD(&tq->tq_free_list);
INIT_LIST_HEAD(&tq->tq_pend_list);
INIT_LIST_HEAD(&tq->tq_prio_list);
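The removed hunks above used the standard Linux jiffies idiom for "idle for
at least N milliseconds". A minimal stand-alone rendering of that idiom
(kernel context assumed; idle_long_enough is an illustrative name):

#include <linux/jiffies.h>

/*
 * Record the first moment the thread had nothing to do, and only agree
 * to exit once that moment is more than timeout_ms in the past.
 */
static int
idle_long_enough(unsigned long *first_idle, unsigned int timeout_ms)
{
	if (*first_idle == 0) {
		*first_idle = jiffies;		/* just went idle */
		return (0);
	}
	return (time_after(jiffies,
	    *first_idle + msecs_to_jiffies(timeout_ms)));
}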
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
--- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -219,11 +219,7 @@
arc_reduce_target_size(ptob(sc->nr_to_scan));
arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
if (current->reclaim_state != NULL)
-#ifdef HAVE_RECLAIM_STATE_RECLAIMED
- current->reclaim_state->reclaimed += sc->nr_to_scan;
-#else
current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
-#endif
/*
* We are experiencing memory pressure which the arc_evict_zthr was
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -493,8 +493,10 @@
{
zfs_acl_node_t *aclnode;
- while ((aclnode = list_remove_head(&aclp->z_acl)))
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
zfs_acl_node_free(aclnode);
+ }
aclp->z_acl_count = 0;
aclp->z_acl_bytes = 0;
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -2052,6 +2052,91 @@
return (0);
}
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION)
+ pname = ZPL_VERSION_STR;
+ else
+ pname = zfs_prop_to_name(prop);
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_OFF;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
/*
* Return true if the corresponding vfs's unmounted flag is set.
* Otherwise return false.
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
@@ -2254,91 +2254,6 @@
return (error);
}
-/*
- * Read a property stored within the master node.
- */
-int
-zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
-{
- uint64_t *cached_copy = NULL;
-
- /*
- * Figure out where in the objset_t the cached copy would live, if it
- * is available for the requested property.
- */
- if (os != NULL) {
- switch (prop) {
- case ZFS_PROP_VERSION:
- cached_copy = &os->os_version;
- break;
- case ZFS_PROP_NORMALIZE:
- cached_copy = &os->os_normalization;
- break;
- case ZFS_PROP_UTF8ONLY:
- cached_copy = &os->os_utf8only;
- break;
- case ZFS_PROP_CASE:
- cached_copy = &os->os_casesensitivity;
- break;
- default:
- break;
- }
- }
- if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
- *value = *cached_copy;
- return (0);
- }
-
- /*
- * If the property wasn't cached, look up the file system's value for
- * the property. For the version property, we look up a slightly
- * different string.
- */
- const char *pname;
- int error = ENOENT;
- if (prop == ZFS_PROP_VERSION)
- pname = ZPL_VERSION_STR;
- else
- pname = zfs_prop_to_name(prop);
-
- if (os != NULL) {
- ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
- error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
- }
-
- if (error == ENOENT) {
- /* No value set, use the default value */
- switch (prop) {
- case ZFS_PROP_VERSION:
- *value = ZPL_VERSION;
- break;
- case ZFS_PROP_NORMALIZE:
- case ZFS_PROP_UTF8ONLY:
- *value = 0;
- break;
- case ZFS_PROP_CASE:
- *value = ZFS_CASE_SENSITIVE;
- break;
- case ZFS_PROP_ACLTYPE:
- *value = ZFS_ACLTYPE_OFF;
- break;
- default:
- return (error);
- }
- error = 0;
- }
-
- /*
- * If one of the methods for getting the property value above worked,
- * copy it into the objset_t's cache.
- */
- if (error == 0 && cached_copy != NULL) {
- *cached_copy = *value;
- }
-
- return (error);
-}
-
#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -54,7 +54,7 @@
static unsigned long zvol_max_discard_blocks = 16384;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
-static unsigned int zvol_open_timeout_ms = 1000;
+static const unsigned int zvol_open_timeout_ms = 1000;
#endif
static unsigned int zvol_threads = 0;
@@ -1612,9 +1612,4 @@
"Process volblocksize blocks per thread");
#endif
-#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
-module_param(zvol_open_timeout_ms, uint, 0644);
-MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
-#endif
-
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
--- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
@@ -160,7 +160,7 @@
"wait | continue | panic", "FAILMODE", failuremode_table,
sfeatures);
zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
- SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL,
+ SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL,
"on | off", "AUTOTRIM", boolean_table, sfeatures);
/* hidden properties */
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -965,7 +965,7 @@
l2arc_dev_t *dev);
/* L2ARC persistence write I/O routines. */
-static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
l2arc_write_callback_t *cb);
/* L2ARC persistence auxiliary routines. */
@@ -6106,7 +6106,8 @@
asize, abd,
ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
- zio_flags | ZIO_FLAG_CANFAIL |
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
acb->acb_zio_head = rzio;
@@ -6675,6 +6676,18 @@
callback->awcb_children_ready(zio, buf, callback->awcb_private);
}
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
static void
arc_write_done(zio_t *zio)
{
@@ -6764,9 +6777,9 @@
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
const zio_prop_t *zp, arc_write_done_func_t *ready,
- arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
- void *private, zio_priority_t priority, int zio_flags,
- const zbookmark_phys_t *zb)
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+ arc_write_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
@@ -6813,6 +6826,7 @@
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_children_ready = children_ready;
+ callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
@@ -6849,7 +6863,8 @@
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
- arc_write_done, callback, priority, zio_flags, zb);
+ arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
return (zio);
}
@@ -7851,7 +7866,8 @@
taskq_destroy(arc_prune_taskq);
mutex_enter(&arc_prune_mtx);
- while ((p = list_remove_head(&arc_prune_list)) != NULL) {
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
zfs_refcount_destroy(&p->p_refcnt);
kmem_free(p, sizeof (*p));
@@ -8159,7 +8175,7 @@
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size;
+ uint64_t size, dev_size, tsize;
/*
* Make sure our globals have meaningful values in case the user
@@ -8176,45 +8192,35 @@
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
+ /*
+ * Make sure the write size does not exceed the size of the cache
+ * device. This is important in l2arc_evict(), otherwise infinite
+ * iteration can occur.
+ */
+ dev_size = dev->l2ad_end - dev->l2ad_start;
+
/* We need to add in the worst case scenario of log block overhead. */
- size += l2arc_log_blk_overhead(size, dev);
+ tsize = size + l2arc_log_blk_overhead(size, dev);
if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
/*
 * Trim ahead of the write size by 64 MB or (l2arc_trim_ahead/100)
 * times the write size, whichever is greater.
*/
- size += MAX(64 * 1024 * 1024,
- (size * l2arc_trim_ahead) / 100);
+ tsize += MAX(64 * 1024 * 1024,
+ (tsize * l2arc_trim_ahead) / 100);
}
- /*
- * Make sure the write size does not exceed the size of the cache
- * device. This is important in l2arc_evict(), otherwise infinite
- * iteration can occur.
- */
- if (size > dev->l2ad_end - dev->l2ad_start) {
+ if (tsize >= dev_size) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
"(guid %llu), resetting them to the default (%d)",
(u_longlong_t)l2arc_log_blk_overhead(size, dev),
(u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
-
size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
- if (l2arc_trim_ahead > 1) {
- cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1");
- l2arc_trim_ahead = 1;
- }
-
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
-
- size += l2arc_log_blk_overhead(size, dev);
- if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
- size += MAX(64 * 1024 * 1024,
- (size * l2arc_trim_ahead) / 100);
- }
}
return (size);
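A worked example of the sizing above (all inputs illustrative): with a cold
pool (arc_warm == B_FALSE) so that boost is added, 1 MiB of log-block
overhead, and l2arc_trim_ahead = 200, the device must fit roughly three
times (size + overhead) per write run:

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t size = 64ULL << 20;		/* write_max + boost */
	uint64_t overhead = 1ULL << 20;		/* log block overhead */
	uint64_t trim_ahead = 200;		/* percent */
	uint64_t tsize = size + overhead;

	tsize += MAX(64ULL << 20, tsize * trim_ahead / 100);
	printf("device must fit %llu MiB per run\n",
	    (unsigned long long)(tsize >> 20));	/* prints 195 MiB */
	return (0);
}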
@@ -8313,14 +8319,20 @@
static void
l2arc_do_free_on_write(void)
{
- l2arc_data_free_t *df;
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
mutex_enter(&l2arc_free_on_write_mtx);
- while ((df = list_remove_head(l2arc_free_on_write)) != NULL) {
+ buflist = l2arc_free_on_write;
+
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
ASSERT3P(df->l2df_abd, !=, NULL);
abd_free(df->l2df_abd);
+ list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
+
mutex_exit(&l2arc_free_on_write_mtx);
}
@@ -8833,7 +8845,7 @@
top:
rerun = B_FALSE;
- if (dev->l2ad_hand + distance > dev->l2ad_end) {
+ if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
/*
* When there is no space to accommodate upcoming writes,
* evict to the end. Then bump the write and evict hands
@@ -9027,7 +9039,7 @@
*/
ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
if (!dev->l2ad_first)
- ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
+ ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
}
}
@@ -9287,13 +9299,7 @@
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
psize);
- /*
- * If the allocated size of this buffer plus the max
- * size for the pending log block exceeds the evicted
- * target size, terminate writing buffers for this run.
- */
- if (write_asize + asize +
- sizeof (l2arc_log_blk_phys_t) > target_sz) {
+ if ((write_asize + asize) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
@@ -9407,14 +9413,8 @@
* arcstat_l2_{size,asize} kstats are updated
* internally.
*/
- if (l2arc_log_blk_insert(dev, hdr)) {
- /*
- * l2ad_hand will be adjusted in
- * l2arc_log_blk_commit().
- */
- write_asize +=
- l2arc_log_blk_commit(dev, pio, cb);
- }
+ if (l2arc_log_blk_insert(dev, hdr))
+ l2arc_log_blk_commit(dev, pio, cb);
zio_nowait(wzio);
}
@@ -10173,7 +10173,8 @@
err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_SPECULATIVE, B_FALSE));
abd_free(abd);
@@ -10493,10 +10494,11 @@
cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
cb->l2rcb_abd = abd_get_from_buf(lb, asize);
pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
(void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
return (pio);
@@ -10562,7 +10564,7 @@
* This function allocates some memory to temporarily hold the serialized
* buffer to be written. This is then released in l2arc_write_done.
*/
-static uint64_t
+static void
l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
{
l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
@@ -10673,8 +10675,6 @@
dev->l2ad_log_ent_idx = 0;
dev->l2ad_log_blk_payload_asize = 0;
dev->l2ad_log_blk_payload_start = 0;
-
- return (asize);
}
/*
diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c
--- a/sys/contrib/openzfs/module/zfs/bplist.c
+++ b/sys/contrib/openzfs/module/zfs/bplist.c
@@ -65,8 +65,9 @@
bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
- while ((bpe = list_remove_head(&bpl->bpl_list))) {
+ while ((bpe = list_head(&bpl->bpl_list))) {
bplist_iterate_last_removed = bpe;
+ list_remove(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
func(arg, &bpe->bpe_blk, tx);
kmem_free(bpe, sizeof (*bpe));
@@ -81,7 +82,10 @@
bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
- while ((bpe = list_remove_head(&bpl->bpl_list)))
+ while ((bpe = list_head(&bpl->bpl_list))) {
+ bplist_iterate_last_removed = bpe;
+ list_remove(&bpl->bpl_list, bpe);
kmem_free(bpe, sizeof (*bpe));
+ }
mutex_exit(&bpl->bpl_lock);
}
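Several hunks in this diff (zfs_acl.c, arc.c, and bplist.c above) replace
list_remove_head() with the older two-step drain. Both idioms empty the
list; side by side, using the bplist entries from the hunk above:

	bplist_entry_t *bpe;

	/* Newer helper being reverted away from in this diff: */
	while ((bpe = list_remove_head(&bpl->bpl_list)) != NULL)
		kmem_free(bpe, sizeof (*bpe));

	/* Older two-step form this diff restores: */
	while ((bpe = list_head(&bpl->bpl_list)) != NULL) {
		list_remove(&bpl->bpl_list, bpe);
		kmem_free(bpe, sizeof (*bpe));
	}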
diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c
--- a/sys/contrib/openzfs/module/zfs/btree.c
+++ b/sys/contrib/openzfs/module/zfs/btree.c
@@ -193,20 +193,14 @@
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
- bt_find_in_buf_f bt_find_in_buf, size_t size)
+ size_t size)
{
- zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
- BTREE_LEAF_SIZE);
+ zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
}
-static void *
-zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
- const void *value, zfs_btree_index_t *where);
-
void
zfs_btree_create_custom(zfs_btree_t *tree,
int (*compar) (const void *, const void *),
- bt_find_in_buf_f bt_find_in_buf,
size_t size, size_t lsize)
{
size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
@@ -214,8 +208,6 @@
ASSERT3U(size, <=, esize / 2);
memset(tree, 0, sizeof (*tree));
tree->bt_compar = compar;
- tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
- zfs_btree_find_in_buf : bt_find_in_buf;
tree->bt_elem_size = size;
tree->bt_leaf_size = lsize;
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
@@ -311,7 +303,7 @@
* element in the last leaf, it's in the last leaf or
* it's not in the tree.
*/
- void *d = tree->bt_find_in_buf(tree,
+ void *d = zfs_btree_find_in_buf(tree,
last_leaf->btl_elems +
last_leaf->btl_hdr.bth_first * size,
last_leaf->btl_hdr.bth_count, value, &idx);
@@ -335,7 +327,7 @@
for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
ASSERT3P(node, !=, NULL);
- void *d = tree->bt_find_in_buf(tree, node->btc_elems,
+ void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
node->btc_hdr.bth_count, value, &idx);
EQUIV(d != NULL, !idx.bti_before);
if (d != NULL) {
@@ -355,7 +347,7 @@
*/
zfs_btree_leaf_t *leaf = (depth == 0 ?
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
- void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
+ void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
leaf->btl_hdr.bth_first * size,
leaf->btl_hdr.bth_count, value, &idx);
@@ -679,7 +671,7 @@
zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
zfs_btree_index_t idx;
ASSERT(zfs_btree_is_core(par_hdr));
- VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
par_hdr->bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
uint32_t offset = idx.bti_offset;
@@ -905,7 +897,7 @@
}
zfs_btree_index_t idx;
zfs_btree_core_t *parent = hdr->bth_parent;
- VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
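
This btree.c group drops the per-tree bt_find_in_buf hook (upstream added it so each comparator could be inlined into a specialized buffer search) and routes every caller back to the generic zfs_btree_find_in_buf(). Conceptually that routine is a binary search over nelems fixed-size elements in a flat buffer, reporting the insertion point on a miss. A sketch under that assumption, using the field names visible in the hunks above (the body is illustrative, not the in-tree implementation):

static void *
find_in_buf_sketch(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where)
{
	uint32_t lo = 0, hi = nelems;

	while (lo < hi) {
		uint32_t mid = lo + (hi - lo) / 2;
		uint8_t *cur = buf + (size_t)mid * tree->bt_elem_size;
		int cmp = tree->bt_compar(cur, value);

		if (cmp == 0) {
			/*
			 * Exact match: bti_before is false, as the
			 * EQUIV() in zfs_btree_find() expects.
			 */
			where->bti_offset = mid;
			where->bti_before = B_FALSE;
			return (cur);
		}
		if (cmp < 0)
			lo = mid + 1;	/* cur < value: search upper half */
		else
			hi = mid;	/* cur > value: search lower half */
	}
	where->bti_offset = lo;		/* insertion point */
	where->bti_before = B_TRUE;
	return (NULL);
}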
diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
--- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c
+++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
@@ -49,12 +49,8 @@
{ "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }
+ { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }
}
};
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -4369,6 +4369,22 @@
rw_exit(&parent_db->db_rwlock);
}
+static void
+dbuf_lightweight_physdone(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dbuf_lightweight_done().
+ */
+ int delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
static void
dbuf_lightweight_done(zio_t *zio)
{
@@ -4387,8 +4403,16 @@
dsl_dataset_block_born(ds, zio->io_bp, tx);
}
- dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
- zio->io_txg);
+ /*
+ * See comment in dbuf_write_done().
+ */
+ if (zio->io_phys_children == 0) {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted, zio->io_txg);
+ } else {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+ }
abd_free(dr->dt.dll.dr_abd);
kmem_free(dr, sizeof (*dr));
@@ -4422,7 +4446,8 @@
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
- dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
+ dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
zio_nowait(dr->dr_zio);
@@ -4764,6 +4789,37 @@
DB_DNODE_EXIT(db);
}
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times). This
+ * allows the DMU to monitor the progress of each logical i/o. For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block. There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ (void) buf;
+ dmu_buf_impl_t *db = arg;
+ objset_t *os = db->db_objset;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dbuf_dirty_record_t *dr;
+ int delta = 0;
+
+ dr = db->db_data_pending;
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dbuf_write_done().
+ */
+ delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
@@ -4838,8 +4894,27 @@
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
- dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
- zio->io_txg);
+ /*
+ * If we didn't do a physical write in this ZIO and we
+ * still ended up here, it means that the space of the
+ * dbuf that we just released (and undirtied) above hasn't
+ * been marked as undirtied in the pool's accounting.
+ *
+ * Thus, we undirty that space in the pool's view of the
+ * world here. For physical writes this type of update
+ * happens in dbuf_write_physdone().
+ *
+ * If we did a physical write, cleanup any rounding errors
+ * that came up due to writing multiple copies of a block
+ * on disk [see dbuf_write_physdone()].
+ */
+ if (zio->io_phys_children == 0) {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted, zio->io_txg);
+ } else {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+ }
kmem_free(dr, sizeof (dbuf_dirty_record_t));
}
@@ -5087,7 +5162,7 @@
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_override_ready, NULL,
+ dbuf_write_override_ready, NULL, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
@@ -5101,7 +5176,7 @@
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_nofill_ready, NULL,
+ dbuf_write_nofill_ready, NULL, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
@@ -5120,8 +5195,9 @@
dr->dr_zio = arc_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
- children_ready_cb, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ children_ready_cb, dbuf_write_physdone,
+ dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
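
The dbuf.c changes reinstate the physdone callback so that dirty space is retired incrementally as each physical child I/O completes, with dbuf_write_done() and dbuf_lightweight_done() absorbing the integer-division remainder. A worked example of why the two paths always sum to exactly dr_accounted (values illustrative):

static void
undirty_accounting_example(void)
{
	uint64_t accounted = 10;	/* dr_accounted, in bytes */
	uint64_t children = 3;		/* zio->io_phys_children */
	uint64_t per_child = accounted / children;	/* 3, per physdone */
	uint64_t remainder = accounted % children;	/* 1, in done */

	/* Three physdone calls plus the done-side remainder: 9 + 1. */
	ASSERT3U(children * per_child + remainder, ==, accounted);
}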
diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c
--- a/sys/contrib/openzfs/module/zfs/ddt.c
+++ b/sys/contrib/openzfs/module/zfs/ddt.c
@@ -1209,19 +1209,10 @@
ASSERT3S(dde->dde_class, <, DDT_CLASSES);
ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
-
- /*
- * This entry already existed (dde_type is real), so it must
- * have refcnt >0 at the start of this txg. We are called from
- * brt_pending_apply(), before frees are issued, so the refcnt
- * can't be lowered yet. Therefore, it must be >0. We assert
- * this because if the order of BRT and DDT interactions were
- * ever to change and the refcnt was ever zero here, then
- * likely further action is required to fill out the DDT entry,
- * and this is a place that is likely to be missed in testing.
- */
- ASSERT3U(ddp->ddp_refcnt, >, 0);
-
+ if (ddp->ddp_refcnt == 0) {
+ /* This should never happen? */
+ ddt_phys_fill(ddp, bp);
+ }
ddt_phys_addref(ddp);
result = B_TRUE;
} else {
diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c
--- a/sys/contrib/openzfs/module/zfs/ddt_zap.c
+++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c
@@ -31,8 +31,8 @@
#include <sys/zap.h>
#include <sys/dmu_tx.h>
-static unsigned int ddt_zap_default_bs = 15;
-static unsigned int ddt_zap_default_ibs = 15;
+static const int ddt_zap_leaf_blockshift = 12;
+static const int ddt_zap_indirect_blockshift = 12;
static int
ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
@@ -43,7 +43,7 @@
flags |= ZAP_FLAG_PRE_HASHED_KEY;
*objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
- ddt_zap_default_bs, ddt_zap_default_ibs,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
DMU_OT_NONE, 0, tx);
return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0);
@@ -166,10 +166,3 @@
ddt_zap_walk,
ddt_zap_count,
};
-
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW,
- "DDT ZAP leaf blockshift");
-ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW,
- "DDT ZAP indirect blockshift");
-/* END CSTYLED */
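
The ddt_zap.c hunks pin the DDT ZAP block shifts at 12 and delete the ZMOD_RW tunables, whose default shift was 15. Shift-to-size arithmetic for the two values (macro names here are illustrative):

#define	DDT_ZAP_SHIFT_FIXED	12	/* reverted constants */
#define	DDT_ZAP_SHIFT_TUNABLE	15	/* removed tunables' default */
_Static_assert((1 << DDT_ZAP_SHIFT_FIXED) == 4096, "4 KiB ZAP blocks");
_Static_assert((1 << DDT_ZAP_SHIFT_TUNABLE) == 32768, "32 KiB ZAP blocks");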
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -1698,7 +1698,7 @@
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
- dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
+ dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
@@ -1864,7 +1864,7 @@
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
- &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
+ &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -1698,7 +1698,7 @@
zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
- &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
+ &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
@@ -1755,8 +1755,9 @@
taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
- while ((dr = list_remove_head(list)) != NULL) {
+ while ((dr = list_head(list)) != NULL) {
ASSERT0(dr->dr_dbuf->db_level);
+ list_remove(list, dr);
zio_nowait(dr->dr_zio);
}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c
--- a/sys/contrib/openzfs/module/zfs/dmu_recv.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c
@@ -1371,8 +1371,8 @@
dnode_t *dn;
abd_t *abd = rrd->abd;
zio_cksum_t bp_cksum = bp->blk_cksum;
- zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY |
- ZIO_FLAG_CANFAIL;
+ zio_flag_t flags = ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL;
if (rwa->raw)
flags |= ZIO_FLAG_RAW;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c
--- a/sys/contrib/openzfs/module/zfs/dmu_send.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_send.c
@@ -1955,7 +1955,7 @@
{
dsl_dataset_t *to_ds = dspp->to_ds;
dsl_pool_t *dp = dspp->dp;
-
+#ifdef _KERNEL
if (dmu_objset_type(os) == DMU_OST_ZFS) {
uint64_t version;
if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
@@ -1964,6 +1964,7 @@
if (version >= ZPL_VERSION_SA)
*featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
}
+#endif
/* raw sends imply large_block_ok */
if ((dspp->rawok || dspp->large_block_ok) &&
@@ -2792,7 +2793,6 @@
}
if (err == 0) {
- owned = B_TRUE;
err = zap_lookup(dspp.dp->dp_meta_objset,
dspp.to_ds->ds_object,
DS_FIELD_RESUME_TOGUID, 8, 1,
@@ -2806,24 +2806,21 @@
sizeof (dspp.saved_toname),
dspp.saved_toname);
}
- /* Only disown if there was an error in the lookups */
- if (owned && (err != 0))
+ if (err != 0)
dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
kmem_strfree(name);
} else {
err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
FTAG, &dspp.to_ds);
- if (err == 0)
- owned = B_TRUE;
}
+ owned = B_TRUE;
} else {
err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
&dspp.to_ds);
}
if (err != 0) {
- /* Note: dsl dataset is not owned at this point */
dsl_pool_rele(dspp.dp, FTAG);
return (err);
}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c
--- a/sys/contrib/openzfs/module/zfs/dmu_tx.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c
@@ -1396,7 +1396,8 @@
{
dmu_tx_callback_t *dcb;
- while ((dcb = list_remove_tail(cb_list)) != NULL) {
+ while ((dcb = list_tail(cb_list)) != NULL) {
+ list_remove(cb_list, dcb);
dcb->dcb_func(dcb->dcb_data, error);
kmem_free(dcb, sizeof (dmu_tx_callback_t));
}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -520,7 +520,8 @@
issued = pf_end - pf_start + ipf_end - ipf_start;
if (issued > 1) {
/* More references on top of taken in dmu_zfetch_prepare(). */
- zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
+ for (int i = 0; i < issued - 1; i++)
+ zfs_refcount_add(&zs->zs_refs, NULL);
} else if (issued == 0) {
/* Some other thread has done our work, so drop the ref. */
if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
--- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -3782,7 +3782,8 @@
if (l == NULL || !list_link_active(&l->list_head))
return;
- while ((snap = list_remove_tail(l)) != NULL) {
+ while ((snap = list_tail(l)) != NULL) {
+ list_remove(l, snap);
dsl_dataset_rele(snap->ds, tag);
kmem_free(snap, sizeof (*snap));
}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c
--- a/sys/contrib/openzfs/module/zfs/dsl_dir.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c
@@ -1490,7 +1490,7 @@
if (tr_cookie == NULL)
return;
- while ((tr = list_remove_head(tr_list)) != NULL) {
+ while ((tr = list_head(tr_list)) != NULL) {
if (tr->tr_ds) {
mutex_enter(&tr->tr_ds->dd_lock);
ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
@@ -1500,6 +1500,7 @@
} else {
arc_tempreserve_clear(tr->tr_size);
}
+ list_remove(tr_list, tr);
kmem_free(tr, sizeof (struct tempreserve));
}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
--- a/sys/contrib/openzfs/module/zfs/dsl_scan.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -234,7 +234,7 @@
static int zfs_free_bpobj_enabled = 1;
/* Error blocks to be scrubbed in one txg. */
-static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
+uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
@@ -573,8 +573,7 @@
* counter to how far we've scanned. We know we're consistent
* up to here.
*/
- scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
- scn->scn_phys.scn_skipped;
+ scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
if (dsl_scan_is_running(scn) &&
spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
@@ -3438,8 +3437,10 @@
* If we were suspended in the middle of processing,
* requeue any unfinished sios and exit.
*/
- while ((sio = list_remove_head(&sio_list)) != NULL)
+ while ((sio = list_head(&sio_list)) != NULL) {
+ list_remove(&sio_list, sio);
scan_io_queue_insert_impl(queue, sio);
+ }
queue->q_zio = NULL;
mutex_exit(q_lock);
@@ -4363,7 +4364,7 @@
* Disabled by default, set zfs_scan_report_txgs to report
* average performance over the last zfs_scan_report_txgs TXGs.
*/
- if (zfs_scan_report_txgs != 0 &&
+ if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 &&
tx->tx_txg % zfs_scan_report_txgs == 0) {
scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
spa_scan_stat_init(spa);
@@ -4565,15 +4566,6 @@
all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
}
-static void
-count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
-{
- if (BP_IS_EMBEDDED(bp))
- return;
- atomic_add_64(&scn->scn_phys.scn_skipped,
- all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
-}
-
static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
@@ -4719,7 +4711,7 @@
count_block(dp->dp_blkstats, bp);
if (phys_birth <= scn->scn_phys.scn_min_txg ||
phys_birth >= scn->scn_phys.scn_max_txg) {
- count_block_skipped(scn, bp, B_TRUE);
+ count_block_issued(spa, bp, B_TRUE);
return (0);
}
@@ -4760,7 +4752,7 @@
if (needs_io && !zfs_no_scrub_io) {
dsl_scan_enqueue(dp, bp, zio_flags, zb);
} else {
- count_block_skipped(scn, bp, B_TRUE);
+ count_block_issued(spa, bp, B_TRUE);
}
/* do not relocate this block */
@@ -4885,7 +4877,6 @@
* with a single operation. It also makes scrubs more sequential and reduces
* the chance that a minor extent change moves it within the B-tree.
*/
-__attribute__((always_inline)) inline
static int
ext_size_compare(const void *x, const void *y)
{
@@ -4894,17 +4885,13 @@
return (TREE_CMP(*a, *b));
}
-ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
- ext_size_compare)
-
static void
ext_size_create(range_tree_t *rt, void *arg)
{
(void) rt;
zfs_btree_t *size_tree = arg;
- zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
- sizeof (uint64_t));
+ zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
}
static void
@@ -5129,9 +5116,9 @@
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
- /* count the block as though we skipped it */
+ /* count the block as though we issued it */
sio2bp(sio, &tmpbp);
- count_block_skipped(scn, &tmpbp, B_FALSE);
+ count_block_issued(spa, &tmpbp, B_FALSE);
sio_free(sio);
}
diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c
--- a/sys/contrib/openzfs/module/zfs/fm.c
+++ b/sys/contrib/openzfs/module/zfs/fm.c
@@ -148,7 +148,8 @@
list_remove(&zevent_list, ev);
/* Remove references to this event in all private file data */
- while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) {
+ while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
+ list_remove(&ev->ev_ze_list, ze);
ze->ze_zevent = NULL;
ze->ze_dropped++;
}
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -1342,7 +1342,6 @@
* Comparison function for the private size-ordered tree using 32-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
-__attribute__((always_inline)) inline
static int
metaslab_rangesize32_compare(const void *x1, const void *x2)
{
@@ -1353,15 +1352,16 @@
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
- return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
}
/*
* Comparison function for the private size-ordered tree using 64-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
-__attribute__((always_inline)) inline
static int
metaslab_rangesize64_compare(const void *x1, const void *x2)
{
@@ -1372,10 +1372,11 @@
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
- return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
}
-
typedef struct metaslab_rt_arg {
zfs_btree_t *mra_bt;
uint32_t mra_floor_shift;
@@ -1411,13 +1412,6 @@
range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}
-
-ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
- range_seg32_t, metaslab_rangesize32_compare)
-
-ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
- range_seg64_t, metaslab_rangesize64_compare)
-
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
@@ -1430,22 +1424,19 @@
size_t size;
int (*compare) (const void *, const void *);
- bt_find_in_buf_f bt_find;
switch (rt->rt_type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = metaslab_rangesize32_compare;
- bt_find = metaslab_rt_find_rangesize32_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = metaslab_rangesize64_compare;
- bt_find = metaslab_rt_find_rangesize64_in_buf;
break;
default:
panic("Invalid range seg type %d", rt->rt_type);
}
- zfs_btree_create(size_tree, compare, bt_find, size);
+ zfs_btree_create(size_tree, compare, size);
mrap->mra_floor_shift = metaslab_by_size_min_shift;
}
@@ -5650,7 +5641,8 @@
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
- zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_add(&mca->mca_alloc_slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
return (B_TRUE);
}
@@ -5664,7 +5656,8 @@
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
- zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_remove(&mca->mca_alloc_slots, zio);
}
static int
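
Both metaslab comparators swap upstream's branchless tie-break for an explicit early return; the two forms are functionally identical. A sketch of the equivalence (the wrapper function is illustrative; TREE_CMP(a, b) is the real avl.h macro, expanding to (((a) > (b)) - ((a) < (b)))):

static int
rangesize_compare_sketch(uint64_t size1, uint64_t size2,
    uint64_t start1, uint64_t start2)
{
	int cmp = TREE_CMP(size1, size2);

	/*
	 * When sizes tie, cmp == 0 and !cmp == 1, so the sum reduces to
	 * the start-address comparison; otherwise !cmp == 0 and cmp
	 * passes through. Same result as:
	 *	cmp != 0 ? cmp : TREE_CMP(start1, start2)
	 */
	return (cmp + !cmp * TREE_CMP(start1, start2));
}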
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -151,7 +151,6 @@
rt->rt_histogram[idx]--;
}
-__attribute__((always_inline)) inline
static int
range_tree_seg32_compare(const void *x1, const void *x2)
{
@@ -164,7 +163,6 @@
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
-__attribute__((always_inline)) inline
static int
range_tree_seg64_compare(const void *x1, const void *x2)
{
@@ -177,7 +175,6 @@
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
-__attribute__((always_inline)) inline
static int
range_tree_seg_gap_compare(const void *x1, const void *x2)
{
@@ -190,15 +187,6 @@
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
-ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
- range_tree_seg32_compare)
-
-ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
- range_tree_seg64_compare)
-
-ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
- range_tree_seg_gap_compare)
-
range_tree_t *
range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
void *arg, uint64_t start, uint64_t shift, uint64_t gap)
@@ -209,27 +197,23 @@
ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
size_t size;
int (*compare) (const void *, const void *);
- bt_find_in_buf_f bt_find;
switch (type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = range_tree_seg32_compare;
- bt_find = range_tree_seg32_find_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = range_tree_seg64_compare;
- bt_find = range_tree_seg64_find_in_buf;
break;
case RANGE_SEG_GAP:
size = sizeof (range_seg_gap_t);
compare = range_tree_seg_gap_compare;
- bt_find = range_tree_seg_gap_find_in_buf;
break;
default:
panic("Invalid range seg type %d", type);
}
- zfs_btree_create(&rt->rt_root, compare, bt_find, size);
+ zfs_btree_create(&rt->rt_root, compare, size);
rt->rt_ops = ops;
rt->rt_gap = gap;
diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c
--- a/sys/contrib/openzfs/module/zfs/refcount.c
+++ b/sys/contrib/openzfs/module/zfs/refcount.c
@@ -36,40 +36,33 @@
static uint_t reference_history = 3; /* tunable */
static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
void
zfs_refcount_init(void)
{
reference_cache = kmem_cache_create("reference_cache",
sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
zfs_refcount_fini(void)
{
kmem_cache_destroy(reference_cache);
-}
-
-static int
-zfs_refcount_compare(const void *x1, const void *x2)
-{
- const reference_t *r1 = (const reference_t *)x1;
- const reference_t *r2 = (const reference_t *)x2;
-
- int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder);
- int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number);
- int cmp = cmp1 ? cmp1 : cmp2;
- return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2));
+ kmem_cache_destroy(reference_history_cache);
}
void
zfs_refcount_create(zfs_refcount_t *rc)
{
mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t),
- offsetof(reference_t, ref_link.a));
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
list_create(&rc->rc_removed, sizeof (reference_t),
- offsetof(reference_t, ref_link.l));
+ offsetof(reference_t, ref_link));
rc->rc_count = 0;
rc->rc_removed_count = 0;
rc->rc_tracked = reference_tracking_enable;
@@ -93,15 +86,19 @@
zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
{
reference_t *ref;
- void *cookie = NULL;
ASSERT3U(rc->rc_count, ==, number);
- while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL)
+ while ((ref = list_head(&rc->rc_list))) {
+ list_remove(&rc->rc_list, ref);
kmem_cache_free(reference_cache, ref);
- avl_destroy(&rc->rc_tree);
+ }
+ list_destroy(&rc->rc_list);
- while ((ref = list_remove_head(&rc->rc_removed)))
+ while ((ref = list_head(&rc->rc_removed))) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
kmem_cache_free(reference_cache, ref);
+ }
list_destroy(&rc->rc_removed);
mutex_destroy(&rc->rc_mtx);
}
@@ -127,10 +124,10 @@
int64_t
zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
{
- reference_t *ref;
+ reference_t *ref = NULL;
int64_t count;
- if (likely(!rc->rc_tracked)) {
+ if (!rc->rc_tracked) {
count = atomic_add_64_nv(&(rc)->rc_count, number);
ASSERT3U(count, >=, number);
return (count);
@@ -139,9 +136,8 @@
ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
ref->ref_holder = holder;
ref->ref_number = number;
- ref->ref_search = B_FALSE;
mutex_enter(&rc->rc_mtx);
- avl_add(&rc->rc_tree, ref);
+ list_insert_head(&rc->rc_list, ref);
rc->rc_count += number;
count = rc->rc_count;
mutex_exit(&rc->rc_mtx);
@@ -155,55 +151,51 @@
return (zfs_refcount_add_many(rc, 1, holder));
}
-void
-zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
-{
- if (likely(!rc->rc_tracked))
- (void) zfs_refcount_add_many(rc, number, holder);
- else for (; number > 0; number--)
- (void) zfs_refcount_add(rc, holder);
-}
-
int64_t
zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
const void *holder)
{
- reference_t *ref, s;
+ reference_t *ref;
int64_t count;
- if (likely(!rc->rc_tracked)) {
+ if (!rc->rc_tracked) {
count = atomic_add_64_nv(&(rc)->rc_count, -number);
ASSERT3S(count, >=, 0);
return (count);
}
- s.ref_holder = holder;
- s.ref_number = number;
- s.ref_search = B_TRUE;
mutex_enter(&rc->rc_mtx);
ASSERT3U(rc->rc_count, >=, number);
- ref = avl_find(&rc->rc_tree, &s, NULL);
- if (unlikely(ref == NULL)) {
- panic("No such hold %p on refcount %llx", holder,
- (u_longlong_t)(uintptr_t)rc);
- return (-1);
- }
- avl_remove(&rc->rc_tree, ref);
- if (reference_history > 0) {
- list_insert_head(&rc->rc_removed, ref);
- if (rc->rc_removed_count >= reference_history) {
- ref = list_remove_tail(&rc->rc_removed);
- kmem_cache_free(reference_cache, ref);
- } else {
- rc->rc_removed_count++;
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count > reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
}
- } else {
- kmem_cache_free(reference_cache, ref);
}
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
}
int64_t
@@ -212,50 +204,34 @@
return (zfs_refcount_remove_many(rc, 1, holder));
}
-void
-zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
-{
- if (likely(!rc->rc_tracked))
- (void) zfs_refcount_remove_many(rc, number, holder);
- else for (; number > 0; number--)
- (void) zfs_refcount_remove(rc, holder);
-}
-
void
zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
{
- avl_tree_t tree;
- list_t removed;
- reference_t *ref;
- void *cookie = NULL;
- uint64_t count;
- uint_t removed_count;
+ int64_t count, removed_count;
+ list_t list, removed;
- avl_create(&tree, zfs_refcount_compare, sizeof (reference_t),
- offsetof(reference_t, ref_link.a));
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
list_create(&removed, sizeof (reference_t),
- offsetof(reference_t, ref_link.l));
+ offsetof(reference_t, ref_link));
mutex_enter(&src->rc_mtx);
count = src->rc_count;
removed_count = src->rc_removed_count;
src->rc_count = 0;
src->rc_removed_count = 0;
- avl_swap(&tree, &src->rc_tree);
+ list_move_tail(&list, &src->rc_list);
list_move_tail(&removed, &src->rc_removed);
mutex_exit(&src->rc_mtx);
mutex_enter(&dst->rc_mtx);
dst->rc_count += count;
dst->rc_removed_count += removed_count;
- if (avl_is_empty(&dst->rc_tree))
- avl_swap(&dst->rc_tree, &tree);
- else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL)
- avl_add(&dst->rc_tree, ref);
+ list_move_tail(&dst->rc_list, &list);
list_move_tail(&dst->rc_removed, &removed);
mutex_exit(&dst->rc_mtx);
- avl_destroy(&tree);
+ list_destroy(&list);
list_destroy(&removed);
}
@@ -263,19 +239,23 @@
zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
const void *current_holder, const void *new_holder)
{
- reference_t *ref, s;
+ reference_t *ref;
+ boolean_t found = B_FALSE;
- if (likely(!rc->rc_tracked))
+ if (!rc->rc_tracked)
return;
- s.ref_holder = current_holder;
- s.ref_number = number;
- s.ref_search = B_TRUE;
mutex_enter(&rc->rc_mtx);
- ref = avl_find(&rc->rc_tree, &s, NULL);
- ASSERT(ref);
- ref->ref_holder = new_holder;
- avl_update(&rc->rc_tree, ref);
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == current_holder &&
+ ref->ref_number == number) {
+ ref->ref_holder = new_holder;
+ found = B_TRUE;
+ break;
+ }
+ }
+ ASSERT(found);
mutex_exit(&rc->rc_mtx);
}
@@ -295,23 +275,21 @@
boolean_t
zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
{
- reference_t *ref, s;
- avl_index_t idx;
- boolean_t res;
+ reference_t *ref;
- if (likely(!rc->rc_tracked))
+ if (!rc->rc_tracked)
return (zfs_refcount_count(rc) > 0);
- s.ref_holder = holder;
- s.ref_number = 0;
- s.ref_search = B_TRUE;
mutex_enter(&rc->rc_mtx);
- ref = avl_find(&rc->rc_tree, &s, &idx);
- if (likely(ref == NULL))
- ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
- res = ref && ref->ref_holder == holder;
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+ }
mutex_exit(&rc->rc_mtx);
- return (res);
+ return (B_FALSE);
}
/*
@@ -322,23 +300,21 @@
boolean_t
zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
{
- reference_t *ref, s;
- avl_index_t idx;
- boolean_t res;
+ reference_t *ref;
- if (likely(!rc->rc_tracked))
+ if (!rc->rc_tracked)
return (B_TRUE);
mutex_enter(&rc->rc_mtx);
- s.ref_holder = holder;
- s.ref_number = 0;
- s.ref_search = B_TRUE;
- ref = avl_find(&rc->rc_tree, &s, &idx);
- if (likely(ref == NULL))
- ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
- res = ref == NULL || ref->ref_holder != holder;
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+ }
+ }
mutex_exit(&rc->rc_mtx);
- return (res);
+ return (B_TRUE);
}
EXPORT_SYMBOL(zfs_refcount_create);
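
The refcount.c revert swaps AVL-indexed holder tracking back to a linear list with a separate kmem cache for the removal history, but the public API is untouched, which is why the call sites changed elsewhere in this diff (dmu_zfetch.c, metaslab.c) only lose the *_few batching helpers. A minimal usage sketch of that API (function shape illustrative; the calls are the real interface):

static void
refcount_usage_sketch(void)
{
	zfs_refcount_t rc;
	char holder;	/* any stable address serves as a holder tag */

	zfs_refcount_create(&rc);
	(void) zfs_refcount_add(&rc, &holder);		/* count -> 1 */
	ASSERT(zfs_refcount_held(&rc, &holder));
	(void) zfs_refcount_remove(&rc, &holder);	/* count -> 0 */
	zfs_refcount_destroy(&rc);	/* asserts the count is zero */
}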
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -33,7 +33,6 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
- * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
*/
/*
@@ -1609,16 +1608,16 @@
{
void *cookie = NULL;
spa_log_sm_t *sls;
- log_summary_entry_t *e;
-
while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
&cookie)) != NULL) {
VERIFY0(sls->sls_mscount);
kmem_free(sls, sizeof (spa_log_sm_t));
}
- while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_head(&spa->spa_log_summary)) {
VERIFY0(e->lse_mscount);
+ list_remove(&spa->spa_log_summary, e);
kmem_free(e, sizeof (log_summary_entry_t));
}
@@ -6875,11 +6874,9 @@
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
- dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
+ if (dsl_scan_resilvering(spa_get_dsl(spa)))
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_RESILVER_IN_PROGRESS));
- }
} else {
if (vdev_rebuild_active(rvd))
return (spa_vdev_exit(spa, NULL, txg,
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -730,7 +730,7 @@
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
- sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -814,7 +814,8 @@
if (spa->spa_root)
spa_strfree(spa->spa_root);
- while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) {
+ while ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ list_remove(&spa->spa_config_list, dp);
if (dp->scd_path != NULL)
spa_strfree(dp->scd_path);
kmem_free(dp, sizeof (spa_config_dirent_t));
@@ -2438,6 +2439,7 @@
zio_init();
dmu_init();
zil_init();
+ vdev_cache_stat_init();
vdev_mirror_stat_init();
vdev_raidz_math_init();
vdev_file_init();
@@ -2461,6 +2463,7 @@
spa_evict_all();
vdev_file_fini();
+ vdev_cache_stat_fini();
vdev_mirror_stat_fini();
vdev_raidz_math_fini();
chksum_fini();
@@ -2611,7 +2614,7 @@
ps->pss_end_time = scn->scn_phys.scn_end_time;
ps->pss_to_examine = scn->scn_phys.scn_to_examine;
ps->pss_examined = scn->scn_phys.scn_examined;
- ps->pss_skipped = scn->scn_phys.scn_skipped;
+ ps->pss_to_process = scn->scn_phys.scn_to_process;
ps->pss_processed = scn->scn_phys.scn_processed;
ps->pss_errors = scn->scn_phys.scn_errors;
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
--- a/sys/contrib/openzfs/module/zfs/txg.c
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -895,10 +895,15 @@
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
- boolean_t res = B_TRUE;
- for (int i = 0; i < TXG_SIZE; i++)
- res &= (tl->tl_head[i] == NULL);
- return (res);
+ mutex_enter(&tl->tl_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ if (!txg_list_empty_impl(tl, i)) {
+ mutex_exit(&tl->tl_lock);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&tl->tl_lock);
+ return (B_TRUE);
}
/*
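
The txg_all_lists_empty() rewrite closes a small race: the old loop read each tl_head[i] without holding tl_lock, so a concurrent insert or remove could be observed inconsistently. A sketch of the per-index helper the new code leans on, assuming the usual convention that tl_head[] is only read under tl_lock (illustrative; the in-tree txg_list_empty_impl may differ in detail):

static boolean_t
txg_list_empty_impl_sketch(txg_list_t *tl, uint64_t txg)
{
	ASSERT(MUTEX_HELD(&tl->tl_lock));
	return (tl->tl_head[txg & TXG_MASK] == NULL);
}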
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -29,7 +29,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Datto Inc. All rights reserved.
* Copyright (c) 2021, Klara Inc.
- * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
+ * Copyright [2021] Hewlett Packard Enterprise Development LP
*/
#include <sys/zfs_context.h>
@@ -715,6 +715,7 @@
offsetof(struct vdev, vdev_dtl_node));
vd->vdev_stat.vs_timestamp = gethrtime();
vdev_queue_init(vd);
+ vdev_cache_init(vd);
return (vd);
}
@@ -1095,6 +1096,7 @@
* Clean up vdev structure.
*/
vdev_queue_fini(vd);
+ vdev_cache_fini(vd);
if (vd->vdev_path)
spa_strfree(vd->vdev_path);
@@ -1718,7 +1720,8 @@
vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
- ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
+ ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@@ -2609,6 +2612,8 @@
vd->vdev_ops->vdev_op_close(vd);
+ vdev_cache_purge(vd);
+
/*
* We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
@@ -2694,17 +2699,6 @@
(void) vdev_validate(vd);
}
- /*
- * Recheck if resilver is still needed and cancel any
- * scheduled resilver if resilver is unneeded.
- */
- if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
- spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
- mutex_exit(&spa->spa_async_lock);
- }
-
/*
* Reassess parent vdev's health.
*/
@@ -4608,9 +4602,11 @@
memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
- for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
- vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
- vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
+ for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
+ vsx->vsx_active_queue[t] =
+ vd->vdev_queue.vq_class[t].vqc_active;
+ vsx->vsx_pend_queue[t] = avl_numnodes(
+ &vd->vdev_queue.vq_class[t].vqc_queued_tree);
}
}
}
@@ -5468,20 +5464,20 @@
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
- if (vq->vq_active > 0) {
+ if (avl_numnodes(&vq->vq_active_tree) > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
- zfs_dbgmsg("slow vdev: %s has %u active IOs",
- vd->vdev_path, vq->vq_active);
+ zfs_dbgmsg("slow vdev: %s has %lu active IOs",
+ vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
- fio = list_head(&vq->vq_active_list);
+ fio = avl_first(&vq->vq_active_tree);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c
new file mode 100644
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_cache.c
@@ -0,0 +1,436 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 64k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. Currently, only
+ * metadata I/O is inflated. A further enhancement could take advantage of
+ * more semantic information about the I/O. And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ *
+ * TODO: Note that with the current ZFS code, it turns out that the
+ * vdev cache is not helpful, and in some cases actually harmful. It
+ * is better if we disable this. Once some time has passed, we should
+ * actually remove this to simplify the code. For now we just disable
+ * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11
+ * has made these same changes.
+ */
+static uint_t zfs_vdev_cache_max = 1 << 14; /* 16KB */
+static uint_t zfs_vdev_cache_size = 0;
+static uint_t zfs_vdev_cache_bshift = 16;
+
+#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
+
+static kstat_t *vdc_ksp = NULL;
+
+typedef struct vdc_stats {
+ kstat_named_t vdc_stat_delegations;
+ kstat_named_t vdc_stat_hits;
+ kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+ { "delegations", KSTAT_DATA_UINT64 },
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 }
+};
+
+#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64);
+
+static inline int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ return (TREE_CMP(ve1->ve_offset, ve2->ve_offset));
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused);
+ if (likely(cmp))
+ return (cmp);
+
+ /*
+ * Among equally old entries, sort by offset to ensure uniqueness.
+ */
+ return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+ ASSERT3P(ve->ve_abd, !=, NULL);
+
+ avl_remove(&vc->vc_lastused_tree, ve);
+ avl_remove(&vc->vc_offset_tree, ve);
+ abd_free(ve->ve_abd);
+ kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache. At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+ vdev_cache_entry_t *ve;
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+ if (zfs_vdev_cache_size == 0)
+ return (NULL);
+
+ /*
+ * If adding a new entry would exceed the cache size,
+ * evict the oldest entry (LRU).
+ */
+ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+ zfs_vdev_cache_size) {
+ ve = avl_first(&vc->vc_lastused_tree);
+ if (ve->ve_fill_io != NULL)
+ return (NULL);
+ ASSERT3U(ve->ve_hits, !=, 0);
+ vdev_cache_evict(vc, ve);
+ }
+
+ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve->ve_offset = offset;
+ ve->ve_lastused = ddi_get_lbolt();
+ ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
+
+ avl_add(&vc->vc_offset_tree, ve);
+ avl_add(&vc->vc_lastused_tree, ve);
+
+ return (ve);
+}
+
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+
+ if (ve->ve_lastused != ddi_get_lbolt()) {
+ avl_remove(&vc->vc_lastused_tree, ve);
+ ve->ve_lastused = ddi_get_lbolt();
+ avl_add(&vc->vc_lastused_tree, ve);
+ }
+
+ ve->ve_hits++;
+ abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *fio)
+{
+ vdev_t *vd = fio->io_vd;
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve = fio->io_private;
+ zio_t *pio;
+
+ ASSERT3U(fio->io_size, ==, VCBS);
+
+ /*
+ * Add data to the cache.
+ */
+ mutex_enter(&vc->vc_lock);
+
+ ASSERT3P(ve->ve_fill_io, ==, fio);
+ ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+ ASSERT3P(ve->ve_abd, ==, fio->io_abd);
+
+ ve->ve_fill_io = NULL;
+
+ /*
+ * Even if this cache line was invalidated by a missed write update,
+ * any reads that were queued up before the missed update are still
+ * valid, so we can satisfy them from this line before we evict it.
+ */
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(fio, &zl)) != NULL)
+ vdev_cache_hit(vc, ve, pio);
+
+ if (fio->io_error || ve->ve_missed_update)
+ vdev_cache_evict(vc, ve);
+
+ mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Read data from the cache. Returns B_TRUE on a cache hit (including
+ * delegation to an in-flight or newly issued fill), B_FALSE on a miss.
+ */
+boolean_t
+vdev_cache_read(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+ zio_t *fio;
+ uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ if (zfs_vdev_cache_size == 0)
+ return (B_FALSE);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+ return (B_FALSE);
+
+ if (zio->io_size > zfs_vdev_cache_max)
+ return (B_FALSE);
+
+ /*
+ * If the I/O straddles two or more cache blocks, don't cache it.
+ */
+ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
+ return (B_FALSE);
+
+ ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = cache_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+
+ if (ve != NULL) {
+ if (ve->ve_missed_update) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ if ((fio = ve->ve_fill_io) != NULL) {
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_delegations);
+ return (B_TRUE);
+ }
+
+ vdev_cache_hit(vc, ve, zio);
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_hits);
+ return (B_TRUE);
+ }
+
+ ve = vdev_cache_allocate(zio);
+
+ if (ve == NULL) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
+ ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
+
+ ve->ve_fill_io = fio;
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_nowait(fio);
+ VDCSTAT_BUMP(vdc_stat_misses);
+
+ return (B_TRUE);
+}
+
+/*
+ * Update cache contents upon write completion.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t io_start = zio->io_offset;
+ uint64_t io_end = io_start + zio->io_size;
+ uint64_t min_offset = P2ALIGN(io_start, VCBS);
+ uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
+ avl_index_t where;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = min_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+ if (ve == NULL)
+ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+ while (ve != NULL && ve->ve_offset < max_offset) {
+ uint64_t start = MAX(ve->ve_offset, io_start);
+ uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
+
+ if (ve->ve_fill_io != NULL) {
+ ve->ve_missed_update = 1;
+ } else {
+ abd_copy_off(ve->ve_abd, zio->io_abd,
+ start - ve->ve_offset, start - io_start,
+ end - start);
+ }
+ ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+ }
+ mutex_exit(&vc->vc_lock);
+}
+
+void
+vdev_cache_purge(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+}
+
+void
+vdev_cache_init(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_offset_node));
+
+ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_lastused_node));
+}
+
+void
+vdev_cache_fini(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ vdev_cache_purge(vd);
+
+ avl_destroy(&vc->vc_offset_tree);
+ avl_destroy(&vc->vc_lastused_tree);
+
+ mutex_destroy(&vc->vc_lock);
+}
+
+void
+vdev_cache_stat_init(void)
+{
+ vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (vdc_ksp != NULL) {
+ vdc_ksp->ks_data = &vdc_stats;
+ kstat_install(vdc_ksp);
+ }
+}
+
+void
+vdev_cache_stat_fini(void)
+{
+ if (vdc_ksp != NULL) {
+ kstat_delete(vdc_ksp);
+ vdc_ksp = NULL;
+ }
+}
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, UINT, ZMOD_RW,
+ "Inflate reads small than max");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, UINT, ZMOD_RD,
+ "Total size of the per-disk cache");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, UINT, ZMOD_RW,
+ "Shift size to inflate reads too");
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
--- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
@@ -293,16 +293,17 @@
indirect_vsd_t *iv = zio->io_vsd;
indirect_split_t *is;
- while ((is = list_remove_head(&iv->iv_splits)) != NULL) {
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
for (int c = 0; c < is->is_children; c++) {
indirect_child_t *ic = &is->is_child[c];
if (ic->ic_data != NULL)
abd_free(ic->ic_data);
}
+ list_remove(&iv->iv_splits, is);
indirect_child_t *ic;
- while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
- ;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
list_destroy(&is->is_unique_child);
@@ -1658,8 +1659,8 @@
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
indirect_child_t *ic;
- while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
- ;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
is->is_unique_children = 0;
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -486,9 +486,6 @@
if (vd->vdev_isspare)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
- if (flags & VDEV_CONFIG_L2CACHE)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
-
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
--- a/sys/contrib/openzfs/module/zfs/vdev_queue.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -228,6 +228,13 @@
*/
uint_t zfs_vdev_def_queue_depth = 32;
+/*
+ * Allow TRIM I/Os to be aggregated. This should normally not be needed since
+ * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
+ * by the TRIM code in zfs_trim.c.
+ */
+static uint_t zfs_vdev_aggregate_trim = 0;
+
static int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
@@ -242,60 +249,38 @@
return (TREE_PCMP(z1, z2));
}
-#define VDQ_T_SHIFT 29
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+ return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+ ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
+ if (t == ZIO_TYPE_READ)
+ return (&vq->vq_read_offset_tree);
+ else if (t == ZIO_TYPE_WRITE)
+ return (&vq->vq_write_offset_tree);
+ else
+ return (&vq->vq_trim_offset_tree);
+}
static int
-vdev_queue_to_compare(const void *x1, const void *x2)
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
{
const zio_t *z1 = (const zio_t *)x1;
const zio_t *z2 = (const zio_t *)x2;
- int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
- z2->io_timestamp >> VDQ_T_SHIFT);
- int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
- int cmp = tcmp ? tcmp : ocmp;
+ int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
- if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
+ if (likely(cmp))
return (cmp);
return (TREE_PCMP(z1, z2));
}
-static inline boolean_t
-vdev_queue_class_fifo(zio_priority_t p)
-{
- return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
- p == ZIO_PRIORITY_TRIM);
-}
-
-static void
-vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
-{
- zio_priority_t p = zio->io_priority;
- vq->vq_cqueued |= 1U << p;
- if (vdev_queue_class_fifo(p))
- list_insert_tail(&vq->vq_class[p].vqc_list, zio);
- else
- avl_add(&vq->vq_class[p].vqc_tree, zio);
-}
-
-static void
-vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
-{
- zio_priority_t p = zio->io_priority;
- uint32_t empty;
- if (vdev_queue_class_fifo(p)) {
- list_t *list = &vq->vq_class[p].vqc_list;
- list_remove(list, zio);
- empty = list_is_empty(list);
- } else {
- avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
- avl_remove(tree, zio);
- empty = avl_is_empty(tree);
- }
- vq->vq_cqueued &= ~(empty << p);
-}
-
static uint_t
vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
{
@@ -375,7 +360,7 @@
}
static uint_t
-vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
@@ -385,7 +370,7 @@
case ZIO_PRIORITY_ASYNC_READ:
return (zfs_vdev_async_read_max_active);
case ZIO_PRIORITY_ASYNC_WRITE:
- return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
+ return (vdev_queue_max_async_writes(spa));
case ZIO_PRIORITY_SCRUB:
if (vq->vq_ia_active > 0) {
return (MIN(vq->vq_nia_credit,
@@ -429,10 +414,10 @@
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
- uint32_t cq = vq->vq_cqueued;
- zio_priority_t p, p1;
+ spa_t *spa = vq->vq_vdev->vdev_spa;
+ zio_priority_t p, n;
- if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
+ if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
/*
@@ -440,18 +425,14 @@
* Do round-robin to reduce starvation due to zfs_vdev_max_active
* and vq_nia_credit limits.
*/
- p1 = vq->vq_last_prio + 1;
- if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
- p1 = 0;
- for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
- vdev_queue_class_min_active(vq, p))
- goto found;
- }
- for (p = 0; p < p1; p++) {
- if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
- vdev_queue_class_min_active(vq, p))
- goto found;
+ for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
+ p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_min_active(vq, p)) {
+ vq->vq_last_prio = p;
+ return (p);
+ }
}
/*
@@ -459,14 +440,16 @@
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
- vdev_queue_class_max_active(vq, p))
- break;
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_max_active(spa, vq, p)) {
+ vq->vq_last_prio = p;
+ return (p);
+ }
}
-found:
- vq->vq_last_prio = p;
- return (p);
+ /* No eligible queued i/os */
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
}
void
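[Editorial note: the single modulo scan restored in vdev_queue_class_to_issue() above replaces the earlier split two-loop round-robin. A minimal standalone sketch of the same starvation-avoidance rule, assuming hypothetical queued[]/active[]/min_active[] counters in place of the real vdev_queue_t fields:]

static int last_class;	/* analogous to vq_last_prio */

static int
pick_next_class(const int *queued, const int *active,
    const int *min_active, int num_classes)
{
	for (int n = 0; n < num_classes; n++) {
		int c = (last_class + n + 1) % num_classes;
		/* Prefer a class still below its guaranteed minimum. */
		if (queued[c] > 0 && active[c] < min_active[c]) {
			last_class = c;
			return (c);
		}
	}
	return (-1);	/* caller falls through to the max-active scan */
}

[Resuming one slot past the last class issued is what keeps a busy low-numbered class from permanently shadowing the higher-numbered ones.]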
@@ -475,30 +458,42 @@
vdev_queue_t *vq = &vd->vdev_queue;
zio_priority_t p;
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
vq->vq_vdev = vd;
+ taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (vdev_queue_class_fifo(p)) {
- list_create(&vq->vq_class[p].vqc_list,
- sizeof (zio_t),
- offsetof(struct zio, io_queue_node.l));
- } else {
- avl_create(&vq->vq_class[p].vqc_tree,
- vdev_queue_to_compare, sizeof (zio_t),
- offsetof(struct zio, io_queue_node.a));
- }
- }
- avl_create(&vq->vq_read_offset_tree,
+ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
- avl_create(&vq->vq_write_offset_tree,
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ int (*compfn) (const void *, const void *);
+
+ /*
+ * The synchronous/trim i/o queues are dispatched in FIFO rather
+ * than LBA order. This provides more consistent latency for
+ * these i/os.
+ */
+ if (p == ZIO_PRIORITY_SYNC_READ ||
+ p == ZIO_PRIORITY_SYNC_WRITE ||
+ p == ZIO_PRIORITY_TRIM) {
+ compfn = vdev_queue_timestamp_compare;
+ } else {
+ compfn = vdev_queue_offset_compare;
+ }
+ avl_create(vdev_queue_class_tree(vq, p), compfn,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ }
+
vq->vq_last_offset = 0;
- list_create(&vq->vq_active_list, sizeof (struct zio),
- offsetof(struct zio, io_queue_node.l));
- mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
}
void
@@ -506,39 +501,30 @@
{
vdev_queue_t *vq = &vd->vdev_queue;
- for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (vdev_queue_class_fifo(p))
- list_destroy(&vq->vq_class[p].vqc_list);
- else
- avl_destroy(&vq->vq_class[p].vqc_tree);
- }
- avl_destroy(&vq->vq_read_offset_tree);
- avl_destroy(&vq->vq_write_offset_tree);
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+ avl_destroy(vdev_queue_class_tree(vq, p));
+ avl_destroy(&vq->vq_active_tree);
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
- list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
}
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
- zio->io_queue_state = ZIO_QS_QUEUED;
- vdev_queue_class_add(vq, zio);
- if (zio->io_type == ZIO_TYPE_READ)
- avl_add(&vq->vq_read_offset_tree, zio);
- else if (zio->io_type == ZIO_TYPE_WRITE)
- avl_add(&vq->vq_write_offset_tree, zio);
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
- vdev_queue_class_remove(vq, zio);
- if (zio->io_type == ZIO_TYPE_READ)
- avl_remove(&vq->vq_read_offset_tree, zio);
- else if (zio->io_type == ZIO_TYPE_WRITE)
- avl_remove(&vq->vq_write_offset_tree, zio);
- zio->io_queue_state = ZIO_QS_NONE;
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
}
static boolean_t
@@ -560,16 +546,14 @@
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_cactive[zio->io_priority]++;
- vq->vq_active++;
+ vq->vq_class[zio->io_priority].vqc_active++;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (++vq->vq_ia_active == 1)
vq->vq_nia_credit = 1;
} else if (vq->vq_ia_active > 0) {
vq->vq_nia_credit--;
}
- zio->io_queue_state = ZIO_QS_ACTIVE;
- list_insert_tail(&vq->vq_active_list, zio);
+ avl_add(&vq->vq_active_tree, zio);
}
static void
@@ -577,8 +561,7 @@
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_cactive[zio->io_priority]--;
- vq->vq_active--;
+ vq->vq_class[zio->io_priority].vqc_active--;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (--vq->vq_ia_active == 0)
vq->vq_nia_credit = 0;
@@ -586,8 +569,7 @@
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
- list_remove(&vq->vq_active_list, zio);
- zio->io_queue_state = ZIO_QS_NONE;
+ avl_remove(&vq->vq_active_tree, zio);
}
static void
@@ -620,28 +602,29 @@
uint64_t maxgap = 0;
uint64_t size;
uint64_t limit;
+ int maxblocksize;
boolean_t stretch = B_FALSE;
+ avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
+ zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
uint64_t next_offset;
abd_t *abd;
- avl_tree_t *t;
-
- /*
- * TRIM aggregation should not be needed since code in zfs_trim.c can
- * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
- */
- if (zio->io_type == ZIO_TYPE_TRIM)
- return (NULL);
-
- if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
- return (NULL);
+ maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
if (vq->vq_vdev->vdev_nonrot)
limit = zfs_vdev_aggregation_limit_non_rotating;
else
limit = zfs_vdev_aggregation_limit;
- if (limit == 0)
+ limit = MIN(limit, maxblocksize);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
+ return (NULL);
+
+ /*
+	 * While TRIM commands could be aggregated based on offset, this
+ * behavior is disabled until it's determined to be beneficial.
+ */
+ if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
return (NULL);
- limit = MIN(limit, SPA_MAXBLOCKSIZE);
/*
* I/Os to distributed spares are directly dispatched to the dRAID
@@ -652,13 +635,8 @@
first = last = zio;
- if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_type == ZIO_TYPE_READ)
maxgap = zfs_vdev_read_gap_limit;
- t = &vq->vq_read_offset_tree;
- } else {
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
- t = &vq->vq_write_offset_tree;
- }
/*
* We can aggregate I/Os that are sufficiently adjacent and of
@@ -679,7 +657,6 @@
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-optional I/O.
*/
- zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
while ((dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= limit &&
@@ -709,7 +686,7 @@
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
(IO_SPAN(first, dio) <= limit ||
(dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
- IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
+ IO_SPAN(first, dio) <= maxblocksize &&
IO_GAP(last, dio) <= maxgap &&
dio->io_type == zio->io_type) {
last = dio;
@@ -763,7 +740,7 @@
return (NULL);
size = IO_SPAN(first, last);
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, <=, maxblocksize);
abd = abd_alloc_gang();
if (abd == NULL)
@@ -771,7 +748,8 @@
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
abd, size, first->io_type, zio->io_priority,
- flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL);
+ flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+ vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
nio = first;
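[Editorial note: the aggregation path above admits a neighboring i/o only while the whole span stays within the spa_maxblocksize()-clamped limit and, for reads, the hole between consecutive i/os stays within zfs_vdev_read_gap_limit. The admission test in isolation, a sketch with the IO_SPAN/IO_GAP macros expanded inline; first/last/cand are offset-adjacent queued i/os and only the named zio_t fields are assumed:]

static int
may_aggregate(const zio_t *first, const zio_t *last, const zio_t *cand,
    uint64_t limit, uint64_t maxgap)
{
	/* Total span if cand joins, and the gap between last and cand. */
	uint64_t span = cand->io_offset + cand->io_size - first->io_offset;
	uint64_t gap = cand->io_offset - (last->io_offset + last->io_size);

	return (span <= limit && gap <= maxgap &&
	    cand->io_type == first->io_type);
}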
@@ -847,30 +825,19 @@
return (NULL);
}
- if (vdev_queue_class_fifo(p)) {
- zio = list_head(&vq->vq_class[p].vqc_list);
- } else {
- /*
- * For LBA-ordered queues (async / scrub / initializing),
- * issue the I/O which follows the most recently issued I/O
- * in LBA (offset) order, but to avoid starvation only within
- * the same 0.5 second interval as the first I/O.
- */
- tree = &vq->vq_class[p].vqc_tree;
- zio = aio = avl_first(tree);
- if (zio->io_offset < vq->vq_last_offset) {
- vq->vq_io_search.io_timestamp = zio->io_timestamp;
- vq->vq_io_search.io_offset = vq->vq_last_offset;
- zio = avl_find(tree, &vq->vq_io_search, &idx);
- if (zio == NULL) {
- zio = avl_nearest(tree, idx, AVL_AFTER);
- if (zio == NULL ||
- (zio->io_timestamp >> VDQ_T_SHIFT) !=
- (aio->io_timestamp >> VDQ_T_SHIFT))
- zio = aio;
- }
- }
- }
+ /*
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
+ *
+ * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
+ */
+ tree = vdev_queue_class_tree(vq, p);
+ vq->vq_io_search.io_timestamp = 0;
+ vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
+ VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL)
+ zio = avl_first(tree);
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);
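[Editorial note: the probe above is a standard AVL successor query: seed a search key that cannot match a real node, verify that avl_find() misses, then take the neighbor AVL_AFTER the miss point. The same pattern in isolation (avl_find()/avl_nearest()/avl_first() are the real AVL API; the key construction is illustrative):]

avl_index_t where;
zio_t key = { .io_timestamp = 0, .io_offset = last_offset - 1 };

VERIFY3P(avl_find(tree, &key, &where), ==, NULL);
zio_t *next = avl_nearest(tree, where, AVL_AFTER);
if (next == NULL)
	next = avl_first(tree);	/* wrap back to the lowest key */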
@@ -940,7 +907,7 @@
ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
}
- zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();
mutex_enter(&vq->vq_lock);
@@ -1001,6 +968,7 @@
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ avl_tree_t *tree;
/*
* ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1035,11 +1003,12 @@
* Otherwise, the zio is currently active and we cannot change its
* priority.
*/
- if (zio->io_queue_state == ZIO_QS_QUEUED) {
- vdev_queue_class_remove(vq, zio);
+ tree = vdev_queue_class_tree(vq, zio->io_priority);
+ if (avl_find(tree, zio, NULL) == zio) {
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
zio->io_priority = priority;
- vdev_queue_class_add(vq, zio);
- } else if (zio->io_queue_state == ZIO_QS_NONE) {
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
zio->io_priority = priority;
}
@@ -1052,10 +1021,10 @@
* vq_lock mutex use here, instead we prefer to keep it lock free for
* performance.
*/
-uint32_t
+int
vdev_queue_length(vdev_t *vd)
{
- return (vd->vdev_queue.vq_active);
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
}
uint64_t
@@ -1064,22 +1033,15 @@
return (vd->vdev_queue.vq_last_offset);
}
-uint64_t
-vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
-{
- vdev_queue_t *vq = &vd->vdev_queue;
- if (vdev_queue_class_fifo(p))
- return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
- else
- return (avl_numnodes(&vq->vq_class[p].vqc_tree));
-}
-
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
"Max vdev I/O aggregation size");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW,
+ "Allow TRIM I/O to be aggregated");
+
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
"Aggregate read I/O over gap");
diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
--- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
@@ -571,10 +571,8 @@
vdev_rebuild_blkptr_init(&blk, vd, start, size);
uint64_t psize = BP_GET_PSIZE(&blk);
- if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
- vr->vr_pass_bytes_skipped += size;
+ if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
return (0);
- }
mutex_enter(&vr->vr_io_lock);
@@ -788,7 +786,6 @@
vr->vr_pass_start_time = gethrtime();
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
- vr->vr_pass_bytes_skipped = 0;
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@@ -1156,7 +1153,6 @@
vr->vr_pass_start_time);
vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
- vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
mutex_exit(&tvd->vdev_rebuild_lock);
}
diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c
--- a/sys/contrib/openzfs/module/zfs/zap_micro.c
+++ b/sys/contrib/openzfs/module/zfs/zap_micro.c
@@ -285,7 +285,6 @@
}
}
-__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
@@ -296,9 +295,6 @@
(uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}
-ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
- mze_compare)
-
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
@@ -465,7 +461,7 @@
* 62 entries before we have to add 2KB B-tree core node.
*/
zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
- mze_find_in_buf, sizeof (mzap_ent_t), 512);
+ sizeof (mzap_ent_t), 512);
zap_name_t *zn = zap_name_alloc(zap);
for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
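[Editorial note: mze_compare() above orders microzap entries by a single 64-bit key, the 32-bit hash in the high word and the collision differentiator (cd) in the low word. An equivalent standalone comparator, with the TREE_CMP semantics written out over a toy struct:]

struct mze { uint32_t hash; uint32_t cd; };

static int
mze_cmp(const struct mze *a, const struct mze *b)
{
	uint64_t ka = ((uint64_t)a->hash << 32) | a->cd;
	uint64_t kb = ((uint64_t)b->hash << 32) | b->cd;

	return (ka < kb ? -1 : (ka > kb ? 1 : 0));
}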
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
--- a/sys/contrib/openzfs/module/zfs/zfs_fm.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -1522,8 +1522,9 @@
{
recent_events_node_t *entry;
- while ((entry = list_remove_head(&recent_events_list)) != NULL) {
+ while ((entry = list_head(&recent_events_list)) != NULL) {
avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
kmem_free(entry, sizeof (*entry));
}
avl_destroy(&recent_events_tree);
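[Editorial note: this hunk, like the zfs_fuid.c, zfs_onexit.c, and zvol.c hunks below, reverts a list_remove_head() drain to the two-call form available in 2.1.99. The generic shape, sketched with a hypothetical node_t on the illumos-style list(9F) API:]

typedef struct node { list_node_t ln; /* payload would follow */ } node_t;

static void
drain(list_t *lst)
{
	node_t *np;

	while ((np = list_head(lst)) != NULL) {
		list_remove(lst, np);	/* unlink before freeing */
		kmem_free(np, sizeof (*np));
	}
	list_destroy(lst);
}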
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
--- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
@@ -699,15 +699,19 @@
zfs_fuid_t *zfuid;
zfs_fuid_domain_t *zdomain;
- while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL)
+ while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
+ list_remove(&fuidp->z_fuids, zfuid);
kmem_free(zfuid, sizeof (zfs_fuid_t));
+ }
if (fuidp->z_domain_table != NULL)
kmem_free(fuidp->z_domain_table,
(sizeof (char *)) * fuidp->z_domain_cnt);
- while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL)
+ while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
+ list_remove(&fuidp->z_domains, zdomain);
kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
+ }
kmem_free(fuidp, sizeof (zfs_fuid_info_t));
}
diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
--- a/sys/contrib/openzfs/module/zfs/zfs_onexit.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
@@ -87,7 +87,8 @@
zfs_onexit_action_node_t *ap;
mutex_enter(&zo->zo_lock);
- while ((ap = list_remove_head(&zo->zo_actions)) != NULL) {
+ while ((ap = list_head(&zo->zo_actions)) != NULL) {
+ list_remove(&zo->zo_actions, ap);
mutex_exit(&zo->zo_lock);
ap->za_func(ap->za_data);
kmem_free(ap, sizeof (zfs_onexit_action_node_t));
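[Editorial note: the zfs_onexit drain above additionally drops zo_lock around each callback, since za_func() may block or take other locks; the hunk is cut off here, but the loop presumably reacquires the mutex before the next list_head(). The full shape as a hedged sketch:]

mutex_enter(&zo->zo_lock);
while ((ap = list_head(&zo->zo_actions)) != NULL) {
	list_remove(&zo->zo_actions, ap);
	mutex_exit(&zo->zo_lock);	/* callback may block */
	ap->za_func(ap->za_data);
	kmem_free(ap, sizeof (zfs_onexit_action_node_t));
	mutex_enter(&zo->zo_lock);
}
mutex_exit(&zo->zo_lock);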
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -462,12 +462,14 @@
return (SET_ERROR(EINVAL));
}
+ const uint64_t max_blksz = zfsvfs->z_max_blksz;
+
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
+ * Skip this if uio contains loaned arc_buf.
*/
- ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
- if (zfs_uio_prefaultpages(pfbytes, uio)) {
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFAULT));
}
@@ -542,31 +544,10 @@
break;
}
- uint64_t blksz;
- if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
- if (zp->z_blksz > zfsvfs->z_max_blksz &&
- !ISP2(zp->z_blksz)) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- blksz = 1 << highbit64(zp->z_blksz);
- } else {
- blksz = zfsvfs->z_max_blksz;
- }
- blksz = MIN(blksz, P2ROUNDUP(end_size,
- SPA_MINBLOCKSIZE));
- blksz = MAX(blksz, zp->z_blksz);
- } else {
- blksz = zp->z_blksz;
- }
-
arc_buf_t *abuf = NULL;
- ssize_t nbytes = n;
- if (n >= blksz && woff >= zp->z_size &&
- P2PHASE(woff, blksz) == 0 &&
- (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
+ if (n >= max_blksz && woff >= zp->z_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
/*
* This write covers a full block. "Borrow" a buffer
* from the dmu so that we can fill it before we enter
@@ -574,26 +555,18 @@
* holding up the transaction if the data copy hangs
* up on a pagefault (e.g., from an NFS server mapping).
*/
+ size_t cbytes;
+
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
+ max_blksz);
ASSERT(abuf != NULL);
- ASSERT(arc_buf_size(abuf) == blksz);
- if ((error = zfs_uiocopy(abuf->b_data, blksz,
- UIO_WRITE, uio, &nbytes))) {
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes))) {
dmu_return_arcbuf(abuf);
break;
}
- ASSERT3S(nbytes, ==, blksz);
- } else {
- nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
- P2PHASE(woff, blksz));
- if (pfbytes < nbytes) {
- if (zfs_uio_prefaultpages(nbytes, uio)) {
- error = SET_ERROR(EFAULT);
- break;
- }
- pfbytes = nbytes;
- }
+ ASSERT3S(cbytes, ==, max_blksz);
}
/*
@@ -603,7 +576,8 @@
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
DB_DNODE_ENTER(db);
- dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+ MIN(n, max_blksz));
DB_DNODE_EXIT(db);
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -626,10 +600,31 @@
* shrink down lr_length to the appropriate size.
*/
if (lr->lr_length == UINT64_MAX) {
- zfs_grow_blocksize(zp, blksz, tx);
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
zfs_rangelock_reduce(lr, woff, n);
}
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ const ssize_t nbytes =
+ MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
ssize_t tx_bytes;
if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio);
@@ -649,8 +644,12 @@
* zfs_uio_prefaultpages, or prefaultpages may
* error, and we may break the loop early.
*/
- n -= tx_bytes - zfs_uio_resid(uio);
- pfbytes -= tx_bytes - zfs_uio_resid(uio);
+ if (tx_bytes != zfs_uio_resid(uio))
+ n -= tx_bytes - zfs_uio_resid(uio);
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz),
+ uio)) {
+ break;
+ }
continue;
}
#endif
@@ -666,6 +665,15 @@
}
tx_bytes -= zfs_uio_resid(uio);
} else {
+ /* Implied by abuf != NULL: */
+ ASSERT3S(n, >=, max_blksz);
+ ASSERT0(P2PHASE(woff, max_blksz));
+ /*
+ * We can simplify nbytes to MIN(n, max_blksz) since
+ * P2PHASE(woff, max_blksz) is 0, and knowing
+ * n >= max_blksz lets us simplify further:
+ */
+ ASSERT3S(nbytes, ==, max_blksz);
/*
* Thus, we're writing a full block at a block-aligned
* offset and extending the file past EOF.
@@ -750,7 +758,13 @@
break;
ASSERT3S(tx_bytes, ==, nbytes);
n -= nbytes;
- pfbytes -= nbytes;
+
+ if (n > 0) {
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ }
}
zfs_znode_update_vfs(zp);
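[Editorial note: the restored write path above prefaults at most one block of user pages at a time, refaulting before each subsequent chunk so a slow mapping (e.g. NFS) never stalls a held transaction. The loop shape as a sketch, where prefault() and copy_chunk() are hypothetical stand-ins for zfs_uio_prefaultpages() and the DMU copy, and max_blksz is assumed to be a power of two:]

static int
write_chunks(zfs_uio_t *uio, uint64_t woff, uint64_t n, uint64_t max_blksz)
{
	while (n > 0) {
		if (prefault(uio, MIN(n, max_blksz)) != 0)
			return (EFAULT);	/* user pages unavailable */
		/* Copy at most to the end of the current block. */
		uint64_t phase = woff & (max_blksz - 1);	/* P2PHASE */
		uint64_t nbytes = MIN(n, max_blksz - phase);
		copy_chunk(uio, woff, nbytes);
		woff += nbytes;
		n -= nbytes;
	}
	return (0);
}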
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -116,12 +116,8 @@
{ "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
- { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 },
};
static zil_sums_t zil_sums_global;
@@ -150,10 +146,6 @@
static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;
-static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
-static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb);
-static itx_t *zil_itx_clone(itx_t *oitx);
-
static int
zil_bp_compare(const void *x1, const void *x2)
{
@@ -249,10 +241,11 @@
*/
static int
zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
- blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
+ blkptr_t *nbp, void *dst, char **end)
{
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;
@@ -269,7 +262,7 @@
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
- abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+ &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
zio_cksum_t cksum = bp->blk_cksum;
@@ -284,23 +277,23 @@
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
- uint64_t size = BP_GET_LSIZE(bp);
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
- zil_chain_t *zilc = (*abuf)->b_data;
+ zil_chain_t *zilc = abuf->b_data;
char *lr = (char *)(zilc + 1);
+ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
- zilc->zc_nused < sizeof (*zilc) ||
- zilc->zc_nused > size) {
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
error = SET_ERROR(ECKSUM);
} else {
- *begin = lr;
- *end = lr + zilc->zc_nused - sizeof (*zilc);
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
+ memcpy(dst, lr, len);
+ *end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
}
} else {
- char *lr = (*abuf)->b_data;
+ char *lr = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
@@ -308,11 +301,15 @@
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
- *begin = lr;
- *end = lr + zilc->zc_nused;
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
+ memcpy(dst, lr, zilc->zc_nused);
+ *end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
}
}
+
+ arc_buf_destroy(abuf, &abuf);
}
return (error);
@@ -378,12 +375,8 @@
wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
- wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
- wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
- wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
- wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
}
void
@@ -400,12 +393,8 @@
wmsum_fini(&zs->zil_itx_needcopy_bytes);
wmsum_fini(&zs->zil_itx_metaslab_normal_count);
wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
- wmsum_fini(&zs->zil_itx_metaslab_normal_write);
- wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
wmsum_fini(&zs->zil_itx_metaslab_slog_count);
wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
- wmsum_fini(&zs->zil_itx_metaslab_slog_write);
- wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
}
void
@@ -433,18 +422,10 @@
wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
zs->zil_itx_metaslab_normal_bytes.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
- zs->zil_itx_metaslab_normal_write.value.ui64 =
- wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
- zs->zil_itx_metaslab_normal_alloc.value.ui64 =
- wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
zs->zil_itx_metaslab_slog_count.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
zs->zil_itx_metaslab_slog_bytes.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
- zs->zil_itx_metaslab_slog_write.value.ui64 =
- wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
- zs->zil_itx_metaslab_slog_alloc.value.ui64 =
- wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
}
/*
@@ -464,6 +445,7 @@
uint64_t blk_count = 0;
uint64_t lr_count = 0;
blkptr_t blk, next_blk = {{{{0}}}};
+ char *lrbuf, *lrp;
int error = 0;
/*
@@ -481,13 +463,13 @@
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
int reclen;
- char *lrp, *end;
- arc_buf_t *abuf = NULL;
+ char *end = NULL;
if (blk_seq > claim_blk_seq)
break;
@@ -503,10 +485,8 @@
break;
error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
- &lrp, &end, &abuf);
+ lrbuf, &end);
if (error != 0) {
- if (abuf)
- arc_buf_destroy(abuf, &abuf);
if (claimed) {
char name[ZFS_MAX_DATASET_NAME_LEN];
@@ -519,25 +499,20 @@
break;
}
- for (; lrp < end; lrp += reclen) {
+ for (lrp = lrbuf; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
- if (lr->lrc_seq > claim_lr_seq) {
- arc_buf_destroy(abuf, &abuf);
+ if (lr->lrc_seq > claim_lr_seq)
goto done;
- }
error = parse_lr_func(zilog, lr, arg, txg);
- if (error != 0) {
- arc_buf_destroy(abuf, &abuf);
+ if (error != 0)
goto done;
- }
ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
lr_count++;
}
- arc_buf_destroy(abuf, &abuf);
}
done:
zilog->zl_parse_error = error;
@@ -547,6 +522,7 @@
zilog->zl_parse_lr_count = lr_count;
zil_bp_tree_fini(zilog);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
@@ -771,21 +747,20 @@
lwb->lwb_blk = *bp;
lwb->lwb_fastwrite = fastwrite;
lwb->lwb_slog = slog;
- lwb->lwb_indirect = B_FALSE;
- if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
- lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
- lwb->lwb_sz = BP_GET_LSIZE(bp);
- } else {
- lwb->lwb_nused = lwb->lwb_nfilled = 0;
- lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
- }
lwb->lwb_state = LWB_STATE_CLOSED;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_max_txg = txg;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
lwb->lwb_issued_timestamp = 0;
lwb->lwb_issued_txg = 0;
- lwb->lwb_max_txg = txg;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_nused = sizeof (zil_chain_t);
+ lwb->lwb_sz = BP_GET_LSIZE(bp);
+ } else {
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
+ }
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
@@ -799,8 +774,8 @@
{
ASSERT(MUTEX_HELD(&zilog->zl_lock));
ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
- VERIFY(list_is_empty(&lwb->lwb_waiters));
- VERIFY(list_is_empty(&lwb->lwb_itxs));
+ ASSERT(list_is_empty(&lwb->lwb_waiters));
+ ASSERT(list_is_empty(&lwb->lwb_itxs));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
ASSERT3P(lwb->lwb_write_zio, ==, NULL);
ASSERT3P(lwb->lwb_root_zio, ==, NULL);
@@ -1398,14 +1373,9 @@
zil_commit_waiter_t *zcw;
itx_t *itx;
uint64_t txg;
- list_t itxs, waiters;
spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
- list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
- list_create(&waiters, sizeof (zil_commit_waiter_t),
- offsetof(zil_commit_waiter_t, zcw_node));
-
hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
mutex_enter(&zilog->zl_lock);
@@ -1414,6 +1384,9 @@
lwb->lwb_root_zio = NULL;
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+ lwb->lwb_state = LWB_STATE_FLUSH_DONE;
+
if (zilog->zl_last_lwb_opened == lwb) {
/*
* Remember the highest committed log sequence number
@@ -1424,22 +1397,13 @@
zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}
- list_move_tail(&itxs, &lwb->lwb_itxs);
- list_move_tail(&waiters, &lwb->lwb_waiters);
- txg = lwb->lwb_issued_txg;
-
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
- lwb->lwb_state = LWB_STATE_FLUSH_DONE;
-
- mutex_exit(&zilog->zl_lock);
-
- while ((itx = list_remove_head(&itxs)) != NULL)
+ while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
zil_itx_destroy(itx);
- list_destroy(&itxs);
- while ((zcw = list_remove_head(&waiters)) != NULL) {
+ while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
+ ASSERT3P(zcw->zcw_lwb, ==, lwb);
zcw->zcw_lwb = NULL;
/*
* We expect any ZIO errors from child ZIOs to have been
@@ -1464,9 +1428,11 @@
mutex_exit(&zcw->zcw_lock);
}
- list_destroy(&waiters);
+
+ mutex_exit(&zilog->zl_lock);
mutex_enter(&zilog->zl_lwb_io_lock);
+ txg = lwb->lwb_issued_txg;
ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
zilog->zl_lwb_inflight[txg & TXG_MASK]--;
if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
@@ -1700,41 +1666,46 @@
EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
- if (lwb->lwb_root_zio != NULL)
- return;
-
- lwb->lwb_root_zio = zio_root(zilog->zl_spa,
- zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
-
- abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
- BP_GET_LSIZE(&lwb->lwb_blk));
-
- if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
- prio = ZIO_PRIORITY_SYNC_WRITE;
- else
- prio = ZIO_PRIORITY_ASYNC_WRITE;
-
SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
mutex_enter(&zilog->zl_lock);
- if (!lwb->lwb_fastwrite) {
- metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
- lwb->lwb_fastwrite = 1;
- }
+ if (lwb->lwb_root_zio == NULL) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+ BP_GET_LSIZE(&lwb->lwb_blk));
+
+ if (!lwb->lwb_fastwrite) {
+ metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
+ lwb->lwb_fastwrite = 1;
+ }
+
+ if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
- lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0,
- &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
- zil_lwb_write_done, lwb, prio,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
+ lwb->lwb_root_zio = zio_root(zilog->zl_spa,
+ zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
- lwb->lwb_state = LWB_STATE_OPENED;
+ lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
+ zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
+ BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
+ prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
- zil_lwb_set_zio_dependency(zilog, lwb);
- zilog->zl_last_lwb_opened = lwb;
+ lwb->lwb_state = LWB_STATE_OPENED;
+
+ zil_lwb_set_zio_dependency(zilog, lwb);
+ zilog->zl_last_lwb_opened = lwb;
+ }
mutex_exit(&zilog->zl_lock);
+
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
}
/*
@@ -1765,11 +1736,11 @@
static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
/*
- * Close the log block for being issued and allocate the next one.
- * Has to be called under zl_issuer_lock to chain more lwbs.
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
*/
static lwb_t *
-zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs)
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
lwb_t *nlwb = NULL;
zil_chain_t *zilc;
@@ -1777,7 +1748,7 @@
blkptr_t *bp;
dmu_tx_t *tx;
uint64_t txg;
- uint64_t zil_blksz;
+ uint64_t zil_blksz, wsz;
int i, error;
boolean_t slog;
@@ -1786,17 +1757,16 @@
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
- /*
- * If this lwb includes indirect writes, we have to commit before
- * creating the transaction, otherwise we may end up in dead lock.
- */
- if (lwb->lwb_indirect) {
- for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
- itx = list_next(&lwb->lwb_itxs, itx))
- zil_lwb_commit(zilog, lwb, itx);
- lwb->lwb_nused = lwb->lwb_nfilled;
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ bp = &zilc->zc_next_blk;
+ } else {
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+ bp = &zilc->zc_next_blk;
}
+ ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
+
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
@@ -1844,18 +1814,19 @@
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
- DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
- uint64_t, zil_blksz,
- uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
- if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2)
- zilc = (zil_chain_t *)lwb->lwb_buf;
- else
- zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
- bp = &zilc->zc_next_blk;
BP_ZERO(bp);
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
+ if (slog) {
+ ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
+ lwb->lwb_nused);
+ } else {
+ ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
+ lwb->lwb_nused);
+ }
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -1867,68 +1838,17 @@
nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
}
- lwb->lwb_state = LWB_STATE_ISSUED;
-
- dmu_tx_commit(tx);
-
- /*
- * We need to acquire the config lock for the lwb to issue it later.
- * However, if we already have a queue of closed parent lwbs already
- * holding the config lock (but not yet issued), we can't block here
- * waiting on the lock or we will deadlock. In that case we must
- * first issue to parent IOs before waiting on the lock.
- */
- if (ilwbs && !list_is_empty(ilwbs)) {
- if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) {
- lwb_t *tlwb;
- while ((tlwb = list_remove_head(ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, tlwb);
- spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
- }
- } else {
- spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
- }
-
- if (ilwbs)
- list_insert_tail(ilwbs, lwb);
-
- /*
- * If there was an allocation failure then nlwb will be null which
- * forces a txg_wait_synced().
- */
- return (nlwb);
-}
-
-/*
- * Finalize previously closed block and issue the write zio.
- * Does not require locking.
- */
-static void
-zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
-{
- zil_chain_t *zilc;
- int wsz;
-
- /* Actually fill the lwb with the data if not yet. */
- if (!lwb->lwb_indirect) {
- for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
- itx = list_next(&lwb->lwb_itxs, itx))
- zil_lwb_commit(zilog, lwb, itx);
- lwb->lwb_nused = lwb->lwb_nfilled;
- }
-
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
/* For Slim ZIL only write what is used. */
- wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int);
- ASSERT3S(wsz, <=, lwb->lwb_sz);
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_write_zio, wsz);
wsz = lwb->lwb_write_zio->io_size;
- zilc = (zil_chain_t *)lwb->lwb_buf;
} else {
wsz = lwb->lwb_sz;
- zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
}
+
zilc->zc_pad = 0;
zilc->zc_nused = lwb->lwb_nused;
zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
@@ -1938,28 +1858,22 @@
*/
memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
- if (lwb->lwb_slog) {
- ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
- ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
- lwb->lwb_nused);
- ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
- wsz);
- ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
- BP_GET_LSIZE(&lwb->lwb_blk));
- } else {
- ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
- ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
- lwb->lwb_nused);
- ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
- wsz);
- ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
- BP_GET_LSIZE(&lwb->lwb_blk));
- }
- ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER));
+ spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
+
zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb->lwb_issued_timestamp = gethrtime();
+ lwb->lwb_state = LWB_STATE_ISSUED;
+
zio_nowait(lwb->lwb_root_zio);
zio_nowait(lwb->lwb_write_zio);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * If there was an allocation failure then nlwb will be null which
+ * forces a txg_wait_synced().
+ */
+ return (nlwb);
}
/*
@@ -1995,19 +1909,13 @@
sizeof (lr_write_t));
}
-/*
- * Estimate space needed in the lwb for the itx. Allocate more lwbs or
- * split the itx as needed, but don't touch the actual transaction data.
- * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
- * to chain more lwbs.
- */
static lwb_t *
-zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
- itx_t *citx;
- lr_t *lr, *clr;
- lr_write_t *lrw;
- uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
+ lr_t *lrcb, *lrc;
+ lr_write_t *lrwb, *lrw;
+ char *lr_buf;
+ uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
@@ -2015,8 +1923,8 @@
zil_lwb_write_open(zilog, lwb);
- lr = &itx->itx_lr;
- lrw = (lr_write_t *)lr;
+ lrc = &itx->itx_lr;
+ lrw = (lr_write_t *)lrc;
/*
* A commit itx doesn't represent any on-disk state; instead
@@ -2030,23 +1938,24 @@
*
* For more details, see the comment above zil_commit().
*/
- if (lr->lrc_txtype == TX_COMMIT) {
+ if (lrc->lrc_txtype == TX_COMMIT) {
mutex_enter(&zilog->zl_lock);
zil_commit_waiter_link_lwb(itx->itx_private, lwb);
itx->itx_private = NULL;
mutex_exit(&zilog->zl_lock);
- list_insert_tail(&lwb->lwb_itxs, itx);
return (lwb);
}
- if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
+ dpad = dlen - lrw->lr_length;
} else {
- dlen = 0;
+ dlen = dpad = 0;
}
- reclen = lr->lrc_reclen;
+ reclen = lrc->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen);
+ txg = lrc->lrc_txg;
cont:
/*
@@ -2059,7 +1968,7 @@
lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) {
- lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
+ lwb = zil_lwb_write_issue(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_open(zilog, lwb);
@@ -2078,99 +1987,19 @@
}
dnow = MIN(dlen, lwb_sp - reclen);
- if (dlen > dnow) {
- ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
- ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
- citx = zil_itx_clone(itx);
- clr = &citx->itx_lr;
- lr_write_t *clrw = (lr_write_t *)clr;
- clrw->lr_length = dnow;
- lrw->lr_offset += dnow;
- lrw->lr_length -= dnow;
- } else {
- citx = itx;
- clr = lr;
- }
-
- /*
- * We're actually making an entry, so update lrc_seq to be the
- * log record sequence number. Note that this is generally not
- * equal to the itx sequence number because not all transactions
- * are synchronous, and sometimes spa_sync() gets there first.
- */
- clr->lrc_seq = ++zilog->zl_lr_seq;
-
- lwb->lwb_nused += reclen + dnow;
- ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
- ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
-
- zil_lwb_add_txg(lwb, lr->lrc_txg);
- list_insert_tail(&lwb->lwb_itxs, citx);
-
- dlen -= dnow;
- if (dlen > 0) {
- zilog->zl_cur_used += reclen;
- goto cont;
- }
-
- /*
- * We have to really issue all queued LWBs before we may have to
- * wait for a txg sync. Otherwise we may end up in a dead lock.
- */
- if (lr->lrc_txtype == TX_WRITE) {
- boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa);
- if (frozen || itx->itx_wr_state == WR_INDIRECT) {
- lwb_t *tlwb;
- while ((tlwb = list_remove_head(ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, tlwb);
- }
- if (itx->itx_wr_state == WR_INDIRECT)
- lwb->lwb_indirect = B_TRUE;
- if (frozen)
- txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
- }
-
- return (lwb);
-}
-
-/*
- * Fill the actual transaction data into the lwb, following zil_lwb_assign().
- * Does not require locking.
- */
-static void
-zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
-{
- lr_t *lr, *lrb;
- lr_write_t *lrw, *lrwb;
- char *lr_buf;
- uint64_t dlen, reclen;
-
- lr = &itx->itx_lr;
- lrw = (lr_write_t *)lr;
-
- if (lr->lrc_txtype == TX_COMMIT)
- return;
-
- if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
- dlen = P2ROUNDUP_TYPED(
- lrw->lr_length, sizeof (uint64_t), uint64_t);
- } else {
- dlen = 0;
- }
- reclen = lr->lrc_reclen;
- ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
-
- lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
- memcpy(lr_buf, lr, reclen);
- lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */
- lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */
+ lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+ memcpy(lr_buf, lrc, reclen);
+ lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
+ lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
ZIL_STAT_BUMP(zilog, zil_itx_count);
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
*/
- if (lr->lrc_txtype == TX_WRITE) {
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
if (itx->itx_wr_state == WR_COPIED) {
ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
@@ -2181,10 +2010,14 @@
if (itx->itx_wr_state == WR_NEED_COPY) {
dbuf = lr_buf + reclen;
- lrb->lrc_reclen += dlen;
+ lrcb->lrc_reclen += dnow;
+ if (lrwb->lr_length > dnow)
+ lrwb->lr_length = dnow;
+ lrw->lr_offset += dnow;
+ lrw->lr_length -= dnow;
ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
- dlen);
+ dnow);
} else {
ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
dbuf = NULL;
@@ -2211,11 +2044,9 @@
error = zilog->zl_get_data(itx->itx_private,
itx->itx_gen, lrwb, dbuf, lwb,
lwb->lwb_write_zio);
- if (dbuf != NULL && error == 0) {
+ if (dbuf != NULL && error == 0 && dnow == dlen)
/* Zero any padding bytes in the last block. */
- memset((char *)dbuf + lrwb->lr_length, 0,
- dlen - lrwb->lr_length);
- }
+ memset((char *)dbuf + lrwb->lr_length, 0, dpad);
/*
* Typically, the only return values we should see from
@@ -2243,26 +2074,39 @@
error);
zfs_fallthrough;
case EIO:
- if (lwb->lwb_indirect) {
- txg_wait_synced(zilog->zl_dmu_pool,
- lr->lrc_txg);
- } else {
- lwb->lwb_write_zio->io_error = error;
- }
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
zfs_fallthrough;
case ENOENT:
zfs_fallthrough;
case EEXIST:
zfs_fallthrough;
case EALREADY:
- return;
+ return (lwb);
}
}
}
- lwb->lwb_nfilled += reclen + dlen;
- ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
- ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ lrcb->lrc_seq = ++zilog->zl_lr_seq;
+ lwb->lwb_nused += reclen + dnow;
+
+ zil_lwb_add_txg(lwb, txg);
+
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
+ ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+
+ dlen -= dnow;
+ if (dlen > 0) {
+ zilog->zl_cur_used += reclen;
+ goto cont;
+ }
+
+ return (lwb);
}
itx_t *
@@ -2287,16 +2131,6 @@
return (itx);
}
-static itx_t *
-zil_itx_clone(itx_t *oitx)
-{
- itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
- memcpy(itx, oitx, oitx->itx_size);
- itx->itx_callback = NULL;
- itx->itx_callback_data = NULL;
- return (itx);
-}
-
void
zil_itx_destroy(itx_t *itx)
{
@@ -2328,7 +2162,7 @@
/*
* In the general case, commit itxs will not be found
* here, as they'll be committed to an lwb via
- * zil_lwb_assign(), and free'd in that function. Having
+ * zil_lwb_commit(), and free'd in that function. Having
* said that, it is still possible for commit itxs to be
* found here, due to the following race:
*
@@ -2546,10 +2380,10 @@
* This function will traverse the queue of itxs that need to be
* committed, and move them onto the ZIL's zl_itx_commit_list.
*/
-static uint64_t
+static void
zil_get_commit_list(zilog_t *zilog)
{
- uint64_t otxg, txg, wtxg = 0;
+ uint64_t otxg, txg;
list_t *commit_list = &zilog->zl_itx_commit_list;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@@ -2583,22 +2417,10 @@
*/
ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
- list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
- if (unlikely(zilog->zl_suspend > 0)) {
- /*
- * ZIL was just suspended, but we lost the race.
- * Allow all earlier itxs to be committed, but ask
- * caller to do txg_wait_synced(txg) for any new.
- */
- if (!list_is_empty(sync_list))
- wtxg = MAX(wtxg, txg);
- } else {
- list_move_tail(commit_list, sync_list);
- }
+ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
mutex_exit(&itxg->itxg_lock);
}
- return (wtxg);
}
/*
@@ -2739,7 +2561,7 @@
* lwb will be issued to the zio layer to be written to disk.
*/
static void
-zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
+zil_process_commit_list(zilog_t *zilog)
{
spa_t *spa = zilog->zl_spa;
list_t nolwb_itxs;
@@ -2841,23 +2663,18 @@
*/
if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
if (lwb != NULL) {
- lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
- if (lwb == NULL) {
+ lwb = zil_lwb_commit(zilog, itx, lwb);
+
+ if (lwb == NULL)
list_insert_tail(&nolwb_itxs, itx);
- } else if ((zcw->zcw_lwb != NULL &&
- zcw->zcw_lwb != lwb) || zcw->zcw_done) {
- /*
- * Our lwb is done, leave the rest of
- * Our lwb is done; leave the rest of
- * the itx list to somebody else who cares.
- first = B_FALSE;
- break;
- }
+ else
+ list_insert_tail(&lwb->lwb_itxs, itx);
} else {
if (lrc->lrc_txtype == TX_COMMIT) {
zil_commit_waiter_link_nolwb(
itx->itx_private, &nolwb_waiters);
}
+
list_insert_tail(&nolwb_itxs, itx);
}
} else {
@@ -2873,8 +2690,6 @@
* the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details.
*/
- while ((lwb = list_remove_head(ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
/*
@@ -2920,13 +2735,13 @@
* on the system, such that this function will be
* immediately called again (not necessarily by the same
* thread) and this lwb's zio will be issued via
- * zil_lwb_assign(). This way, the lwb is guaranteed to
+ * zil_lwb_commit(). This way, the lwb is guaranteed to
* be "full" when it is issued to disk, and we'll make
* use of the lwb's size the best we can.
*
* 2. If there isn't sufficient ZIL activity occurring on
* the system, such that this lwb's zio isn't issued via
- * zil_lwb_assign(), zil_commit_waiter() will issue the
+ * zil_lwb_commit(), zil_commit_waiter() will issue the
* lwb's zio. If this occurs, the lwb is not guaranteed
* to be "full" by the time its zio is issued, and means
* the size of the lwb was "too large" given the amount
@@ -2958,14 +2773,10 @@
zfs_commit_timeout_pct / 100;
if (sleep < zil_min_commit_timeout ||
lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) {
- lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
+ lwb = zil_lwb_write_issue(zilog, lwb);
zilog->zl_cur_used = 0;
- if (lwb == NULL) {
- while ((lwb = list_remove_head(ilwbs))
- != NULL)
- zil_lwb_write_issue(zilog, lwb);
+ if (lwb == NULL)
zil_commit_writer_stall(zilog);
- }
}
}
}
@@ -2985,17 +2796,12 @@
* not issued, we rely on future calls to zil_commit_writer() to issue
* the lwb, or the timeout mechanism found in zil_commit_waiter().
*/
-static uint64_t
+static void
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
- list_t ilwbs;
- lwb_t *lwb;
- uint64_t wtxg = 0;
-
ASSERT(!MUTEX_HELD(&zilog->zl_lock));
ASSERT(spa_writeable(zilog->zl_spa));
- list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
mutex_enter(&zilog->zl_issuer_lock);
if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
@@ -3020,16 +2826,12 @@
ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
- wtxg = zil_get_commit_list(zilog);
+ zil_get_commit_list(zilog);
zil_prune_commit_list(zilog);
- zil_process_commit_list(zilog, zcw, &ilwbs);
+ zil_process_commit_list(zilog);
out:
mutex_exit(&zilog->zl_issuer_lock);
- while ((lwb = list_remove_head(&ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, lwb);
- list_destroy(&ilwbs);
- return (wtxg);
}
static void
@@ -3056,7 +2858,7 @@
return;
/*
- * In order to call zil_lwb_write_close() we must hold the
+ * In order to call zil_lwb_write_issue() we must hold the
* zilog's "zl_issuer_lock". We can't simply acquire that lock,
* since we're already holding the commit waiter's "zcw_lock",
* and those two locks are acquired in the opposite order
@@ -3074,10 +2876,8 @@
* the waiter is marked "done"), so without this check we could
* wind up with a use-after-free error below.
*/
- if (zcw->zcw_done) {
- lwb = NULL;
+ if (zcw->zcw_done)
goto out;
- }
ASSERT3P(lwb, ==, zcw->zcw_lwb);
@@ -3096,17 +2896,15 @@
* if it's ISSUED or OPENED, and block any other threads that might
* attempt to issue this lwb. For that reason we hold the
* zl_issuer_lock when checking the lwb_state; we must not call
- * zil_lwb_write_close() if the lwb had already been issued.
+ * zil_lwb_write_issue() if the lwb had already been issued.
*
* See the comment above the lwb_state_t structure definition for
* more details on the lwb states, and locking requirements.
*/
if (lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
- lwb = NULL;
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE)
goto out;
- }
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
@@ -3116,7 +2914,7 @@
* since we've reached the commit waiter's timeout and it still
* hasn't been issued.
*/
- lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL);
+ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
@@ -3136,7 +2934,7 @@
if (nlwb == NULL) {
/*
- * When zil_lwb_write_close() returns NULL, this
+ * When zil_lwb_write_issue() returns NULL, this
* indicates zio_alloc_zil() failed to allocate the
* "next" lwb on-disk. When this occurs, the ZIL write
* pipeline must be stalled; see the comment within the
@@ -3158,16 +2956,12 @@
* lock, which occurs prior to calling dmu_tx_commit()
*/
mutex_exit(&zcw->zcw_lock);
- zil_lwb_write_issue(zilog, lwb);
- lwb = NULL;
zil_commit_writer_stall(zilog);
mutex_enter(&zcw->zcw_lock);
}
out:
mutex_exit(&zilog->zl_issuer_lock);
- if (lwb)
- zil_lwb_write_issue(zilog, lwb);
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
}
@@ -3182,7 +2976,7 @@
* waited "long enough" and the lwb is still in the "open" state.
*
* Given a sufficient amount of itxs being generated and written using
- * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
* function. If this does not occur, this secondary responsibility will
* ensure the lwb is issued even if there is no other synchronous
* activity on the system.
@@ -3545,7 +3339,7 @@
zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
zil_commit_itx_assign(zilog, zcw);
- uint64_t wtxg = zil_commit_writer(zilog, zcw);
+ zil_commit_writer(zilog, zcw);
zil_commit_waiter(zilog, zcw);
if (zcw->zcw_zio_error != 0) {
@@ -3560,8 +3354,6 @@
DTRACE_PROBE2(zil__commit__io__error,
zilog_t *, zilog, zil_commit_waiter_t *, zcw);
txg_wait_synced(zilog->zl_dmu_pool, 0);
- } else if (wtxg != 0) {
- txg_wait_synced(zilog->zl_dmu_pool, wtxg);
}
zil_free_commit_waiter(zcw);
@@ -3864,7 +3656,7 @@
/*
* zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
* on the time when the dmu_tx transaction is assigned in
- * zil_lwb_write_close().
+ * zil_lwb_write_issue().
*/
mutex_enter(&zilog->zl_lwb_io_lock);
txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
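[Editorial note: zil_parse() above goes back to copying each log block into one preallocated SPA_OLD_MAXBLOCKSIZE buffer and walking the self-sized records, rather than holding ARC buffers across the walk. The chain walk reduced to a sketch, where read_block() and handle_record() are stand-ins for zil_read_log_block() and the parse callback; lr_t and lrc_reclen are the real types:]

char *lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
blkptr_t blk, next_blk;

for (blk = first_blk; !BP_IS_HOLE(&blk); blk = next_blk) {
	char *end = NULL;
	if (read_block(&blk, &next_blk, lrbuf, &end) != 0)
		break;	/* checksum mismatch or end of claimed chain */
	for (char *lrp = lrbuf; lrp < end;
	    lrp += ((lr_t *)lrp)->lrc_reclen)
		handle_record((lr_t *)lrp);
}
zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);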
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -626,6 +626,8 @@
void
zio_add_child(zio_t *pio, zio_t *cio)
{
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+
/*
* Logical I/Os can have logical, gang, or vdev children.
* Gang I/Os can have gang or vdev children.
@@ -634,7 +636,6 @@
*/
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
- zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
zl->zl_parent = pio;
zl->zl_child = cio;
@@ -643,45 +644,16 @@
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
- uint64_t *countp = pio->io_children[cio->io_child_type];
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- countp[w] += !cio->io_state[w];
+ pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
- mutex_exit(&cio->io_lock);
- mutex_exit(&pio->io_lock);
-}
-
-void
-zio_add_child_first(zio_t *pio, zio_t *cio)
-{
- /*
- * Logical I/Os can have logical, gang, or vdev children.
- * Gang I/Os can have gang or vdev children.
- * Vdev I/Os can only have vdev children.
- * The following ASSERT captures all of these constraints.
- */
- ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
-
- zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
- zl->zl_parent = pio;
- zl->zl_child = cio;
-
- ASSERT(list_is_empty(&cio->io_parent_list));
- list_insert_head(&cio->io_parent_list, zl);
-
- mutex_enter(&pio->io_lock);
-
- ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
-
- uint64_t *countp = pio->io_children[cio->io_child_type];
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- countp[w] += !cio->io_state[w];
-
- list_insert_head(&pio->io_child_list, zl);
+ pio->io_child_count++;
+ cio->io_parent_count++;
+ mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
}
@@ -697,6 +669,9 @@
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
+ pio->io_child_count--;
+ cio->io_parent_count--;
+
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
kmem_cache_free(zio_link_cache, zl);
@@ -871,14 +846,12 @@
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
+ zio->io_bp = (blkptr_t *)bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
if (type != ZIO_TYPE_WRITE ||
- zio->io_child_type == ZIO_CHILD_DDT) {
- zio->io_bp_copy = *bp;
+ zio->io_child_type == ZIO_CHILD_DDT)
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
- } else {
- zio->io_bp = (blkptr_t *)bp;
- }
- zio->io_bp_orig = *bp;
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
@@ -913,7 +886,7 @@
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
- zio_add_child_first(pio, zio);
+ zio_add_child(pio, zio);
}
taskq_init_ent(&zio->io_tqent);
@@ -1189,8 +1162,9 @@
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
- zio_done_func_t *done, void *private, zio_priority_t priority,
- zio_flag_t flags, const zbookmark_phys_t *zb)
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *private, zio_priority_t priority, zio_flag_t flags,
+ const zbookmark_phys_t *zb)
{
zio_t *zio;
@@ -1210,6 +1184,7 @@
zio->io_ready = ready;
zio->io_children_ready = children_ready;
+ zio->io_physdone = physdone;
zio->io_prop = *zp;
/*
@@ -1542,11 +1517,16 @@
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
+
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ zio->io_physdone = pio->io_physdone;
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+ zio->io_logical->io_phys_children++;
+
return (zio);
}
@@ -1634,8 +1614,15 @@
abd_return_buf_copy(zio->io_abd, data, psize);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
}
+ if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
@@ -2730,7 +2717,7 @@
blkptr_t *bp = zio->io_bp;
ASSERT(gio == zio_unique_parent(zio));
- ASSERT(list_is_empty(&zio->io_child_list));
+ ASSERT(zio->io_child_count == 0);
if (zio->io_error)
return;
@@ -2988,7 +2975,7 @@
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
has_data ? abd_get_offset(pio->io_abd, pio->io_size -
resid) : NULL, lsize, lsize, &zp,
- zio_write_gang_member_ready, NULL,
+ zio_write_gang_member_ready, NULL, NULL,
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
@@ -3450,7 +3437,7 @@
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
- zio_ddt_child_write_ready, NULL,
+ zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@@ -3968,6 +3955,9 @@
zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_TRIM)) {
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
+ return (zio);
+
if ((zio = vdev_queue_io(zio)) == NULL)
return (NULL);
@@ -4004,6 +3994,9 @@
vd->vdev_ops != &vdev_draid_spare_ops) {
vdev_queue_io_done(zio);
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_device_injections(vd, zio,
EIO, EILSEQ);
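
Together with the ZIO_FLAG_DONT_CACHE hunks in the read-init path above (level-0 user data and DDT ZAP blocks opt out) and the retry hunk just below (retried I/Os opt out too), these two hunks re-wire the legacy per-vdev read cache into the pipeline. A condensed sketch of the restored hooks, with the surrounding pipeline logic elided:

/* zio_vdev_io_start(): if vdev_cache_read() takes the zio, it is
 * served by (or attached to) the cache and skips both the vdev
 * queue and the device. */
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
	return (zio);

/* zio_vdev_io_done(): completed writes populate the cache so that
 * later nearby reads can be served from memory. */
if (zio->io_type == ZIO_TYPE_WRITE)
	vdev_cache_write(zio);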
@@ -4113,7 +4106,8 @@
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
zio->io_error = 0;
- zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
+ zio->io_flags |= ZIO_FLAG_IO_RETRY |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
zio_requeue_io_start_cut_in_line);
@@ -4153,6 +4147,13 @@
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_physdone != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+ zio->io_physdone(zio->io_logical);
+ }
+
return (zio);
}
@@ -4474,10 +4475,8 @@
zio->io_ready(zio);
}
-#ifdef ZFS_DEBUG
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
-#endif
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -4904,7 +4903,7 @@
return (NULL);
}
- ASSERT(list_is_empty(&zio->io_child_list));
+ ASSERT(zio->io_child_count == 0);
ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -1203,7 +1203,8 @@
* Prefetch is completed, we can do zvol_os_create_minor
* sequentially.
*/
- while ((job = list_remove_head(&minors_list)) != NULL) {
+ while ((job = list_head(&minors_list)) != NULL) {
+ list_remove(&minors_list, job);
if (!job->error)
(void) zvol_os_create_minor(job->name);
kmem_strfree(job->name);
@@ -1310,8 +1311,10 @@
rw_exit(&zvol_state_lock);
/* Drop zvol_state_lock before calling zvol_free() */
- while ((zv = list_remove_head(&free_list)) != NULL)
+ while ((zv = list_head(&free_list)) != NULL) {
+ list_remove(&free_list, zv);
zvol_os_free(zv);
+ }
}
/* Remove minor for this specific volume only */
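
Both zvol.c hunks trade the newer list_remove_head() convenience (added upstream during the 2.2 cycle) for the two-step idiom the 2.1 list API provides; the behavior is identical. Side by side, with the element type abbreviated:

/* 2.2 form, removed by this revert: */
while ((zv = list_remove_head(&free_list)) != NULL)
	zvol_os_free(zv);

/* 2.1 form, restored: take the head, then unlink it explicitly. */
while ((zv = list_head(&free_list)) != NULL) {
	list_remove(&free_list, zv);
	zvol_os_free(zv);
}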
diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run
--- a/sys/contrib/openzfs/tests/runfiles/common.run
+++ b/sys/contrib/openzfs/tests/runfiles/common.run
@@ -128,7 +128,7 @@
'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum',
'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id',
- 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup']
+ 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2']
pre =
post =
tags = ['functional', 'cli_root', 'zdb']
@@ -472,8 +472,7 @@
tags = ['functional', 'cli_root', 'zpool_replace']
[tests/functional/cli_root/zpool_resilver]
-tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart',
- 'zpool_resilver_concurrent']
+tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart']
tags = ['functional', 'cli_root', 'zpool_resilver']
[tests/functional/cli_root/zpool_scrub]
diff --git a/sys/contrib/openzfs/tests/runfiles/freebsd.run b/sys/contrib/openzfs/tests/runfiles/freebsd.run
--- a/sys/contrib/openzfs/tests/runfiles/freebsd.run
+++ b/sys/contrib/openzfs/tests/runfiles/freebsd.run
@@ -25,8 +25,3 @@
[tests/functional/cli_root/zfs_jail:FreeBSD]
tests = ['zfs_jail_001_pos']
tags = ['functional', 'cli_root', 'zfs_jail']
-
-[tests/functional/pam:FreeBSD]
-tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive',
- 'pam_short_password']
-tags = ['functional', 'pam']
diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run
--- a/sys/contrib/openzfs/tests/runfiles/linux.run
+++ b/sys/contrib/openzfs/tests/runfiles/linux.run
@@ -140,8 +140,7 @@
tags = ['functional', 'mount']
[tests/functional/pam:Linux]
-tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive',
- 'pam_short_password']
+tests = ['pam_basic', 'pam_nounmount', 'pam_short_password']
tags = ['functional', 'pam']
[tests/functional/procfs:Linux]
diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in
--- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in
+++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in
@@ -152,7 +152,6 @@
['FAIL', rewind_reason],
'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason],
'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason],
- 'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053],
'privilege/setup': ['SKIP', na_reason],
'refreserv/refreserv_004_pos': ['FAIL', known_reason],
'rootpool/setup': ['SKIP', na_reason],
@@ -164,8 +163,6 @@
known.update({
'cli_root/zfs_receive/receive-o-x_props_override':
['FAIL', known_reason],
- 'cli_root/zpool_resilver/zpool_resilver_concurrent':
- ['SKIP', na_reason],
'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason],
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
@@ -173,7 +170,6 @@
'link_count/link_count_001': ['SKIP', na_reason],
'casenorm/mixed_create_failure': ['FAIL', 13215],
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
- 'rsend/send_raw_ashift': ['SKIP', 14961],
})
elif sys.platform.startswith('linux'):
known.update({
@@ -281,8 +277,6 @@
'mmp/mmp_inactive_import': ['FAIL', known_reason],
'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621],
'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason],
- 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872],
- 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872],
'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason],
'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason],
'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason],
diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c
--- a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c
+++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c
@@ -501,7 +501,7 @@
srandom(seed);
zfs_btree_init();
- zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t));
+ zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t));
/*
* This runs the named negative test. None of them should
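
The btree_test.c hunk tracks another API revert: during the 2.2 cycle zfs_btree_create() gained an extra argument after the comparator (a search helper, passed as NULL in the removed line), and 2.1.99 returns to the three-argument form. A minimal usage sketch under that assumption, reusing the test's own comparator:

zfs_btree_t bt;

zfs_btree_init();
zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t));

uint64_t val = 42;
zfs_btree_add(&bt, &val);	/* copies the element into the tree */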
diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib
--- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib
+++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib
@@ -3706,7 +3706,7 @@
while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do
typeset stat1=$(get_arcstat $stat)
- sleep 0.5
+ sleep 2
typeset stat2=$(get_arcstat $stat)
do_once=false
done
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
@@ -572,7 +572,6 @@
functional/cli_root/zdb/zdb_006_pos.ksh \
functional/cli_root/zdb/zdb_args_neg.ksh \
functional/cli_root/zdb/zdb_args_pos.ksh \
- functional/cli_root/zdb/zdb_backup.ksh \
functional/cli_root/zdb/zdb_block_size_histogram.ksh \
functional/cli_root/zdb/zdb_checksum.ksh \
functional/cli_root/zdb/zdb_decompress.ksh \
@@ -1143,7 +1142,6 @@
functional/cli_root/zpool_resilver/setup.ksh \
functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \
functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \
- functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \
functional/cli_root/zpool_scrub/cleanup.ksh \
functional/cli_root/zpool_scrub/setup.ksh \
functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/ksh
-
-#
-# This file and its contents are supplied under the terms of the
-# Common Development and Distribution License ("CDDL"), version 1.0.
-# You may only use this file in accordance with the terms of version
-# 1.0 of the CDDL.
-#
-# A full copy of the text of the CDDL should have accompanied this
-# source. A copy of the CDDL is also available via the Internet at
-# http://www.illumos.org/license/CDDL.
-#
-
-#
-# Copyright (c) 2023, Klara Inc.
-#
-
-. $STF_SUITE/include/libtest.shlib
-
-write_count=8
-blksize=131072
-
-tmpfile=$TEST_BASE_DIR/tmpfile
-
-function cleanup
-{
- datasetexists $TESTPOOL && destroy_pool $TESTPOOL
- rm $tmpfile.1 $tmpfile.2
-}
-
-log_onexit cleanup
-
-log_assert "Verify that zfs send and zdb -B produce the same stream"
-
-verify_runnable "global"
-verify_disk_count "$DISKS" 2
-
-default_mirror_setup_noexit $DISKS
-file_write -o create -w -f $TESTDIR/file -b $blksize -c $write_count
-
-snap=$TESTPOOL/$TESTFS@snap
-log_must zfs snapshot $snap
-typeset -i objsetid=$(zfs get -Ho value objsetid $snap)
-
-sync_pool $TESTPOOL
-
-log_must eval "zfs send -ecL $snap > $tmpfile.1"
-log_must eval "zdb -B $TESTPOOL/$objsetid ecL > $tmpfile.2"
-
-typeset sum1=$(cat $tmpfile.1 | md5sum)
-typeset sum2=$(cat $tmpfile.2 | md5sum)
-
-log_must test "$sum1" = "$sum2"
-
-log_pass "zfs send and zdb -B produce the same stream"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
-
-#
-# DESCRIPTION:
-# Verify 'zpool clear' doesn't cause concurrent resilvers
-#
-# STRATEGY:
-# 1. Create N(10) virtual disk files.
-# 2. Create draid pool based on the virtual disk files.
-# 3. Fill the filesystem with directories and files.
-# 4. Force-fault 2 vdevs and verify distributed spare is kicked in.
-# 5. Free the distributed spare by replacing the faulty drive.
-# 6. Run zpool clear and verify that it does not initiate 2 resilvers
-# concurrently while distributed spare gets kicked in.
-#
-
-verify_runnable "global"
-
-typeset -ir devs=10
-typeset -ir nparity=1
-typeset -ir ndata=8
-typeset -ir dspare=1
-
-function cleanup
-{
- poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
-
- for i in {0..$devs}; do
- log_must rm -f "$BASEDIR/vdev$i"
- done
-
- for dir in $BASEDIR; do
- if [[ -d $dir ]]; then
- log_must rm -rf $dir
- fi
- done
-
- zed_stop
- zed_cleanup
-}
-
-log_assert "Verify zpool clear on draid pool doesn't cause concurrent resilvers"
-log_onexit cleanup
-
-setup_test_env $TESTPOOL draid${nparity}:${ndata}d:${dspare}s $devs
-
-# ZED needed for sequential resilver
-zed_setup
-log_must zed_start
-
-log_must zpool offline -f $TESTPOOL $BASEDIR/vdev5
-log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
-log_must zpool wait -t resilver $TESTPOOL
-log_must zpool offline -f $TESTPOOL $BASEDIR/vdev6
-
-log_must zpool labelclear -f $BASEDIR/vdev5
-log_must zpool labelclear -f $BASEDIR/vdev6
-
-log_must zpool replace -w $TESTPOOL $BASEDIR/vdev5
-sync_pool $TESTPOOL
-
-log_must zpool events -c
-log_must zpool clear $TESTPOOL
-log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
-log_must zpool wait -t resilver $TESTPOOL
-log_must zpool wait -t scrub $TESTPOOL
-
-nof_resilver=$(zpool events | grep -c resilver_start)
-if [ $nof_resilver = 1 ] ; then
- log_must verify_pool $TESTPOOL
- log_pass "zpool clear on draid pool doesn't cause concurrent resilvers"
-else
- log_fail "FAIL: sequential and healing resilver initiated concurrently"
-fi
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
@@ -25,7 +25,7 @@
is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing"
set -A args "" "-s \",\"" "-v" \
- "-f time,cwc,imnb,imsb"
+ "-f time,zcwc,zimnb,zimsb"
log_assert "zilstat generates output and doesn't return an error code"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
@@ -27,14 +27,15 @@
#
# STRATEGY:
# 1. Create pool with a cache device.
-# 2. Create a random file in that pool and random read for 10 sec.
-# 3. Export pool.
-# 4. Read the amount of log blocks written from the header of the
+# 2. Export and re-import pool without writing any data.
+# 3. Create a random file in that pool and random read for 10 sec.
+# 4. Export pool.
+# 5. Read the amount of log blocks written from the header of the
# L2ARC device.
-# 5. Import pool.
-# 6. Read the amount of log blocks rebuilt in arcstats and compare to
+# 6. Import pool.
+# 7. Read the amount of log blocks rebuilt in arcstats and compare to
# (5).
-# 7. Check if the labels of the L2ARC device are intact.
+# 8. Check if the labels of the L2ARC device are intact.
#
# * We can predict the minimum bytes of L2ARC restored if we subtract
# from the effective size of the cache device the bytes l2arc_evict()
@@ -76,8 +77,10 @@
log_must truncate -s ${cache_sz}M $VDEV_CACHE
-log_must zpool create -f -o ashift=12 $TESTPOOL $VDEV
-log_must zpool add $TESTPOOL cache $VDEV_CACHE
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d $VDIR $TESTPOOL
log_must fio $FIO_SCRIPTS/mkfiles.fio
log_must fio $FIO_SCRIPTS/random_reads.fio
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh
@@ -25,6 +25,5 @@
rmconfig
destroy_pool $TESTPOOL
del_user ${username}
-del_user ${username}rec
del_group pamtestgroup
log_must rm -rf "$runstatedir" $TESTDIRS
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or https://opensource.org/licenses/CDDL-1.0.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-. $STF_SUITE/tests/functional/pam/utilities.kshlib
-
-if [ -n "$ASAN_OPTIONS" ]; then
- export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}')
-fi
-
-log_mustnot ismounted "$TESTPOOL/pam/${username}"
-keystatus unavailable
-
-genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir}"
-
-printf "testpass\nsecondpass\nsecondpass\n" | pamtester -v ${pamservice} ${username} chauthtok
-
-log_mustnot ismounted "$TESTPOOL/pam/${username}"
-keystatus unavailable
-
-echo "secondpass" | pamtester ${pamservice} ${username} open_session
-references 1
-log_must ismounted "$TESTPOOL/pam/${username}"
-keystatus available
-
-printf "secondpass\ntestpass\ntestpass\n" | pamtester -v ${pamservice} ${username} chauthtok
-
-log_must ismounted "$TESTPOOL/pam/${username}"
-log_must ismounted "$TESTPOOL/pam/${username}"
-keystatus available
-
-log_must pamtester ${pamservice} ${username} close_session
-references 0
-log_mustnot ismounted "$TESTPOOL/pam/${username}"
-keystatus unavailable
-
-log_pass "done."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or https://opensource.org/licenses/CDDL-1.0.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-. $STF_SUITE/tests/functional/pam/utilities.kshlib
-
-if [ -n "$ASAN_OPTIONS" ]; then
- export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}')
-fi
-
-username="${username}rec"
-
-# Set up a deeper hierarchy, a mountpoint that doesn't interfere with other tests,
-# and a user which references that mountpoint
-log_must zfs create "$TESTPOOL/pampam"
-log_must zfs create -o mountpoint="$TESTDIR/rec" "$TESTPOOL/pampam/pam"
-echo "recurpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase \
- -o keylocation=prompt "$TESTPOOL/pampam/pam/${username}"
-log_must zfs unmount "$TESTPOOL/pampam/pam/${username}"
-log_must zfs unload-key "$TESTPOOL/pampam/pam/${username}"
-log_must add_user pamtestgroup ${username} "$TESTDIR/rec"
-
-function keystatus {
- log_must [ "$(get_prop keystatus "$TESTPOOL/pampam/pam/${username}")" = "$1" ]
-}
-
-log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}"
-keystatus unavailable
-
-function test_session {
- echo "recurpass" | pamtester ${pamservice} ${username} open_session
- references 1
- log_must ismounted "$TESTPOOL/pampam/pam/${username}"
- keystatus available
-
- log_must pamtester ${pamservice} ${username} close_session
- references 0
- log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}"
- keystatus unavailable
-}
-
-genconfig "homes=$TESTPOOL/pampam/pam prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-genconfig "homes=$TESTPOOL/pampam recursive_homes prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-genconfig "homes=$TESTPOOL recursive_homes prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-genconfig "homes=* recursive_homes prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-log_pass "done."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
@@ -52,7 +52,7 @@
keystatus available
# Change user and dataset password to short one.
-printf "testpass\nshort\nshort\n" | pamtester -v ${pamservice} ${username} chauthtok
+printf "short\nshort\n" | pamtester ${pamservice} ${username} chauthtok
# Unmount and unload key.
log_must pamtester ${pamservice} ${username} close_session
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
@@ -38,8 +38,6 @@
verify_runnable "global"
-log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053"
-
function test_cleanup
{
# reset memory limit to 16M
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
@@ -37,10 +37,6 @@
log_assert "Verify raw sending to pools with greater ashift succeeds"
-if is_freebsd; then
- log_unsupported "Runs too long on FreeBSD 14 (Issue #14961)"
-fi
-
function cleanup
{
rm -f $BACKDIR/fs@*
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
@@ -45,15 +45,6 @@
if ! is_linux ; then
log_unsupported "Only linux supports dd with oflag=dsync for FUA writes"
-else
- if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
- log_unsupported "Disabled while issue #14872 is being worked"
- fi
-
- # Disabled for the CentOS 9 kernel
- if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
- log_unsupported "Disabled while issue #14872 is being worked"
- fi
fi
typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
@@ -44,15 +44,6 @@
verify_runnable "global"
if is_linux ; then
- if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
- log_unsupported "Disabled while issue #14872 is being worked"
- fi
-
- # Disabled for the CentOS 9 kernel
- if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
- log_unsupported "Disabled while issue #14872 is being worked"
- fi
-
# We need '--force' here since the prior tests may leave a filesystem
# on the zvol, and blkdiscard will see that filesystem and print a
# warning unless you force it.
@@ -132,6 +123,7 @@
# Remove old data from previous tests
log_must $trimcmd $zvolpath
+
set_blk_mq 1
log_must_busy zpool export $TESTPOOL
log_must zpool import $TESTPOOL
diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile
--- a/sys/modules/zfs/Makefile
+++ b/sys/modules/zfs/Makefile
@@ -38,7 +38,7 @@
CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS \
-DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \
- -D_SYS_VMEM_H_
+ -D_SYS_VMEM_H_ -DIN_FREEBSD_BASE
.if ${MACHINE_ARCH} == "amd64"
CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \
@@ -295,6 +295,7 @@
uberblock.c \
unique.c \
vdev.c \
+ vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_indirect.c \
diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@@ -653,9 +653,6 @@
/* qat is enabled and existed */
/* #undef HAVE_QAT */
-/* struct reclaim_state has reclaimed */
-/* #undef HAVE_RECLAIM_STATE_RECLAIMED */
-
/* register_shrinker is vararg */
/* #undef HAVE_REGISTER_SHRINKER_VARARG */
@@ -1051,7 +1048,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.0-FreeBSD_g009d3288"
+#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gad0a55461"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@@ -1060,7 +1057,7 @@
/* #undef ZFS_META_DATA */
/* Define the maximum compatible kernel version. */
-#define ZFS_META_KVER_MAX "6.3"
+#define ZFS_META_KVER_MAX "6.2"
/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "3.10"
@@ -1081,10 +1078,10 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
-#define ZFS_META_RELEASE "FreeBSD_g009d3288"
+#define ZFS_META_RELEASE "FreeBSD_gad0a55461"
/* Define the project version. */
-#define ZFS_META_VERSION "2.2.0"
+#define ZFS_META_VERSION "2.1.99"
/* count is located in percpu_ref.data */
/* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.0-rc1-0-g009d3288d"
+#define ZFS_META_GITREV "zfs-2.1.99-1955-gad0a55461"
