Notable upstream pull request merges:
 #15892 -multiple Fast Dedup: Introduce the FDT on-disk format and feature flag
 #15893 -multiple Fast Dedup: “flat” DDT entry format
 #15895 -multiple Fast Dedup: FDT-log feature
 #16239 6be8bf555 zpool: Provide GUID to zpool-reguid(8) with -g
 #16277 -multiple Fast Dedup: prune unique entries
 #16316 5807de90a Fix null ptr deref when renaming a zvol with snaps and snapdev=visible
 #16343 77a797a38 Enable L2 cache of all (MRU+MFU) metadata but MFU data only
 #16446 83f359245 FreeBSD: fix build without kernel option MAC
 #16449 963e6c9f3 Fix incorrect error report on vdev attach/replace
 #16505 b10992582 spa_prop_get: require caller to supply output nvlist

Obtained from:	OpenZFS
OpenZFS commit:	b109925820
Author:	Martin Matuska
Date:	2024-09-09 18:13:02 +02:00
Commit:	e2df9bb441
132 changed files with 7382 additions and 1491 deletions

View File

@ -103,6 +103,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -DHAVE_ISSETUGID
CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h

View File

@ -100,6 +100,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -DHAVE_ISSETUGID -UHAVE_AVX -DRESCUE
CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h

View File

@ -63,10 +63,10 @@ KERNEL_C = \
zfs_fletcher_superscalar4.c \
zfs_namecheck.c \
zfs_prop.c \
zfs_valstr.c \
zpool_prop.c \
zprop_common.c
ARCH_C =
.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386"
ARCH_C += zfs_fletcher_intel.c \
@ -92,6 +92,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include

View File

@ -1,5 +1,7 @@
ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
.PATH: ${ZFSTOP}/lib/libzpool
# ZFS_COMMON_SRCS
.PATH: ${ZFSTOP}/module/zfs
.PATH: ${ZFSTOP}/module/zcommon
@ -14,8 +16,6 @@ ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
.PATH: ${ZFSTOP}/module/os/linux/zfs
.PATH: ${ZFSTOP}/lib/libzpool
.if exists(${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S)
.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}
ATOMIC_SRCS= opensolaris_atomic.S
@ -34,6 +34,7 @@ PACKAGE= zfs
LIB= zpool
USER_C = \
abd_os.c \
kernel.c \
taskq.c \
util.c
@ -51,7 +52,6 @@ KERNEL_C = \
zpool_prop.c \
zprop_common.c \
abd.c \
abd_os.c \
aggsum.c \
arc.c \
arc_os.c \
@ -67,6 +67,7 @@ KERNEL_C = \
dbuf.c \
dbuf_stats.c \
ddt.c \
ddt_log.c \
ddt_stats.c \
ddt_zap.c \
dmu.c \
@ -255,6 +256,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${ZFSTOP}/include/os/freebsd/zfs \
-I${SRCTOP}/cddl/compat/opensolaris/include \

View File

@ -22,6 +22,7 @@ MAN= \
zpool-create.8 \
zpool-destroy.8 \
zpool-detach.8 \
zpool-ddtprune.8 \
zpool-events.8 \
zpool-export.8 \
zpool-features.7 \
@ -66,6 +67,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/cmd/zpool \

View File

@ -15,6 +15,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/module/icp/include \

View File

@ -21,9 +21,11 @@ SYMLINKS= ${BINDIR}/zstream ${BINDIR}/zstreamdump
WARNS?= 2
CFLAGS+= \
-DIN_BASE \
-DZFS_DEBUG \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/module/icp/include \

View File

@ -15,6 +15,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/module/icp/include \
-include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \

View File

@ -18,6 +18,7 @@ CFLAGS+= \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${ZFSTOP}/lib/libspl/include/os/freebsd/spl \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
-DHAVE_ISSETUGID

View File

@ -17,6 +17,7 @@ CFLAGS+= -DIN_BASE
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -I${SRCTOP}/cddl/usr.sbin

View File

@ -12,6 +12,7 @@ CFLAGS+= -DIN_BASE
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include

View File

@ -57,6 +57,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzfs
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h

View File

@ -107,7 +107,7 @@ typedef struct zio_checksum_info {
#include "skein_zfs.c"
#ifdef HAS_ZSTD_ZFS
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
extern int zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n);
#endif
@ -191,7 +191,7 @@ static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{NULL, zle_decompress, 64, "zle"},
{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
{NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
{NULL, zfs_zstd_decompress_buf, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};

View File

@ -238,6 +238,7 @@ contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c optional zfs compile-
contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zfs_namecheck.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zfs_prop.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zfs_valstr.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zpool_prop.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zprop_common.c optional zfs compile-with "${ZFS_C}"
@ -270,6 +271,7 @@ contrib/openzfs/module/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/dataset_kstats.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt_log.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt_stats.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/dmu.c optional zfs compile-with "${ZFS_C}"

View File

@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
Linux-Maximum: 6.9
Linux-Maximum: 6.10
Linux-Minimum: 3.10

View File

@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \
libzfs.la
zhack_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += zhack
CPPCHECKTARGETS += zhack
@ -39,7 +39,7 @@ zhack_LDADD = \
ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += ztest
CPPCHECKTARGETS += ztest

View File

@ -269,8 +269,7 @@ main(int argc, char **argv)
return (MOUNT_USAGE);
}
if (!zfsutil || sloppy ||
libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
}
@ -337,7 +336,7 @@ main(int argc, char **argv)
dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
if (!fake) {
if (zfsutil && !sloppy &&
if (!remount && !sloppy &&
!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint);
if (error) {

View File

@ -1,5 +1,5 @@
raidz_test_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
bin_PROGRAMS += raidz_test
CPPCHECKTARGETS += raidz_test

View File

@ -1,4 +1,4 @@
zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
zdb_CFLAGS = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS)
sbin_PROGRAMS += zdb

View File

@ -33,7 +33,7 @@
* under sponsorship from the FreeBSD Foundation.
* Copyright (c) 2021 Allan Jude
* Copyright (c) 2021 Toomas Soome <tsoome@me.com>
* Copyright (c) 2023, Klara Inc.
* Copyright (c) 2023, 2024, Klara Inc.
* Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
*/
@ -1914,23 +1914,25 @@ dump_log_spacemaps(spa_t *spa)
}
static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
uint64_t index)
{
const ddt_phys_t *ddp = dde->dde_phys;
const ddt_key_t *ddk = &dde->dde_key;
const char *types[4] = { "ditto", "single", "double", "triple" };
const ddt_key_t *ddk = &ddlwe->ddlwe_key;
char blkbuf[BP_SPRINTF_LEN];
blkptr_t blk;
int p;
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
for (p = 0; p < DDT_NPHYS(ddt); p++) {
const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddt_phys_birth(ddp, v) == 0)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("index %llx refcnt %llu %s %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
types[p], blkbuf);
(void) printf("index %llx refcnt %llu phys %d %s\n",
(u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
p, blkbuf);
}
}
@ -1956,11 +1958,37 @@ dump_dedup_ratio(const ddt_stat_t *dds)
dedup, compress, copies, dedup * compress / copies);
}
static void
dump_ddt_log(ddt_t *ddt)
{
for (int n = 0; n < 2; n++) {
ddt_log_t *ddl = &ddt->ddt_log[n];
uint64_t count = avl_numnodes(&ddl->ddl_tree);
if (count == 0)
continue;
printf(DMU_POOL_DDT_LOG ": %lu log entries\n",
zio_checksum_table[ddt->ddt_checksum].ci_name, n, count);
if (dump_opt['D'] < 4)
continue;
ddt_lightweight_entry_t ddlwe;
uint64_t index = 0;
for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
dump_ddt_entry(ddt, &ddlwe, index++);
}
}
}
static void
dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
char name[DDT_NAMELEN];
ddt_entry_t dde;
ddt_lightweight_entry_t ddlwe;
uint64_t walk = 0;
dmu_object_info_t doi;
uint64_t count, dspace, mspace;
@ -2001,8 +2029,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
(void) printf("%s contents:\n\n", name);
while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
dump_dde(ddt, &dde, walk);
while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0)
dump_ddt_entry(ddt, &ddlwe, walk);
ASSERT3U(error, ==, ENOENT);
@ -2017,7 +2045,7 @@ dump_all_ddts(spa_t *spa)
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
@ -2025,6 +2053,7 @@ dump_all_ddts(spa_t *spa)
dump_ddt(ddt, type, class);
}
}
dump_ddt_log(ddt);
}
ddt_get_dedup_stats(spa, &dds_total);
@ -2043,6 +2072,32 @@ dump_all_ddts(spa_t *spa)
}
dump_dedup_ratio(&dds_total);
/*
* Dump a histogram of unique class entry age
*/
if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {
ddt_age_histo_t histogram;
(void) printf("DDT walk unique, building age histogram...\n");
ddt_prune_walk(spa, 0, &histogram);
/*
* print out histogram for unique entry class birth
*/
if (histogram.dah_entries > 0) {
(void) printf("%5s %9s %4s\n",
"age", "blocks", "amnt");
(void) printf("%5s %9s %4s\n",
"-----", "---------", "----");
for (int i = 0; i < HIST_BINS; i++) {
(void) printf("%5d %9d %4d%%\n", 1 << i,
(int)histogram.dah_age_histo[i],
(int)((histogram.dah_age_histo[i] * 100) /
histogram.dah_entries));
}
}
}
}
static void
@ -3287,9 +3342,45 @@ fuid_table_destroy(void)
}
}
/*
* Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on
* a live pool are normally cleaned up during ddt_sync(). We can't do that (and
* wouldn't want to anyway), but if we don't clean up, the entries left on
* ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves.
*
* Note that this is not a particularly efficient way to do this, but
* ddt_remove() is the only public method that can do the work we need, and it
* requires the right locks, etc., to do the job. This is only ever called
* during zdb shutdown, so efficiency is not especially important.
*/
static void
zdb_ddt_cleanup(spa_t *spa)
{
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
continue;
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
ddt_enter(ddt);
ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
while (dde) {
next = AVL_NEXT(&ddt->ddt_tree, dde);
dde->dde_io = NULL;
ddt_remove(ddt, dde);
dde = next;
}
ddt_exit(ddt);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
}
static void
zdb_exit(int reason)
{
if (spa != NULL)
zdb_ddt_cleanup(spa);
if (os != NULL) {
close_objset(os, FTAG);
} else if (spa != NULL) {
@ -4592,7 +4683,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
l2arc_log_blk_phys_t this_lb;
uint64_t asize;
l2arc_log_blkptr_t lbps[2];
abd_t *abd;
zio_cksum_t cksum;
int failed = 0;
l2arc_dev_t dev;
@ -4646,20 +4736,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
case ZIO_COMPRESS_OFF:
break;
default:
abd = abd_alloc_for_io(asize, B_TRUE);
default: {
abd_t *abd = abd_alloc_linear(asize, B_TRUE);
abd_copy_from_buf_off(abd, &this_lb, 0, asize);
if (zio_decompress_data(L2BLK_GET_COMPRESS(
(&lbps[0])->lbp_prop), abd, &this_lb,
asize, sizeof (this_lb), NULL) != 0) {
abd_t dabd;
abd_get_from_buf_struct(&dabd, &this_lb,
sizeof (this_lb));
int err = zio_decompress_data(L2BLK_GET_COMPRESS(
(&lbps[0])->lbp_prop), abd, &dabd,
asize, sizeof (this_lb), NULL);
abd_free(&dabd);
abd_free(abd);
if (err != 0) {
(void) printf("L2ARC block decompression "
"failed\n");
abd_free(abd);
goto out;
}
abd_free(abd);
break;
}
}
if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
byteswap_uint64_array(&this_lb, sizeof (this_lb));
@ -5633,7 +5728,6 @@ static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type)
{
uint64_t refcnt = 0;
int i;
ASSERT(type < ZDB_OT_TOTAL);
@ -5641,8 +5735,167 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
if (zilog && zil_bp_tree_add(zilog, bp) != 0)
return;
/*
* This flag controls if we will issue a claim for the block while
* counting it, to ensure that all blocks are referenced in space maps.
* We don't issue claims if we're not doing leak tracking, because it's
* expensive if the user isn't interested. We also don't claim the
* second or later occurrences of cloned or dedup'd blocks, because we
* already claimed them the first time.
*/
boolean_t do_claim = !dump_opt['L'];
spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
blkptr_t tempbp;
if (BP_GET_DEDUP(bp)) {
/*
* Dedup'd blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* We use the existing dedup system to track what we've seen.
* The first time we see a block, we do a ddt_lookup() to see
* if it exists in the DDT. If we're doing leak tracking, we
* claim the block at this time.
*
* Each time we see a block, we reduce the refcount in the
* entry by one, and add to the size and count of dedup'd
* blocks to report at the end.
*/
ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
ddt_enter(ddt);
/*
* Find the block. This will create the entry in memory, but
* we'll know if that happened by its refcount.
*/
ddt_entry_t *dde = ddt_lookup(ddt, bp);
/*
* ddt_lookup() can return NULL if this block didn't exist
* in the DDT and creating it would take the DDT over its
* quota. Since we got the block from disk, it must exist in
* the DDT, so this can't happen. However, when unique entries
* are pruned, the dedup bit can be set with no corresponding
* entry in the DDT.
*/
if (dde == NULL) {
ddt_exit(ddt);
goto skipped;
}
/* Get the phys for this variant */
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
/*
* This entry may have multiple sets of DVAs. We must claim
* each set the first time we see them in a real block on disk,
* or count them on subsequent occurrences. We don't have a
* convenient way to track the first time we see each variant,
* so we repurpose dde_io as a set of "seen" flag bits. We can
* do this safely in zdb because it never writes, so it will
* never have a writing zio for this block in that pointer.
*/
boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
if (!seen)
dde->dde_io =
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
/* Consume a reference for this block. */
if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
ddt_phys_decref(dde->dde_phys, v);
/*
* If this entry has a single flat phys, it may have been
* extended with additional DVAs at some time in its life.
* This block might be from before it was fully extended, and
* so have fewer DVAs.
*
* If this is the first time we've seen this block, and we
* claimed it as-is, then we would miss the claim on some
* number of DVAs, which would then be seen as leaked.
*
* In all cases, if we've had fewer DVAs, then the asize would
* be too small, and would lead to the pool apparently using
* more space than allocated.
*
* To handle this, we copy the canonical set of DVAs from the
* entry back to the block pointer before we claim it.
*/
if (v == DDT_PHYS_FLAT) {
ASSERT3U(BP_GET_BIRTH(bp), ==,
ddt_phys_birth(dde->dde_phys, v));
tempbp = *bp;
ddt_bp_fill(dde->dde_phys, v, &tempbp,
BP_GET_BIRTH(bp));
bp = &tempbp;
}
if (seen) {
/*
* The second or later time we see this block,
* it's a duplicate and we count it.
*/
zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
zcb->zcb_dedup_blocks++;
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
}
ddt_exit(ddt);
} else if (zcb->zcb_brt_is_active &&
brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount, then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre == NULL) {
/* Not seen before; track it */
uint64_t refcnt =
brt_entry_get_refcount(zcb->zcb_spa, bp);
if (refcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = refcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
} else {
/*
* Second or later occurrence, count it and take a
* refcount.
*/
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
}
}
skipped:
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
@ -5745,71 +5998,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);
if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount, then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre != NULL) {
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
return;
}
uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
if (crefcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = crefcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
}
if (dump_opt['L'])
if (!do_claim)
return;
if (BP_GET_DEDUP(bp)) {
ddt_t *ddt;
ddt_entry_t *dde;
ddt = ddt_select(zcb->zcb_spa, bp);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_FALSE);
if (dde == NULL) {
refcnt = 0;
} else {
ddt_phys_t *ddp = ddt_phys_select(dde, bp);
ddt_phys_decref(ddp);
refcnt = ddp->ddp_refcnt;
if (ddt_phys_total_refcnt(dde) == 0)
ddt_remove(ddt, dde);
}
ddt_exit(ddt);
}
VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
ZIO_FLAG_CANFAIL)));
}
static void
@ -6120,49 +6314,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
return (counts);
}
static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
ddt_bookmark_t ddb = {0};
ddt_entry_t dde;
int error;
int p;
ASSERT(!dump_opt['L']);
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
ddt_phys_t *ddp = dde.dde_phys;
if (ddb.ddb_class == DDT_CLASS_UNIQUE)
return;
ASSERT(ddt_phys_total_refcnt(&dde) > 1);
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
VERIFY(ddt);
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;
ddt_bp_create(ddb.ddb_checksum,
&dde.dde_key, ddp, &blk);
if (p == DDT_PHYS_DITTO) {
zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
} else {
zcb->zcb_dedup_asize +=
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
zcb->zcb_dedup_blocks++;
}
}
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
ASSERT(error == ENOENT);
}
typedef struct checkpoint_sm_exclude_entry_arg {
vdev_t *cseea_vd;
uint64_t cseea_checkpoint_size;
@ -6546,10 +6697,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
zdb_ddt_leak_init(spa, zcb);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
static boolean_t
@ -6814,6 +6961,8 @@ dump_block_stats(spa_t *spa)
int e, c, err;
bp_embedded_type_t i;
ddt_prefetch_all(spa);
zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
@ -6938,7 +7087,6 @@ dump_block_stats(spa_t *spa)
(u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked",
(longlong_t)(total_alloc - total_found));
leaks = B_TRUE;
}
if (tzb->zb_count == 0) {
@ -7272,29 +7420,27 @@ dump_simulated_ddt(spa_t *spa)
spa_config_exit(spa, SCL_CONFIG, FTAG);
while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
ddt_stat_t dds;
uint64_t refcnt = zdde->zdde_ref_blocks;
ASSERT(refcnt != 0);
dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
dds.dds_psize = zdde->zdde_ref_psize / refcnt;
dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1];
dds.dds_ref_blocks = zdde->zdde_ref_blocks;
dds.dds_ref_lsize = zdde->zdde_ref_lsize;
dds.dds_ref_psize = zdde->zdde_ref_psize;
dds.dds_ref_dsize = zdde->zdde_ref_dsize;
dds->dds_blocks += zdde->zdde_ref_blocks / refcnt;
dds->dds_lsize += zdde->zdde_ref_lsize / refcnt;
dds->dds_psize += zdde->zdde_ref_psize / refcnt;
dds->dds_dsize += zdde->zdde_ref_dsize / refcnt;
ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
&dds, 0);
dds->dds_ref_blocks += zdde->zdde_ref_blocks;
dds->dds_ref_lsize += zdde->zdde_ref_lsize;
dds->dds_ref_psize += zdde->zdde_ref_psize;
dds->dds_ref_dsize += zdde->zdde_ref_dsize;
umem_free(zdde, sizeof (*zdde));
}
avl_destroy(&t);
ddt_histogram_stat(&dds_total, &ddh_total);
ddt_histogram_total(&dds_total, &ddh_total);
(void) printf("Simulated DDT histogram:\n");
@ -8022,16 +8168,28 @@ dump_mos_leaks(spa_t *spa)
mos_leak_vdev(spa->spa_root_vdev);
for (uint64_t class = 0; class < DDT_CLASSES; class++) {
for (uint64_t type = 0; type < DDT_TYPES; type++) {
for (uint64_t cksum = 0;
cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
ddt_t *ddt = spa->spa_ddt[cksum];
if (!ddt)
continue;
for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
/* DDT store objects */
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) {
mos_obj_refd(ddt->ddt_object[type][class]);
}
}
/* FDT container */
if (ddt->ddt_version == DDT_VERSION_FDT)
mos_obj_refd(ddt->ddt_dir_object);
/* FDT log objects */
if (ddt->ddt_flags & DDT_FLAG_LOG) {
mos_obj_refd(ddt->ddt_log[0].ddl_object);
mos_obj_refd(ddt->ddt_log[1].ddl_object);
}
}
if (spa->spa_brt != NULL) {
@ -8499,13 +8657,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
memset(lbuf, 0x00, lsize);
memset(lbuf2, 0xff, lsize);
abd_t labd, labd2;
abd_get_from_buf_struct(&labd, lbuf, lsize);
abd_get_from_buf_struct(&labd2, lbuf2, lsize);
boolean_t ret = B_FALSE;
if (zio_decompress_data(cfunc, pabd,
lbuf, psize, lsize, NULL) == 0 &&
&labd, psize, lsize, NULL) == 0 &&
zio_decompress_data(cfunc, pabd,
lbuf2, psize, lsize, NULL) == 0 &&
&labd2, psize, lsize, NULL) == 0 &&
memcmp(lbuf, lbuf2, lsize) == 0)
return (B_TRUE);
return (B_FALSE);
ret = B_TRUE;
abd_free(&labd2);
abd_free(&labd);
return (ret);
}
static uint64_t
@ -9624,6 +9791,9 @@ retry_lookup:
}
fini:
if (spa != NULL)
zdb_ddt_cleanup(spa);
if (os != NULL) {
close_objset(os, FTAG);
} else if (spa != NULL) {

View File

@ -844,7 +844,6 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
const char *failmode = NULL;
boolean_t checkremove = B_FALSE;
uint32_t pri = 0;
int32_t flags = 0;
/*
* If this is a checksum or I/O error, then toss it into the
@ -922,18 +921,28 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
}
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
uint64_t flags = 0;
int32_t flags32 = 0;
/*
* We ignore ereports for checksum errors generated by
* scrub/resilver I/O to avoid potentially further
* degrading the pool while it's being repaired.
*
* Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to
* be int32. To allow newer zed to work on older
* kernels, if we don't find the flags, we look for
* the older ones too.
*/
if (((nvlist_lookup_uint32(nvl,
FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
(pri == ZIO_PRIORITY_SCRUB ||
pri == ZIO_PRIORITY_REBUILD)) ||
((nvlist_lookup_int32(nvl,
((nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
(flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
(flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) ||
((nvlist_lookup_int32(nvl,
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) &&
(flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
fmd_hdl_debug(hdl, "ignoring '%s' for "
"scrub/resilver I/O", class);
return;

View File

@ -75,6 +75,7 @@
#include "zpool_util.h"
#include "zfs_comutil.h"
#include "zfeature_common.h"
#include "zfs_valstr.h"
#include "statcommon.h"
@ -130,6 +131,8 @@ static int zpool_do_version(int, char **);
static int zpool_do_wait(int, char **);
static int zpool_do_ddt_prune(int, char **);
static int zpool_do_help(int argc, char **argv);
static zpool_compat_status_t zpool_do_load_compat(
@ -170,6 +173,7 @@ typedef enum {
HELP_CLEAR,
HELP_CREATE,
HELP_CHECKPOINT,
HELP_DDT_PRUNE,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
@ -426,6 +430,8 @@ static zpool_command_t command_table[] = {
{ "sync", zpool_do_sync, HELP_SYNC },
{ NULL },
{ "wait", zpool_do_wait, HELP_WAIT },
{ NULL },
{ "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE },
};
#define NCOMMAND (ARRAY_SIZE(command_table))
@ -537,7 +543,7 @@ get_usage(zpool_help_t idx)
"\t [-o property=value] <pool> <newpool> "
"[<device> ...]\n"));
case HELP_REGUID:
return (gettext("\treguid <pool>\n"));
return (gettext("\treguid [-g guid] <pool>\n"));
case HELP_SYNC:
return (gettext("\tsync [pool] ...\n"));
case HELP_VERSION:
@ -545,6 +551,8 @@ get_usage(zpool_help_t idx)
case HELP_WAIT:
return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
"<pool> [interval]\n"));
case HELP_DDT_PRUNE:
return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
default:
__builtin_unreachable();
}
@ -2025,7 +2033,7 @@ zpool_do_create(int argc, char **argv)
char *end;
u_longlong_t ver;
ver = strtoull(propval, &end, 10);
ver = strtoull(propval, &end, 0);
if (*end == '\0' &&
ver < SPA_VERSION_FEATURES) {
enable_pool_features = B_FALSE;
@ -8232,19 +8240,32 @@ zpool_do_clear(int argc, char **argv)
}
/*
* zpool reguid <pool>
* zpool reguid [-g <guid>] <pool>
*/
int
zpool_do_reguid(int argc, char **argv)
{
uint64_t guid;
uint64_t *guidp = NULL;
int c;
char *endptr;
char *poolname;
zpool_handle_t *zhp;
int ret = 0;
/* check options */
while ((c = getopt(argc, argv, "")) != -1) {
while ((c = getopt(argc, argv, "g:")) != -1) {
switch (c) {
case 'g':
errno = 0;
guid = strtoull(optarg, &endptr, 10);
if (errno != 0 || *endptr != '\0') {
(void) fprintf(stderr,
gettext("invalid GUID: %s\n"), optarg);
usage(B_FALSE);
}
guidp = &guid;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@ -8270,7 +8291,7 @@ zpool_do_reguid(int argc, char **argv)
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
ret = zpool_reguid(zhp);
ret = zpool_set_guid(zhp, guidp);
zpool_close(zhp);
return (ret);
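A minimal caller sketch for the new libzfs entry point (not part of the
diff; the pool name and GUID value are hypothetical, and passing a NULL
GUID pointer is assumed to preserve the old randomized reguid behavior):

	zpool_handle_t *zhp = zpool_open(g_zfs, "tank");
	if (zhp != NULL) {
		uint64_t guid = 9876543210ULL;	/* new GUID to assign */
		if (zpool_set_guid(zhp, &guid) != 0)
			(void) fprintf(stderr, "reguid failed\n");
		zpool_close(zhp);
	}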
@ -11916,6 +11937,7 @@ static void
zpool_do_events_nvprint(nvlist_t *nvl, int depth)
{
nvpair_t *nvp;
static char flagstr[256];
for (nvp = nvlist_next_nvpair(nvl, NULL);
nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
@ -11975,7 +11997,21 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth)
case DATA_TYPE_UINT32:
(void) nvpair_value_uint32(nvp, &i32);
printf(gettext("0x%x"), i32);
if (strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE) == 0 ||
strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE) == 0) {
zfs_valstr_zio_stage(i32, flagstr,
sizeof (flagstr));
printf(gettext("0x%x [%s]"), i32, flagstr);
} else if (strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY) == 0) {
zfs_valstr_zio_priority(i32, flagstr,
sizeof (flagstr));
printf(gettext("0x%x [%s]"), i32, flagstr);
} else {
printf(gettext("0x%x"), i32);
}
break;
case DATA_TYPE_INT64:
@ -11996,6 +12032,12 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth)
printf(gettext("\"%s\" (0x%llx)"),
zpool_state_to_name(i64, VDEV_AUX_NONE),
(u_longlong_t)i64);
} else if (strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS) == 0) {
zfs_valstr_zio_flag(i64, flagstr,
sizeof (flagstr));
printf(gettext("0x%llx [%s]"),
(u_longlong_t)i64, flagstr);
} else {
printf(gettext("0x%llx"), (u_longlong_t)i64);
}
@ -13329,6 +13371,88 @@ found:;
return (error);
}
/*
* zpool ddtprune -d|-p <amount> <pool>
*
* -d <days> Prune entries <days> old and older
* -p <percent> Prune <percent> amount of entries
*
* Prune single reference entries from DDT to satisfy the amount specified.
*/
int
zpool_do_ddt_prune(int argc, char **argv)
{
zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
uint64_t amount = 0;
zpool_handle_t *zhp;
char *endptr;
int c;
while ((c = getopt(argc, argv, "d:p:")) != -1) {
switch (c) {
case 'd':
if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
(void) fprintf(stderr, gettext("-d cannot be "
"combined with -p option\n"));
usage(B_FALSE);
}
errno = 0;
amount = strtoull(optarg, &endptr, 0);
if (errno != 0 || *endptr != '\0' || amount == 0) {
(void) fprintf(stderr,
gettext("invalid days value\n"));
usage(B_FALSE);
}
amount *= 86400; /* convert days to seconds */
unit = ZPOOL_DDT_PRUNE_AGE;
break;
case 'p':
if (unit == ZPOOL_DDT_PRUNE_AGE) {
(void) fprintf(stderr, gettext("-p cannot be "
"combined with -d option\n"));
usage(B_FALSE);
}
errno = 0;
amount = strtoull(optarg, &endptr, 0);
if (errno != 0 || *endptr != '\0' ||
amount == 0 || amount > 100) {
(void) fprintf(stderr,
gettext("invalid percentage value\n"));
usage(B_FALSE);
}
unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (unit == ZPOOL_DDT_PRUNE_NONE) {
(void) fprintf(stderr,
gettext("missing amount option (-d|-p <value>)\n"));
usage(B_FALSE);
} else if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool argument\n"));
usage(B_FALSE);
} else if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
zhp = zpool_open(g_zfs, argv[0]);
if (zhp == NULL)
return (-1);
int error = zpool_ddt_prune(zhp, unit, amount);
zpool_close(zhp);
return (error);
}
static int
find_command_idx(const char *command, int *idx)
{

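A hedged usage sketch for the new prune interface (the pool handle setup
is as in the reguid sketch above and is omitted; values are illustrative):

	/* prune 25% of the single-reference DDT entries */
	int err = zpool_ddt_prune(zhp, ZPOOL_DDT_PRUNE_PERCENTAGE, 25);

	/* or prune entries 30 days old and older; the age unit is seconds */
	err = zpool_ddt_prune(zhp, ZPOOL_DDT_PRUNE_AGE, 30ULL * 86400);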
View File

@ -1,3 +1,5 @@
zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += zstream
CPPCHECKTARGETS += zstream

View File

@ -22,6 +22,8 @@
/*
* Copyright 2022 Axcient. All rights reserved.
* Use is subject to license terms.
*
* Copyright (c) 2024, Klara, Inc.
*/
#include <err.h>
@ -257,83 +259,73 @@ zstream_do_decompress(int argc, char *argv[])
ENTRY e = {.key = key};
p = hsearch(e, FIND);
if (p != NULL) {
zio_decompress_func_t *xfunc = NULL;
switch ((enum zio_compress)(intptr_t)p->data) {
case ZIO_COMPRESS_OFF:
xfunc = NULL;
break;
case ZIO_COMPRESS_LZJB:
xfunc = lzjb_decompress;
break;
case ZIO_COMPRESS_GZIP_1:
xfunc = gzip_decompress;
break;
case ZIO_COMPRESS_ZLE:
xfunc = zle_decompress;
break;
case ZIO_COMPRESS_LZ4:
xfunc = lz4_decompress_zfs;
break;
case ZIO_COMPRESS_ZSTD:
xfunc = zfs_zstd_decompress;
break;
default:
assert(B_FALSE);
}
/*
* Read and decompress the block
*/
char *lzbuf = safe_calloc(payload_size);
(void) sfread(lzbuf, payload_size, stdin);
if (xfunc == NULL) {
memcpy(buf, lzbuf, payload_size);
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
if (verbose)
fprintf(stderr, "Resetting "
"compression type to off "
"for ino %llu offset "
"%llu\n",
(u_longlong_t)
drrw->drr_object,
(u_longlong_t)
drrw->drr_offset);
} else if (0 != xfunc(lzbuf, buf,
payload_size, payload_size, 0)) {
/*
* The block must not be compressed,
* at least not with this compression
* type, possibly because it gets
* written multiple times in this
* stream.
*/
warnx("decompression failed for "
"ino %llu offset %llu",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
memcpy(buf, lzbuf, payload_size);
} else if (verbose) {
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
fprintf(stderr, "successfully "
"decompressed ino %llu "
"offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
} else {
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
}
free(lzbuf);
} else {
if (p == NULL) {
/*
* Read the contents of the block unaltered
*/
(void) sfread(buf, payload_size, stdin);
break;
}
/*
* Read and decompress the block
*/
enum zio_compress c =
(enum zio_compress)(intptr_t)p->data;
if (c == ZIO_COMPRESS_OFF) {
(void) sfread(buf, payload_size, stdin);
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
if (verbose)
fprintf(stderr,
"Resetting compression type to "
"off for ino %llu offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
break;
}
uint64_t lsize = drrw->drr_logical_size;
ASSERT3U(payload_size, <=, lsize);
char *lzbuf = safe_calloc(payload_size);
(void) sfread(lzbuf, payload_size, stdin);
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
abd_get_from_buf_struct(&dabd, buf, lsize);
int err = zio_decompress_data(c, &sabd, &dabd,
payload_size, lsize, NULL);
abd_free(&dabd);
abd_free(&sabd);
if (err == 0) {
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
payload_size = lsize;
if (verbose) {
fprintf(stderr,
"successfully decompressed "
"ino %llu offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
}
} else {
/*
* The block must not be compressed, at least
* not with this compression type, possibly
* because it gets written multiple times in
* this stream.
*/
warnx("decompression failed for "
"ino %llu offset %llu",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
memcpy(buf, lzbuf, payload_size);
}
free(lzbuf);
break;
}

View File

@ -22,10 +22,9 @@
/*
* Copyright 2022 Axcient. All rights reserved.
* Use is subject to license terms.
*/
/*
*
* Copyright (c) 2022 by Delphix. All rights reserved.
* Copyright (c) 2024, Klara, Inc.
*/
#include <err.h>
@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[])
dmu_replay_record_t *drr = &thedrr;
zio_cksum_t stream_cksum;
int c;
int level = -1;
int level = 0;
while ((c = getopt(argc, argv, "l:")) != -1) {
switch (c) {
@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[])
if (argc != 1)
zstream_usage();
int type = 0;
zio_compress_info_t *cinfo = NULL;
if (0 == strcmp(argv[0], "off")) {
type = ZIO_COMPRESS_OFF;
cinfo = &zio_compress_table[type];
} else if (0 == strcmp(argv[0], "inherit") ||
0 == strcmp(argv[0], "empty") ||
0 == strcmp(argv[0], "on")) {
// Fall through to invalid compression type case
} else {
for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
if (0 == strcmp(zio_compress_table[i].ci_name,
argv[0])) {
cinfo = &zio_compress_table[i];
type = i;
break;
}
}
}
if (cinfo == NULL) {
fprintf(stderr, "Invalid compression type %s.\n",
argv[0]);
exit(2);
}
if (cinfo->ci_compress == NULL) {
type = 0;
cinfo = &zio_compress_table[0];
enum zio_compress ctype;
if (strcmp(argv[0], "off") == 0) {
ctype = ZIO_COMPRESS_OFF;
} else {
for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) {
if (strcmp(argv[0],
zio_compress_table[ctype].ci_name) == 0)
break;
}
if (ctype == ZIO_COMPRESS_FUNCTIONS ||
zio_compress_table[ctype].ci_compress == NULL) {
fprintf(stderr, "Invalid compression type %s.\n",
argv[0]);
exit(2);
}
}
if (isatty(STDIN_FILENO)) {
@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[])
exit(1);
}
abd_init();
fletcher_4_init();
zio_init();
zstd_init();
@ -247,63 +235,78 @@ zstream_do_recompress(int argc, char *argv[])
(void) sfread(buf, payload_size, stdin);
break;
}
if (drrw->drr_compressiontype >=
ZIO_COMPRESS_FUNCTIONS) {
enum zio_compress dtype = drrw->drr_compressiontype;
if (dtype >= ZIO_COMPRESS_FUNCTIONS) {
fprintf(stderr, "Invalid compression type in "
"stream: %d\n", drrw->drr_compressiontype);
"stream: %d\n", dtype);
exit(3);
}
zio_compress_info_t *dinfo =
&zio_compress_table[drrw->drr_compressiontype];
if (zio_compress_table[dtype].ci_decompress == NULL)
dtype = ZIO_COMPRESS_OFF;
/* Set up buffers to minimize memcpys */
char *cbuf, *dbuf;
if (cinfo->ci_compress == NULL)
if (ctype == ZIO_COMPRESS_OFF)
dbuf = buf;
else
dbuf = safe_calloc(bufsz);
if (dinfo->ci_decompress == NULL)
if (dtype == ZIO_COMPRESS_OFF)
cbuf = dbuf;
else
cbuf = safe_calloc(payload_size);
/* Read and decompress the payload */
(void) sfread(cbuf, payload_size, stdin);
if (dinfo->ci_decompress != NULL) {
if (0 != dinfo->ci_decompress(cbuf, dbuf,
payload_size, MIN(bufsz,
drrw->drr_logical_size), dinfo->ci_level)) {
if (dtype != ZIO_COMPRESS_OFF) {
abd_t cabd, dabd;
abd_get_from_buf_struct(&cabd,
cbuf, payload_size);
abd_get_from_buf_struct(&dabd, dbuf,
MIN(bufsz, drrw->drr_logical_size));
if (zio_decompress_data(dtype, &cabd, &dabd,
payload_size, abd_get_size(&dabd),
NULL) != 0) {
warnx("decompression type %d failed "
"for ino %llu offset %llu",
type,
dtype,
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
exit(4);
}
payload_size = drrw->drr_logical_size;
abd_free(&dabd);
abd_free(&cabd);
free(cbuf);
}
/* Recompress the payload */
if (cinfo->ci_compress != NULL) {
payload_size = P2ROUNDUP(cinfo->ci_compress(
dbuf, buf, drrw->drr_logical_size,
MIN(payload_size, bufsz), (level == -1 ?
cinfo->ci_level : level)),
SPA_MINBLOCKSIZE);
if (payload_size != drrw->drr_logical_size) {
drrw->drr_compressiontype = type;
drrw->drr_compressed_size =
payload_size;
} else {
if (ctype != ZIO_COMPRESS_OFF) {
abd_t dabd, abd;
abd_get_from_buf_struct(&dabd,
dbuf, drrw->drr_logical_size);
abd_t *pabd =
abd_get_from_buf_struct(&abd, buf, bufsz);
size_t csize = zio_compress_data(ctype, &dabd,
&pabd, drrw->drr_logical_size, level);
size_t rounded =
P2ROUNDUP(csize, SPA_MINBLOCKSIZE);
if (rounded >= drrw->drr_logical_size) {
memcpy(buf, dbuf, payload_size);
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
} else {
abd_zero_off(pabd, csize,
rounded - csize);
drrw->drr_compressiontype = ctype;
drrw->drr_compressed_size =
payload_size = rounded;
}
abd_free(&abd);
abd_free(&dabd);
free(dbuf);
} else {
drrw->drr_compressiontype = type;
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
}
break;
@ -371,6 +374,7 @@ zstream_do_recompress(int argc, char *argv[])
fletcher_4_fini();
zio_fini();
zstd_fini();
abd_fini();
return (0);
}

View File

@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;
extern boolean_t ddt_prune_artificial_age;
extern boolean_t ddt_dump_prune_histogram;
static ztest_shared_opts_t *ztest_shared_opts;
@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;
ztest_func_t ztest_ddt_prune;
static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@ -6747,7 +6751,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
load = spa_load_guid(spa);
(void) pthread_rwlock_wrlock(&ztest_name_lock);
error = spa_change_guid(spa);
error = spa_change_guid(spa, NULL);
zs->zs_guid = spa_guid(spa);
(void) pthread_rwlock_unlock(&ztest_name_lock);
@ -7289,6 +7293,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}
void
ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
{
(void) zd, (void) id;
spa_t *spa = ztest_spa;
uint64_t pct = ztest_random(15) + 1;
(void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct);
}
/*
* Verify pool integrity by running zdb.
*/
@ -7470,6 +7485,13 @@ ztest_resume_thread(void *arg)
{
spa_t *spa = arg;
/*
* Synthesize aged DDT entries for ddt prune testing
*/
ddt_prune_artificial_age = B_TRUE;
if (ztest_opts.zo_verbose >= 3)
ddt_dump_prune_histogram = B_TRUE;
while (!ztest_exiting) {
if (spa_suspended(spa))
ztest_resume(spa);
@ -8588,6 +8610,12 @@ ztest_init(ztest_shared_t *zs)
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
continue;
/*
* split 50/50 between legacy and fast dedup
*/
if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0)
continue;
VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
spa_feature_table[i].fi_uname));
fnvlist_add_uint64(props, buf, 0);

View File

@ -10,7 +10,8 @@ AM_CPPFLAGS = \
-I$(top_srcdir)/include \
-I$(top_srcdir)/module/icp/include \
-I$(top_srcdir)/lib/libspl/include \
-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@
-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \
-I$(top_srcdir)/lib/libzpool/include
AM_LIBTOOLFLAGS = --silent
@ -70,4 +71,7 @@ KERNEL_CFLAGS = $(FRAME_LARGER_THAN)
LIBRARY_CFLAGS = -no-suppress
# Forcibly enable asserts/debugging for libzpool &al.
FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
# Since ZFS_DEBUG can change shared data structures, all libzpool users must
# be compiled with the same flags.
# See https://github.com/openzfs/zfs/issues/16476
LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
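A hypothetical illustration of why all libzpool consumers must agree on
ZFS_DEBUG (the structure and fields below are invented for the example):
if the flag conditionally adds members to a shared structure, objects
built without it disagree with libzpool about sizeof() and field offsets.

	#include <stdint.h>

	typedef struct shared_example {
		uint64_t se_value;
	#ifdef ZFS_DEBUG
		uint64_t se_debug_tag;	/* present only in debug builds */
	#endif
	} shared_example_t;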

View File

@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8
usr/share/man/man8/zpool-create.8
usr/share/man/man8/zpool-destroy.8
usr/share/man/man8/zpool-detach.8
usr/share/man/man8/zpool-ddtprune.8
usr/share/man/man8/zpool-events.8
usr/share/man/man8/zpool-export.8
usr/share/man/man8/zpool-get.8

View File

@ -14,6 +14,7 @@ COMMON_H = \
zfs_fletcher.h \
zfs_namecheck.h \
zfs_prop.h \
zfs_valstr.h \
\
sys/abd.h \
sys/abd_impl.h \

View File

@ -300,10 +300,14 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
_LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
_LIBZFS_H int zpool_reguid(zpool_handle_t *);
_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *);
_LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
uint64_t);
_LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
_LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);

View File

@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
uint64_t);
#ifdef __cplusplus
}
#endif

View File

@ -77,6 +77,8 @@ noinst_HEADERS = \
%D%/spl/sys/zmod.h \
%D%/spl/sys/zone.h \
\
%D%/zfs/sys/abd_os.h \
%D%/zfs/sys/abd_impl_os.h \
%D%/zfs/sys/arc_os.h \
%D%/zfs/sys/freebsd_crypto.h \
%D%/zfs/sys/freebsd_event.h \

View File

@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) critical_enter()
#define abd_exit_critical(flags) critical_exit()
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */
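A brief usage sketch, assuming callers declare a flags variable even on
FreeBSD where the argument is unused:

	unsigned long flags = 0;
	abd_enter_critical(flags);	/* critical_enter() on FreeBSD,
					   local_irq_save() on Linux */
	/* ... short non-sleeping critical section ... */
	abd_exit_critical(flags);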

View File

@ -0,0 +1,46 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
void *abd_chunks[1]; /* actually variable-length */
};
struct abd_linear {
void *abd_buf;
};
#ifdef __cplusplus
}
#endif
#endif /* _ABD_OS_H */

View File

@ -20,6 +20,8 @@ kernel_linux_HEADERS = \
kernel_sysdir = $(kerneldir)/sys
kernel_sys_HEADERS = \
%D%/zfs/sys/abd_os.h \
%D%/zfs/sys/abd_impl_os.h \
%D%/zfs/sys/policy.h \
%D%/zfs/sys/trace_acl.h \
%D%/zfs/sys/trace_arc.h \

View File

@ -20,6 +20,10 @@
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright (c) 2024, Klara Inc.
* Copyright (c) 2024, Syneto
*/
#ifndef _SPL_TASKQ_H
#define _SPL_TASKQ_H
@ -33,6 +37,9 @@
#include <sys/thread.h>
#include <sys/rwlock.h>
#include <sys/wait.h>
#include <sys/wmsum.h>
typedef struct kstat_s kstat_t;
#define TASKQ_NAMELEN 31
@ -74,6 +81,32 @@ typedef enum tq_lock_role {
typedef unsigned long taskqid_t;
typedef void (task_func_t)(void *);
typedef struct taskq_sums {
/* gauges (inc/dec counters, current value) */
wmsum_t tqs_threads_active; /* threads running a task */
wmsum_t tqs_threads_idle; /* threads waiting for work */
wmsum_t tqs_threads_total; /* total threads */
wmsum_t tqs_tasks_pending; /* tasks waiting to execute */
wmsum_t tqs_tasks_priority; /* hi-pri tasks waiting */
wmsum_t tqs_tasks_total; /* total waiting tasks */
wmsum_t tqs_tasks_delayed; /* tasks deferred to future */
wmsum_t tqs_entries_free; /* task entries on free list */
/* counters (inc only, since taskq creation) */
wmsum_t tqs_threads_created; /* threads created */
wmsum_t tqs_threads_destroyed; /* threads destroyed */
wmsum_t tqs_tasks_dispatched; /* tasks dispatched */
wmsum_t tqs_tasks_dispatched_delayed; /* tasks delayed to future */
wmsum_t tqs_tasks_executed_normal; /* normal pri tasks executed */
wmsum_t tqs_tasks_executed_priority; /* high pri tasks executed */
wmsum_t tqs_tasks_executed; /* total tasks executed */
wmsum_t tqs_tasks_delayed_requeued; /* delayed tasks requeued */
wmsum_t tqs_tasks_cancelled; /* tasks cancelled before run */
wmsum_t tqs_thread_wakeups; /* total thread wakeups */
wmsum_t tqs_thread_wakeups_nowork; /* thread woken but no tasks */
wmsum_t tqs_thread_sleeps; /* total thread sleeps */
} taskq_sums_t;
typedef struct taskq {
spinlock_t tq_lock; /* protects taskq_t */
char *tq_name; /* taskq name */
@ -105,6 +138,8 @@ typedef struct taskq {
struct hlist_node tq_hp_cb_node;
boolean_t tq_hp_support;
unsigned long lastspawnstop; /* when to purge dynamic */
taskq_sums_t tq_sums;
kstat_t *tq_ksp;
} taskq_t;
typedef struct taskq_ent {
@ -123,6 +158,13 @@ typedef struct taskq_ent {
#define TQENT_FLAG_PREALLOC 0x1
#define TQENT_FLAG_CANCEL 0x2
/* bits 2-3 are which list tqent is on */
#define TQENT_LIST_NONE 0x0
#define TQENT_LIST_PENDING 0x4
#define TQENT_LIST_PRIORITY 0x8
#define TQENT_LIST_DELAY 0xc
#define TQENT_LIST_MASK 0xc
typedef struct taskq_thread {
struct list_head tqt_thread_list;
struct list_head tqt_active_list;

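A minimal sketch of how these gauges are maintained, assuming the standard
OpenZFS wmsum API (the function itself is invented for illustration):

	static void
	example_task_begins(taskq_t *tq)
	{
		/* a previously idle thread picks up a pending task */
		wmsum_add(&tq->tq_sums.tqs_threads_idle, -1);
		wmsum_add(&tq->tq_sums.tqs_threads_active, 1);
		wmsum_add(&tq->tq_sums.tqs_tasks_pending, -1);
	}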
View File

@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) local_irq_save(flags)
#define abd_exit_critical(flags) local_irq_restore(flags)
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */

View File

@ -0,0 +1,62 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
uint_t abd_nents;
struct scatterlist *abd_sgl;
};
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
};
typedef struct abd abd_t;
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#ifdef __cplusplus
}
#endif
#endif /* _ABD_OS_H */

View File

@ -30,6 +30,7 @@
#include <sys/debug.h>
#include <sys/zfs_refcount.h>
#include <sys/uio.h>
#include <sys/abd_os.h>
#ifdef __cplusplus
extern "C" {
@ -44,8 +45,7 @@ typedef enum abd_flags {
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */
ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */
ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */
ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */
ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */
} abd_flags_t;
typedef struct abd {
@ -58,19 +58,8 @@ typedef struct abd {
#endif
kmutex_t abd_mtx;
union {
struct abd_scatter {
uint_t abd_offset;
#if defined(__FreeBSD__) && defined(_KERNEL)
void *abd_chunks[1]; /* actually variable-length */
#else
uint_t abd_nents;
struct scatterlist *abd_sgl;
#endif
} abd_scatter;
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
struct abd_scatter abd_scatter;
struct abd_linear abd_linear;
struct abd_gang {
list_t abd_gang_chain;
} abd_gang;
@ -79,9 +68,6 @@ typedef struct abd {
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif
extern int zfs_abd_scatter_enabled;
@ -107,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t);
abd_t *abd_get_zeros(size_t);
abd_t *abd_get_from_buf(void *, size_t);
abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t);
void abd_cache_reap_now(void);
/*
@ -128,10 +115,6 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
@ -225,16 +208,6 @@ abd_get_size(abd_t *abd)
void abd_init(void);
void abd_fini(void);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#endif
#ifdef __cplusplus
}
#endif

View File

@ -28,6 +28,7 @@
#define _ABD_IMPL_H
#include <sys/abd.h>
#include <sys/abd_impl_os.h>
#include <sys/wmsum.h>
#ifdef __cplusplus
@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *);
#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
#define ABD_GANG(abd) (abd->abd_u.abd_gang)
#if defined(_KERNEL)
#if defined(__FreeBSD__)
#define abd_enter_critical(flags) critical_enter()
#define abd_exit_critical(flags) critical_exit()
#else
#define abd_enter_critical(flags) local_irq_save(flags)
#define abd_exit_critical(flags) local_irq_restore(flags)
#endif
#else /* !_KERNEL */
#define abd_enter_critical(flags) ((void)0)
#define abd_exit_critical(flags) ((void)0)
#endif
#ifdef __cplusplus
}
#endif

View File

@ -39,6 +39,13 @@ extern "C" {
struct abd;
/*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG)
/*
* DDT on-disk storage object types. Each one corresponds to specific
* implementation, see ddt_ops_t. The value itself is not stored on disk.
@ -120,30 +127,80 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count.
*
* Note that an entry has an array of four ddt_phys_t, one for each number of
* DVAs (copies= property) and another for additional "ditto" copies. Most
* users of ddt_phys_t will handle indexing into or counting the phys they
* want.
* The "traditional" entry has an array of four, one for each number of DVAs
* (copies= property) and another for additional "ditto" copies. Users of the
* traditional struct will specify the variant (index) of the one they want.
*
* The newer "flat" entry has only a single form that is specified using the
* DDT_PHYS_FLAT variant.
*
* Since the value size varies, use one of the size macros when interfacing
* with the ddt zap.
*/
typedef struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddt_phys_t;
#define DDT_PHYS_MAX (4)
/*
* Named indexes into the ddt_phys_t array in each entry.
* Note - this can be used in a flexible array and allocated for
* a specific size (ddp_trad or ddp_flat). So be careful not to
* copy using "=" assignment but instead use ddt_phys_copy().
*/
typedef union {
/*
* Traditional physical payload value for DDT zap (256 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddp_trad[DDT_PHYS_MAX];
/*
* Flat physical payload value for DDT zap (72 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth; /* txg based from BP */
uint64_t ddp_class_start; /* in realtime seconds */
} ddp_flat;
} ddt_univ_phys_t;
/*
* This enum denotes which variant of a ddt_univ_phys_t to target. For
* a traditional DDT entry, it represents the indexes into the ddp_trad
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
* is being targeted.
*
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks.
*/
enum ddt_phys_type {
typedef enum {
DDT_PHYS_DITTO = 0,
DDT_PHYS_SINGLE = 1,
DDT_PHYS_DOUBLE = 2,
DDT_PHYS_TRIPLE = 3,
DDT_PHYS_TYPES
};
DDT_PHYS_FLAT = 4,
DDT_PHYS_NONE = 5
} ddt_phys_variant_t;
#define DDT_PHYS_VARIANT(ddt, p) \
(ASSERT((p) < DDT_PHYS_NONE), \
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))
#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat)
#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))
#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)
#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p)
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0))
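Taken together, these macros let consumers stay variant-agnostic. As a minimal sketch (not part of the header; it assumes only the macros above, the entry type defined just below, and ddt_phys_refcnt(), declared later in this file), totalling the refcounts of every phys slot an entry can carry:
/* Illustrative only: sum refcounts across all phys slots of an entry. */
static uint64_t
example_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
{
	uint64_t total = 0;

	/* DDT_NPHYS() is 1 for flat tables, DDT_PHYS_MAX for trad. */
	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		/* Collapses any index to DDT_PHYS_FLAT when flat. */
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
		total += ddt_phys_refcnt(dde->dde_phys, v);
	}
	return (total);
}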
/*
* A "live" entry, holding changes to an entry made this txg, and other data to
@ -153,17 +210,27 @@ enum ddt_phys_type {
/* State flags for dde_flags */
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */
/*
* Additional data to support entry update or repair. This is fixed size
* because it's relatively rarely used.
*/
typedef struct {
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;
/* original phys contents before update, for error handling */
ddt_univ_phys_t dde_orig_phys;
/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;
typedef struct {
/* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */
ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */
/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
/* copy of data after a repair read, to be rewritten */
struct abd *dde_repair_abd;
ddt_key_t dde_key; /* ddt_tree key */
avl_node_t dde_node; /* ddt_tree_node */
/* storage type and class the entry was loaded from */
ddt_type_t dde_type;
@ -173,9 +240,35 @@ typedef struct {
kcondvar_t dde_cv; /* signaled when load completes */
uint64_t dde_waiters; /* count of waiters on dde_cv */
avl_node_t dde_node; /* ddt_tree node */
ddt_entry_io_t *dde_io; /* IO support, when required */
ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
} ddt_entry_t;
/*
* A lightweight entry is for short-lived or transient uses, like iterating or
* inspecting, when you don't care where it came from.
*/
typedef struct {
ddt_key_t ddlwe_key;
ddt_type_t ddlwe_type;
ddt_class_t ddlwe_class;
ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;
/*
* In-core DDT log. A separate struct to make it easier to switch between the
* appending and flushing logs.
*/
typedef struct {
avl_tree_t ddl_tree; /* logged entries */
uint32_t ddl_flags; /* flags for this log */
uint64_t ddl_object; /* log object id */
uint64_t ddl_length; /* on-disk log size */
uint64_t ddl_first_txg; /* txg log became active */
ddt_key_t ddl_checkpoint; /* last checkpoint */
} ddt_log_t;
/*
* In-core DDT object. This covers all entries and stats for the whole pool
* for a given checksum type.
@ -184,23 +277,49 @@ typedef struct {
kmutex_t ddt_lock; /* protects changes to all fields */
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
avl_tree_t ddt_log_tree; /* logged entries */
avl_tree_t ddt_repair_tree; /* entries being repaired */
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
ddt_log_t ddt_log[2]; /* active/flushing logs */
ddt_log_t *ddt_log_active; /* pointers into ddt_log */
ddt_log_t *ddt_log_flushing; /* swapped when flush starts */
hrtime_t ddt_flush_start; /* log flush start this txg */
uint32_t ddt_flush_pass; /* log flush pass this txg */
int32_t ddt_flush_count; /* entries flushed this txg */
int32_t ddt_flush_min; /* min rem entries to flush */
int32_t ddt_log_ingest_rate; /* rolling log ingest rate */
int32_t ddt_log_flush_rate; /* rolling log flush rate */
int32_t ddt_log_flush_time_rate; /* avg time spent flushing */
uint64_t ddt_flush_force_txg; /* flush hard before this txg */
kstat_t *ddt_ksp; /* kstats context */
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
uint64_t ddt_dir_object; /* MOS dir holding ddt objects */
uint64_t ddt_version; /* DDT version */
uint64_t ddt_flags; /* FDT option flags */
/* per-type/per-class entry store objects */
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
/* object ids for whole-ddt and per-type/per-class stats */
/* object ids for stored, logged and per-type/per-class stats */
uint64_t ddt_stat_object;
ddt_object_t ddt_log_stats;
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
/* type/class stats by power-2-sized referenced blocks */
ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
/* log stats power-2-sized referenced blocks */
ddt_histogram_t ddt_log_histogram;
} ddt_t;
/*
@ -215,20 +334,36 @@ typedef struct {
uint64_t ddb_cursor;
} ddt_bookmark_t;
extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
uint64_t txg);
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);
const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);
extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);
extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
@ -243,7 +378,7 @@ extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void);
extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_prefetch_all(spa_t *spa);
@ -251,6 +386,8 @@ extern void ddt_prefetch_all(spa_t *spa);
extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
const blkptr_t *bp);
extern void ddt_alloc_entry_io(ddt_entry_t *dde);
extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
@ -260,10 +397,17 @@ extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern void ddt_walk_init(spa_t *spa, uint64_t txg);
extern boolean_t ddt_walk_ready(spa_t *spa);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
uint64_t amount);
#ifdef __cplusplus
}
#endif

View File

@ -28,11 +28,132 @@
#define _SYS_DDT_IMPL_H
#include <sys/ddt.h>
#include <sys/bitops.h>
#ifdef __cplusplus
extern "C" {
#endif
/* DDT version numbers */
#define DDT_VERSION_LEGACY (0)
#define DDT_VERSION_FDT (1)
/* Dummy version to signal that configure is still necessary */
#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION "version"
#define DDT_DIR_FLAGS "flags"
/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)
#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (ddle)->ddle_key; \
(ddlwe)->ddlwe_type = (ddle)->ddle_type; \
(ddlwe)->ddlwe_class = (ddle)->ddle_class; \
memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)
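For illustration, a hedged sketch of the intended use of the conversion macro above when freezing a live entry for inspection (the wrapper name is hypothetical; only the macro and the types are from this header):
/* Illustrative only: take a frozen, variant-sized copy of a live entry. */
static void
example_snapshot_entry(const ddt_t *ddt, const ddt_entry_t *dde,
    ddt_lightweight_entry_t *ddlwe)
{
	/* Copies key/type/class and only DDT_PHYS_SIZE(ddt) phys bytes. */
	DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe);
}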
/*
* An entry on the log tree. These are "frozen", and a record of what's in
* the on-disk log. They can't be used in place, but can be "loaded" back into
* the live tree.
*/
typedef struct {
ddt_key_t ddle_key; /* ddt_log_tree key */
avl_node_t ddle_node; /* ddt_log_tree node */
ddt_type_t ddle_type; /* storage type */
ddt_class_t ddle_class; /* storage class */
/* extra allocation for flat/trad phys */
ddt_univ_phys_t ddle_phys[];
} ddt_log_entry_t;
/* On-disk log record types. */
typedef enum {
DLR_INVALID = 0, /* end of block marker */
DLR_ENTRY = 1, /* an entry to add or replace in the log tree */
} ddt_log_record_type_t;
/* On-disk log record header. */
typedef struct {
/*
* dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
* access it.
*
* bits 0-7: record type (ddt_log_record_type_t)
* bits 8-15: length of record header+payload
* bits 16-47: reserved, all zero
* bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
* otherwise all zero
* bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
* otherwise all zero
*/
uint64_t dlr_info;
uint8_t dlr_payload[];
} ddt_log_record_t;
#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8)
#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v)
#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16)
#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v)
#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8)
#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v)
#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8)
#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v)
/* Payload for DLR_ENTRY. */
typedef struct {
ddt_key_t dlre_key;
ddt_univ_phys_t dlre_phys[];
} ddt_log_record_entry_t;
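As a sketch of how the accessors above compose (hypothetical helper; the record sizing is an assumption based on the structs just shown):
/* Illustrative only: initialize the header of a DLR_ENTRY record. */
static void
example_fill_record_header(const ddt_t *ddt, ddt_log_record_t *dlr,
    ddt_type_t type, ddt_class_t clazz)
{
	/* Header plus key plus variant-sized phys; must fit 16 bits. */
	uint64_t reclen = sizeof (ddt_log_record_t) +
	    sizeof (ddt_log_record_entry_t) + DDT_PHYS_SIZE(ddt);

	dlr->dlr_info = 0;
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, reclen);
	DLR_SET_ENTRY_TYPE(dlr, type);
	DLR_SET_ENTRY_CLASS(dlr, clazz);
}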
/* Log flags (ddl_flags, dlh_flags) */
#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */
#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */
/* On-disk log header, stored in the bonus buffer. */
typedef struct {
/*
* dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
* access it.
*
* bits 0-7: log version
* bits 8-15: log flags
* bits 16-63: reserved, all zero
*/
uint64_t dlh_info;
uint64_t dlh_length; /* log size in bytes */
uint64_t dlh_first_txg; /* txg this log went active */
ddt_key_t dlh_checkpoint; /* last checkpoint */
} ddt_log_header_t;
#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8)
#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v)
#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8)
#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v)
/* DDT log update state */
typedef struct {
dmu_tx_t *dlu_tx; /* tx the update is being applied to */
dnode_t *dlu_dn; /* log object dnode */
dmu_buf_t **dlu_dbp; /* array of block buffer pointers */
int dlu_ndbp; /* number of block buffer pointers */
uint16_t dlu_reclen; /* cached length of record */
uint64_t dlu_block; /* block for next entry */
uint64_t dlu_offset; /* offset for next entry */
} ddt_log_update_t;
/*
* Ops vector to access a specific DDT object type.
*/
@ -42,25 +163,53 @@ typedef struct {
boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
const ddt_key_t *ddk, const void *phys, size_t psize,
dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;
extern const ddt_ops_t ddt_zap_ops;
extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
/* Dedup log API */
extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
ddt_log_update_t *dlu);
extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
ddt_log_update_t *dlu);
extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk);
extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
dmu_tx_t *tx);
extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);
extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);
extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);
extern int ddt_log_load(ddt_t *ddt);
extern void ddt_log_alloc(ddt_t *ddt);
extern void ddt_log_free(ddt_t *ddt);
extern void ddt_log_init(void);
extern void ddt_log_fini(void);
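The update half of this API is transactional. A hedged sketch of the expected call sequence for appending one txg's entries (caller-side names are hypothetical; the begin/entry/commit ordering follows the declarations above):
/* Illustrative only: append a batch of lightweight entries to the log. */
static void
example_log_append(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
    size_t nentries, dmu_tx_t *tx)
{
	ddt_log_update_t dlu;

	ddt_log_begin(ddt, nentries, tx, &dlu);
	for (size_t i = 0; i < nentries; i++)
		ddt_log_entry(ddt, &ddlwe[i], &dlu);
	ddt_log_commit(ddt, &dlu);
}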
/*
* These are only exposed so that zdb can access them. Try not to use them
@ -68,22 +217,59 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
* them up.
*/
/*
* We use a histogram to convert a percentage request into a
* cutoff value where entries older than the cutoff get pruned.
*
* The histogram bins represent hours in power-of-two increments.
* 16 bins cover up to four years.
*/
#define HIST_BINS 16
typedef struct ddt_age_histo {
uint64_t dah_entries;
uint64_t dah_age_histo[HIST_BINS];
} ddt_age_histo_t;
void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram);
#if defined(_KERNEL) || !defined(ZFS_DEBUG)
#define ddt_dump_age_histogram(histo, cutoff) ((void)0)
#else
static inline void
ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff)
{
if (histogram->dah_entries == 0)
return;
(void) printf("DDT prune unique class age, %llu hour cutoff\n",
(u_longlong_t)(gethrestime_sec() - cutoff)/3600);
(void) printf("%5s %9s %4s\n", "age", "blocks", "amnt");
(void) printf("%5s %9s %4s\n", "-----", "---------", "----");
for (int i = 0; i < HIST_BINS; i++) {
(void) printf("%5d %9llu %4d%%\n", 1<<i,
(u_longlong_t)histogram->dah_age_histo[i],
(int)((histogram->dah_age_histo[i] * 100) /
histogram->dah_entries));
}
}
#endif
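To make the binning concrete, a sketch of how an age might map onto these power-of-two hour buckets (hypothetical helper; the in-tree walker may compute this differently):
/* Illustrative only: bin i holds entries roughly 2^i hours old. */
static int
example_age_to_bin(uint64_t now_sec, uint64_t birth_sec)
{
	uint64_t hours = (now_sec - birth_sec) / 3600;
	int bin = 0;

	while (hours > 1 && bin < HIST_BINS - 1) {
		hours >>= 1;
		bin++;
	}
	return (bin);	/* bin 15 ~ 2^15 hours, just under four years */
}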
/*
* Enough room to expand DMU_POOL_DDT format for all possible DDT
* checksum/class/type combinations.
*/
#define DDT_NAMELEN 32
extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
const ddt_univ_phys_t *ddp);
extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
char *name);
extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *walk, ddt_entry_t *dde);
uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,

View File

@ -375,7 +375,9 @@ typedef struct dmu_buf {
#define DMU_POOL_L2CACHE "l2cache"
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_DDT "DDT-%s-%s-%s"
#define DMU_POOL_DDT_LOG "DDT-log-%s-%u"
#define DMU_POOL_DDT_STATS "DDT-statistics"
#define DMU_POOL_DDT_DIR "DDT-%s"
#define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_ERRORSCRUB "error_scrub"

View File

@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx);
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,

View File

@ -1422,7 +1422,7 @@ typedef enum {
*/
typedef enum zfs_ioc {
/*
* Core features - 88/128 numbers reserved.
* Core features - 89/128 numbers reserved.
*/
#ifdef __FreeBSD__
ZFS_IOC_FIRST = 0,
@ -1519,6 +1519,7 @@ typedef enum zfs_ioc {
ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */
ZFS_IOC_POOL_SCRUB, /* 0x5a57 */
ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */
ZFS_IOC_DDT_PRUNE, /* 0x5a59 */
/*
* Per-platform (Optional) - 8/128 numbers reserved.
@ -1655,6 +1656,12 @@ typedef enum {
ZPOOL_PREFETCH_DDT
} zpool_prefetch_type_t;
typedef enum {
ZPOOL_DDT_PRUNE_NONE,
ZPOOL_DDT_PRUNE_AGE, /* in seconds */
ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */
} zpool_ddt_prune_unit_t;
/*
* Bookmark name values.
*/
@ -1710,6 +1717,11 @@ typedef enum {
#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
/*
* The following are names used when invoking ZFS_IOC_POOL_REGUID.
*/
#define ZPOOL_REGUID_GUID "guid"
/*
* The following are names used when invoking ZFS_IOC_POOL_TRIM.
*/
@ -1748,6 +1760,12 @@ typedef enum {
*/
#define ZPOOL_PREFETCH_TYPE "prefetch_type"
/*
* The following are names used when invoking ZFS_IOC_DDT_PRUNE.
*/
#define DDT_PRUNE_UNIT "ddt_prune_unit"
#define DDT_PRUNE_AMOUNT "ddt_prune_amount"
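These names carry a prune request through the ioctl. A hedged sketch of the input nvlist a caller such as lzc_ddt_prune() would assemble (using the standard fnvlist helpers; the exact value types are an assumption):
/* Illustrative only: build the innvl for ZFS_IOC_DDT_PRUNE. */
nvlist_t *args = fnvlist_alloc();
fnvlist_add_int32(args, DDT_PRUNE_UNIT, ZPOOL_DDT_PRUNE_PERCENTAGE);
fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, 10);	/* prune 10% */
/* ... submit via the ioctl path, then fnvlist_free(args) ... */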
/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/

View File

@ -572,7 +572,7 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp))
#define BP_ZERO(bp) \
#define BP_ZERO_DVAS(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
@ -580,6 +580,11 @@ typedef struct blkptr {
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
}
#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
@ -1087,7 +1092,7 @@ extern void spa_strfree(char *);
extern uint64_t spa_generate_guid(spa_t *spa);
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa);
extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,

View File

@ -412,6 +412,7 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
boolean_t spa_active_ddt_prune; /* ddt prune process active */
struct brt *spa_brt; /* in-core BRT */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */

View File

@ -167,6 +167,9 @@ typedef enum zio_suspend_reason {
* This was originally an enum type. However, those are 32-bit and there is no
* way to make a 64-bit enum type. Since we ran out of bits for flags, we were
* forced to upgrade it to a uint64_t.
*
* NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
* FLAG.
*/
typedef uint64_t zio_flag_t;
/*

View File

@ -22,7 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, 2024, Klara, Inc.
* Use is subject to license terms.
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/
@ -122,25 +122,15 @@ enum zio_zstd_levels {
struct zio_prop;
/* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst,
typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst,
typedef int zio_decompress_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress and get level functions. */
typedef int zio_decompresslevel_func_t(void *src, void *dst,
typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, uint8_t *level);
/* Common signature for all zio get-compression-level functions. */
typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level);
/*
* Common signature for all zio decompress functions using an ABD as input.
* This is helpful if you have both compressed ARC and scatter ABDs enabled,
* but is not a requirement for all compression algorithms.
*/
typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
size_t s_len, size_t d_len, int);
/*
* Information about each compression function.
*/
@ -163,34 +153,66 @@ extern void lz4_fini(void);
/*
* Compression routines.
*/
extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
/*
* Compress and decompress data if necessary.
*/
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst,
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst,
size_t s_len, uint8_t level);
extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len, uint8_t *level);
extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd,
size_t s_len, size_t d_len, uint8_t *level);
extern int zio_compress_to_feature(enum zio_compress comp);
#define ZFS_COMPRESS_WRAP_DECL(name) \
size_t \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (c_len); \
}
#define ZFS_DECOMPRESS_WRAP_DECL(name) \
int \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (err); \
}
#define ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name) \
int \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (err); \
}
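Each wrapper borrows linear buffers from the ABDs, calls the legacy buffer-based routine, and copies the result back out. For instance, instantiating the lzjb pair would be a one-liner per function (this assumes the module supplies zfs_lzjb_compress_buf() and zfs_lzjb_decompress_buf(), which the name##_buf expansion requires):
/* Illustrative only: generate the ABD-to-ABD lzjb entry points. */
ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lzjb_decompress)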
#ifdef __cplusplus
}
#endif

View File

@ -120,6 +120,9 @@ extern "C" {
/*
* zio pipeline stage definitions
*
* NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
* FLAG.
*/
enum zio_stage {
ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */

View File

@ -22,6 +22,10 @@
extern "C" {
#endif
/*
* NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
* VALUE.
*/
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */

View File

@ -90,14 +90,12 @@ typedef struct zfs_zstd_meta {
int zstd_init(void);
void zstd_fini(void);
size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len,
size_t d_len, int level);
size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len,
size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, uint8_t *level);
int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int n);
void zfs_zstd_cache_reap_now(void);

View File

@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURES
} spa_feature_t;

View File

@ -0,0 +1,84 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024, Klara Inc.
*/
#ifndef _ZFS_VALSTR_H
#define _ZFS_VALSTR_H extern __attribute__((visibility("default")))
#include <sys/fs/zfs.h>
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* These macros create function prototypes for pretty-printing or stringifying
* certain kinds of numeric types.
*
* _ZFS_VALSTR_DECLARE_BITFIELD(name) creates:
*
* size_t zfs_valstr_<name>_bits(uint64_t bits, char *out, size_t outlen);
* expands single char for each set bit, and space for each clear bit
*
* size_t zfs_valstr_<name>_pairs(uint64_t bits, char *out, size_t outlen);
* expands two-char mnemonic for each bit set in `bits`, separated by `|`
*
* size_t zfs_valstr_<name>(uint64_t bits, char *out, size_t outlen);
* expands full name of each bit set in `bits`, separated by spaces
*
* _ZFS_VALSTR_DECLARE_ENUM(name) creates:
*
* size_t zfs_valstr_<name>(int v, char *out, size_t outlen);
* expands full name of enum value
*
* Each _ZFS_VALSTR_DECLARE_xxx needs a corresponding _VALSTR_xxx_IMPL string
* table in zfs_valstr.c.
*/
#define _ZFS_VALSTR_DECLARE_BITFIELD(name) \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _bits( \
uint64_t bits, char *out, size_t outlen); \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _pairs( \
uint64_t bits, char *out, size_t outlen); \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name( \
uint64_t bits, char *out, size_t outlen); \
#define _ZFS_VALSTR_DECLARE_ENUM(name) \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name( \
int v, char *out, size_t outlen); \
_ZFS_VALSTR_DECLARE_BITFIELD(zio_flag)
_ZFS_VALSTR_DECLARE_BITFIELD(zio_stage)
_ZFS_VALSTR_DECLARE_ENUM(zio_priority)
#undef _ZFS_VALSTR_DECLARE_BITFIELD
#undef _ZFS_VALSTR_DECLARE_ENUM
#ifdef __cplusplus
}
#endif
#endif /* _ZFS_VALSTR_H */
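A hedged usage sketch of the bitfield printers declared above (the buffer size is an arbitrary choice, and the zio_t io_flags field is assumed):
/* Illustrative only: render a zio's flags for a debug message. */
char buf[256];
zfs_valstr_zio_flag_pairs(zio->io_flags, buf, sizeof (buf));
/* buf now holds the two-char mnemonics of each set bit, '|'-separated */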

View File

@ -47,6 +47,7 @@ nodist_libzfs_la_SOURCES = \
module/zcommon/zfs_fletcher_superscalar4.c \
module/zcommon/zfs_namecheck.c \
module/zcommon/zfs_prop.c \
module/zcommon/zfs_valstr.c \
module/zcommon/zpool_prop.c \
module/zcommon/zprop_common.c

View File

@ -183,8 +183,8 @@
<elf-symbol name='fsleep' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_dataset_depth' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_system_hostid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_timestamp' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getextmntent' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getmntany' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getprop_uint64' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -454,6 +454,13 @@
<elf-symbol name='zfs_userns' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_userspace' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valid_proplist' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_flag' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_flag_bits' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_flag_pairs' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_priority' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_stage' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_stage_bits' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_stage_pairs' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_version_kernel' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_version_nvlist' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_version_print' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -466,7 +473,9 @@
<elf-symbol name='zpool_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_clear_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_close' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_default_search_paths' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_disable_datasets' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -485,8 +494,8 @@
<elf-symbol name='zpool_export_force' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_feature_init' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_config' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_parent_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev_by_physpath' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_free_handles' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_get_all_vdev_props' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -529,7 +538,6 @@
<elf-symbol name='zpool_prefetch' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_and_label_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_align_right' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_column_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_default_numeric' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -556,6 +564,7 @@
<elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_vdev_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_skip_pool' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -616,7 +625,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -5928,6 +5937,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
<enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@ -5962,6 +5972,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
<enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
<enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
<enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
</enum-decl>
<typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='spa_feature' id='33ecb627'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='SPA_FEATURE_NONE' value='-1'/>
@ -6006,7 +6023,8 @@
<enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
<enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURES' value='41'/>
<enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURES' value='42'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@ -6137,6 +6155,12 @@
<parameter type-id='857bb57e'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='02e25ab0'/>
<parameter type-id='9c313c2d'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zfs_resolve_shortname' mangled-name='zfs_resolve_shortname' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_resolve_shortname'>
<parameter type-id='80f4b756'/>
<parameter type-id='26a90f95'/>
@ -6638,6 +6662,11 @@
<parameter type-id='9c313c2d' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_set_guid' mangled-name='zpool_set_guid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_set_guid'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='713a56f5' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_reguid' mangled-name='zpool_reguid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_reguid'>
<parameter type-id='4c81de99' name='zhp'/>
<return type-id='95e97e5e'/>
@ -6791,6 +6820,12 @@
<parameter type-id='80f4b756' name='propval'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_ddt_prune' mangled-name='zpool_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_ddt_prune'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='02e25ab0' name='unit'/>
<parameter type-id='9c313c2d' name='amount'/>
<return type-id='95e97e5e'/>
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='lib/libzfs/libzfs_sendrecv.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='8901473c' size-in-bits='576' id='f5da478b'>
@ -7830,7 +7865,7 @@
</data-member>
</class-decl>
<typedef-decl name='vdev_cbdata_t' type-id='b8006be8' id='a9679c94'/>
<class-decl name='zprop_get_cbdata' size-in-bits='832' is-struct='yes' visibility='default' id='f3d3c319'>
<class-decl name='zprop_get_cbdata' size-in-bits='960' is-struct='yes' visibility='default' id='f3d3c319'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='cb_sources' type-id='95e97e5e' visibility='default'/>
</data-member>
@ -7849,6 +7884,9 @@
<data-member access='public' layout-offset-in-bits='448'>
<var-decl name='cb_first' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='480'>
<var-decl name='cb_json' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='512'>
<var-decl name='cb_proplist' type-id='3a9b2288' visibility='default'/>
</data-member>
@ -7858,6 +7896,15 @@
<data-member access='public' layout-offset-in-bits='640'>
<var-decl name='cb_vdevs' type-id='a9679c94' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='832'>
<var-decl name='cb_jsobj' type-id='5ce45b60' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='896'>
<var-decl name='cb_json_as_int' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='928'>
<var-decl name='cb_json_pool_key_guid' type-id='c19b74c3' visibility='default'/>
</data-member>
</class-decl>
<typedef-decl name='zprop_get_cbdata_t' type-id='f3d3c319' id='f3d87113'/>
<typedef-decl name='zprop_func' type-id='2e711a2a' id='1ec3747a'/>
@ -7961,6 +8008,11 @@
<qualified-type-def type-id='d33f11cb' restrict='yes' id='5c53ba29'/>
<pointer-type-def type-id='ffa52b96' size-in-bits='64' id='76c8174b'/>
<pointer-type-def type-id='f3d87113' size-in-bits='64' id='0d2a0670'/>
<function-decl name='nvlist_print_json' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='822cd80b'/>
<parameter type-id='5ce45b60'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_label_disk' mangled-name='zpool_label_disk' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_label_disk'>
<parameter type-id='b0382bb3'/>
<parameter type-id='4c81de99'/>
@ -8068,6 +8120,11 @@
<parameter type-id='d33f11cb'/>
<return type-id='48b5725f'/>
</function-decl>
<function-decl name='putc' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='95e97e5e'/>
<parameter type-id='822cd80b'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='puts' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='95e97e5e'/>
@ -8086,6 +8143,11 @@
<parameter type-id='95e97e5e'/>
<return type-id='48b5725f'/>
</function-decl>
<function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='strnlen' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='b59d7dce'/>
@ -8285,12 +8347,12 @@
<function-decl name='zfs_version_print' mangled-name='zfs_version_print' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_print'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zfs_version_nvlist' mangled-name='zfs_version_nvlist' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_nvlist'>
<return type-id='5ce45b60'/>
</function-decl>
<function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='printf_color' mangled-name='printf_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='printf_color'>
<parameter type-id='80f4b756' name='color'/>
<parameter type-id='80f4b756' name='format'/>
@ -8795,11 +8857,6 @@
<parameter type-id='78c01427'/>
<return type-id='13956559'/>
</function-decl>
<function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_dirnamelen' mangled-name='zfs_dirnamelen' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_dirnamelen'>
<parameter type-id='80f4b756' name='path'/>
<return type-id='79a0948f'/>
@ -9131,8 +9188,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'>
<subrange length='41' type-id='7359adad' id='cb834f44'/>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
<subrange length='42' type-id='7359adad' id='cb7c937f'/>
</array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/>
@ -9209,7 +9266,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
@ -9781,6 +9838,50 @@
<return type-id='c19b74c3'/>
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfs_valstr.c' language='LANG_C99'>
<function-decl name='zfs_valstr_zio_flag' mangled-name='zfs_valstr_zio_flag' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_flag_bits' mangled-name='zfs_valstr_zio_flag_bits' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag_bits'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_flag_pairs' mangled-name='zfs_valstr_zio_flag_pairs' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag_pairs'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_stage' mangled-name='zfs_valstr_zio_stage' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_stage_bits' mangled-name='zfs_valstr_zio_stage_bits' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage_bits'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_stage_pairs' mangled-name='zfs_valstr_zio_stage_pairs' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage_pairs'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_priority' mangled-name='zfs_valstr_zio_priority' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_priority'>
<parameter type-id='95e97e5e' name='v'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zpool_prop.c' language='LANG_C99'>
<function-decl name='zpool_prop_string_to_index' mangled-name='zpool_prop_string_to_index' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_prop_string_to_index'>
<parameter type-id='5d0c23fb' name='prop'/>

View File

@ -3733,6 +3733,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
(void) zpool_standard_error(hdl, errno, errbuf);
}
break;
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"The new device cannot have a higher alignment requirement "
"than the top-level vdev."));
(void) zfs_error(hdl, EZFS_BADTARGET, errbuf);
break;
default:
(void) zpool_standard_error(hdl, errno, errbuf);
}
@ -4303,22 +4310,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
/*
* Change the GUID for a pool.
*
* Similar to zpool_reguid(), but may take a GUID.
*
* If the guid argument is NULL, then no GUID is passed in the nvlist to the
* ioctl().
*/
int
zpool_reguid(zpool_handle_t *zhp)
zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid)
{
char errbuf[ERRBUFLEN];
libzfs_handle_t *hdl = zhp->zpool_hdl;
nvlist_t *nvl = NULL;
zfs_cmd_t zc = {"\0"};
int error = -1;
if (guid != NULL) {
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
return (no_memory(hdl));
if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) {
nvlist_free(nvl);
return (no_memory(hdl));
}
zcmd_write_src_nvlist(hdl, &zc, nvl);
}
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
return (0);
error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
if (error) {
return (zpool_standard_error(hdl, errno, errbuf));
}
if (guid != NULL) {
zcmd_free_nvlists(&zc);
nvlist_free(nvl);
}
return (0);
}
return (zpool_standard_error(hdl, errno, errbuf));
/*
* Change the GUID for a pool.
*/
int
zpool_reguid(zpool_handle_t *zhp)
{
return (zpool_set_guid(zhp, NULL));
}
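For illustration, a minimal caller of the new interface; a sketch only, assuming an initialised libzfs handle and eliding error handling:
#include <libzfs.h>
int
example_set_guid(const char *pool)
{
	libzfs_handle_t *hdl = libzfs_init();
	zpool_handle_t *zhp = zpool_open(hdl, pool);
	uint64_t guid = 0x1122334455667788ULL;	/* any unused 64-bit value */
	/* Pass &guid to request this GUID, or NULL to draw a random one. */
	int err = zpool_set_guid(zhp, &guid);
	zpool_close(zhp);
	libzfs_fini(hdl);
	return (err);
}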
/*
@ -5609,3 +5649,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
return (ret);
}
/*
* Prune older entries from the DDT to reclaim space under the quota
*/
int
zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit,
uint64_t amount)
{
int error = lzc_ddt_prune(zhp->zpool_name, unit, amount);
if (error != 0) {
libzfs_handle_t *hdl = zhp->zpool_hdl;
char errbuf[ERRBUFLEN];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot prune dedup table on '%s'"), zhp->zpool_name);
if (error == EALREADY) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"a prune operation is already in progress"));
(void) zfs_error(hdl, EZFS_BUSY, errbuf);
} else {
(void) zpool_standard_error(hdl, errno, errbuf);
}
return (-1);
}
return (0);
}


@ -162,6 +162,7 @@
<elf-symbol name='lzc_channel_program_nosync' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_clone' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_bookmarks' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_snaps' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -1444,6 +1445,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
<enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@ -1484,6 +1486,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
<enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
<enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
<enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
</enum-decl>
<typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='data_type_t' naming-typedef-id='8d0687d2' id='aeeae136'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='DATA_TYPE_DONTCARE' value='-1'/>
@ -3015,6 +3024,12 @@
<parameter type-id='857bb57e' name='outnvl'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='lzc_ddt_prune' mangled-name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='lzc_ddt_prune'>
<parameter type-id='80f4b756' name='pool'/>
<parameter type-id='02e25ab0' name='unit'/>
<parameter type-id='9c313c2d' name='amount'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-type size-in-bits='64' id='c70fa2e8'>
<parameter type-id='95e97e5e'/>
<parameter type-id='eaa32e2f'/>


@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
{
return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
}
/*
* Prune the specified amount from the pool's dedup table.
*/
int
lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount)
{
int error;
nvlist_t *result = NULL;
nvlist_t *args = fnvlist_alloc();
fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit);
fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount);
error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result);
fnvlist_free(args);
fnvlist_free(result);
return (error);
}
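A hedged usage sketch for the new wrapper; the pool name is illustrative and error handling is trimmed:
#include <libzfs_core.h>
int
example_ddt_prune(void)
{
	int err = libzfs_core_init();
	if (err != 0)
		return (err);
	/* Drop the oldest 20% of unique (single-reference) DDT entries. */
	err = lzc_ddt_prune("tank", ZPOOL_DDT_PRUNE_PERCENTAGE, 20);
	libzfs_core_fini();
	return (err);
}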


@ -1,7 +1,9 @@
include $(srcdir)/%D%/include/Makefile.am
libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS)
libzpool_la_CFLAGS += $(ZLIB_CFLAGS)
libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs
libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD
@ -9,6 +11,7 @@ lib_LTLIBRARIES += libzpool.la
CPPCHECKTARGETS += libzpool.la
dist_libzpool_la_SOURCES = \
%D%/abd_os.c \
%D%/kernel.c \
%D%/taskq.c \
%D%/util.c
@ -39,7 +42,6 @@ nodist_libzpool_la_SOURCES = \
module/lua/lvm.c \
module/lua/lzio.c \
\
module/os/linux/zfs/abd_os.c \
module/os/linux/zfs/arc_os.c \
module/os/linux/zfs/trace.c \
module/os/linux/zfs/vdev_file.c \
@ -62,6 +64,7 @@ nodist_libzpool_la_SOURCES = \
module/zcommon/zfs_fletcher_superscalar4.c \
module/zcommon/zfs_namecheck.c \
module/zcommon/zfs_prop.c \
module/zcommon/zfs_valstr.c \
module/zcommon/zpool_prop.c \
module/zcommon/zprop_common.c \
\
@ -79,6 +82,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/dbuf.c \
module/zfs/dbuf_stats.c \
module/zfs/ddt.c \
module/zfs/ddt_log.c \
module/zfs/ddt_stats.c \
module/zfs/ddt_zap.c \
module/zfs/dmu.c \


@ -0,0 +1,365 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
/*
* We're simulating scatter/gather with 4K allocations, since that's more like
* what a typical kernel does.
*/
#define ABD_PAGESIZE (4096)
#define ABD_PAGESHIFT (12)
#define ABD_PAGEMASK (ABD_PAGESIZE-1)
/*
* See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is
* mostly useful to get a mix of linear and scatter ABDs for testing.
*/
#define ABD_SCATTER_MIN_SIZE (512 * 3)
abd_t *abd_zero_scatter = NULL;
static uint_t
abd_iovcnt_for_bytes(size_t size)
{
/*
* Each iovec points to a 4K page. There's no real reason to do this
* in userspace, but our whole point here is to make it feel a bit
* more like a real paged memory model.
*/
return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE);
}
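As a quick check of the rounding above, a standalone sketch with P2ROUNDUP restated for self-containment:
#include <assert.h>
#include <stddef.h>
#define ABD_PAGESIZE	4096
/* Round x up to a multiple of align (align must be a power of two). */
#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))
int
main(void)
{
	/* One byte still occupies a whole 4K iovec; 4097 bytes need two. */
	assert(P2ROUNDUP((size_t)1, (size_t)ABD_PAGESIZE) / ABD_PAGESIZE == 1);
	assert(P2ROUNDUP((size_t)4096, (size_t)ABD_PAGESIZE) / ABD_PAGESIZE == 1);
	assert(P2ROUNDUP((size_t)4097, (size_t)ABD_PAGESIZE) / ABD_PAGESIZE == 2);
	return (0);
}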
abd_t *
abd_alloc_struct_impl(size_t size)
{
/*
* Zero-sized means it will be used for a linear or gang abd, so just
* allocate the abd itself and return.
*/
if (size == 0)
return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL));
/*
* Allocating for a scatter abd, so compute how many ABD_PAGESIZE
* iovecs we will need to hold this size. Append that allocation to the
* end. Note that struct abd_scatter already includes abd_iov[1], so we
* allocate one less iovec than we need.
*
* Note we're not allocating the pages proper, just the iovec pointers.
* That's down in abd_alloc_chunks. We _could_ do it here in a single
* allocation, but it's fiddly and harder to read for no real gain.
*/
uint_t n = abd_iovcnt_for_bytes(size);
abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec),
UMEM_NOFAIL);
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_iovcnt = n;
return (abd);
}
void
abd_free_struct_impl(abd_t *abd)
{
/* For scatter, compute the extra amount we need to free */
uint_t iovcnt =
abd_is_linear(abd) || abd_is_gang(abd) ?
0 : (ABD_SCATTER(abd).abd_iovcnt - 1);
umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec));
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
/*
* We've already allocated the iovec array; ensure that the wanted size
* actually matches, otherwise the caller has made a mistake somewhere.
*/
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
ASSERT3U(n, ==, abd_iovcnt_for_bytes(size));
/*
* Allocate an ABD_PAGESIZE region for each iovec.
*/
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
for (int i = 0; i < n; i++) {
iov[i].iov_base =
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
iov[i].iov_len = ABD_PAGESIZE;
}
}
void
abd_free_chunks(abd_t *abd)
{
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
for (int i = 0; i < n; i++)
umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE);
}
boolean_t
abd_size_alloc_linear(size_t size)
{
return (size < ABD_SCATTER_MIN_SIZE);
}
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size;
if (op == ABDSTAT_INCR) {
arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
} else {
arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
}
}
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
(void) abd;
(void) op;
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
}
void
abd_verify_scatter(abd_t *abd)
{
#ifdef ZFS_DEBUG
/*
* scatter abds shall have:
* - at least one iovec
* - all iov_base point somewhere
* - all iov_len are ABD_PAGESIZE
* - offset set within the abd pages somewhere
*/
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
ASSERT3U(n, >, 0);
uint_t len = 0;
for (int i = 0; i < n; i++) {
ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL);
ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE);
len += ABD_PAGESIZE;
}
ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len);
#endif
}
void
abd_init(void)
{
/*
* Create the "zero" scatter abd. This is always the size of the
* largest possible block, but only actually has a single allocated
* page, which all iovecs in the abd point to.
*/
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
void *zero =
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
memset(zero, 0, ABD_PAGESIZE);
uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE);
struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov;
for (int i = 0; i < n; i++) {
iov[i].iov_base = zero;
iov[i].iov_len = ABD_PAGESIZE;
}
}
void
abd_fini(void)
{
umem_free_aligned(
ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE);
abd_free_struct(abd_zero_scatter);
abd_zero_scatter = NULL;
}
void
abd_free_linear_page(abd_t *abd)
{
/*
* LINEAR_PAGE is specific to the Linux kernel; we never set this
* flag, so this will never be called.
*/
(void) abd;
PANIC("unreachable");
}
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
return (abd_alloc(size, is_metadata));
}
abd_t *
abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size)
{
/*
* Create a new scatter dabd by borrowing data pages from sabd to cover
* off+size.
*
* sabd is an existing scatter abd with a set of iovecs, each covering
* an ABD_PAGESIZE (4K) allocation. Its "zero" is at abd_offset.
*
* [........][........][........][........]
* ^- sabd_offset
*
* We want to produce a new abd, referencing those allocations at the
* given offset.
*
* [........][........][........][........]
* ^- dabd_offset = sabd_offset + off
* ^- dabd_offset + size
*
* In this example, dabd needs three iovecs. The first iovec is offset
* 0, so the final dabd_offset is masked back into the first iovec.
*
* [........][........][........]
* ^- dabd_offset
*/
size_t soff = ABD_SCATTER(sabd).abd_offset + off;
size_t doff = soff & ABD_PAGEMASK;
size_t iovcnt = abd_iovcnt_for_bytes(doff + size);
/*
* If the passed-in abd has enough allocated iovecs already, reuse it.
* Otherwise, make a new one. The caller will free the original if the
* one it gets back is not the same.
*
* Note that it's ok if we reuse an abd with more iovecs than we need.
* abd_size has the usable amount of data, and the abd does not own the
* pages referenced by the iovecs. At worst, they're holding dangling
* pointers that we'll never use anyway.
*/
if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt)
dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT);
/* Set offset into first page in view */
ABD_SCATTER(dabd).abd_offset = doff;
/* Copy the wanted iovecs from the source to the dest */
memcpy(&ABD_SCATTER(dabd).abd_iov[0],
&ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT],
iovcnt * sizeof (struct iovec));
return (dabd);
}
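To make the offset arithmetic concrete, a worked example with illustrative numbers (a standalone sketch, not part of the patch):
#include <stdio.h>
#define ABD_PAGESIZE	4096
#define ABD_PAGESHIFT	12
#define ABD_PAGEMASK	(ABD_PAGESIZE - 1)
#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))
int
main(void)
{
	size_t sabd_offset = 1024, off = 10000, size = 6000;
	size_t soff = sabd_offset + off;	/* 11024 */
	size_t doff = soff & ABD_PAGEMASK;	/* 2832: offset in first page */
	size_t iovcnt = P2ROUNDUP(doff + size, (size_t)ABD_PAGESIZE) /
	    ABD_PAGESIZE;			/* ceil(8832 / 4096) = 3 */
	/* Borrowing starts at source iovec 11024 >> 12 = 2, copying 3 iovecs. */
	printf("%zu %zu %zu\n", soff >> ABD_PAGESHIFT, doff, iovcnt);
	return (0);
}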
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
}
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
if (abd_iter_at_end(aiter))
return;
aiter->iter_pos += amount;
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
}
void
abd_iter_map(struct abd_iter *aiter)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
if (abd_iter_at_end(aiter))
return;
if (abd_is_linear(aiter->iter_abd)) {
aiter->iter_mapaddr =
ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
aiter->iter_mapsize =
aiter->iter_abd->abd_size - aiter->iter_pos;
return;
}
/*
* For scatter, we index into the appropriate iovec, and return the
* smaller of the amount requested, or up to the end of the page.
*/
size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset;
ASSERT3U(poff >> ABD_PAGESHIFT, <=,
ABD_SCATTER(aiter->iter_abd).abd_iovcnt);
struct iovec *iov = &ABD_SCATTER(aiter->iter_abd).
abd_iov[poff >> ABD_PAGESHIFT];
aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK),
aiter->iter_abd->abd_size - aiter->iter_pos);
ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE);
aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK);
}
void
abd_iter_unmap(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter))
return;
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
ASSERT3U(aiter->iter_mapsize, >, 0);
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
void
abd_cache_reap_now(void)
{
}


@ -0,0 +1,4 @@
libzpooldir = $(includedir)/libzpool
libzpool_HEADERS = \
%D%/sys/abd_os.h \
%D%/sys/abd_impl_os.h


@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) ((void)0)
#define abd_exit_critical(flags) ((void)0)
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */


@ -0,0 +1,47 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
uint_t abd_iovcnt;
struct iovec abd_iov[1]; /* actually variable-length */
};
struct abd_linear {
void *abd_buf;
};
#ifdef __cplusplus
}
#endif
#endif /* _ABD_OS_H */


@ -72,6 +72,7 @@ dist_man_MANS = \
%D%/man8/zpool-create.8 \
%D%/man8/zpool-destroy.8 \
%D%/man8/zpool-detach.8 \
%D%/man8/zpool-ddtprune.8 \
%D%/man8/zpool-events.8 \
%D%/man8/zpool-export.8 \
%D%/man8/zpool-get.8 \


@ -175,17 +175,6 @@ Increasing this value will
result in a slower thread creation rate which may be preferable for some
configurations.
.
.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint
The maximum number of tasks per pending list in each taskq shown in
.Pa /proc/spl/taskq{,-all} .
Write
.Sy 0
to turn off the limit.
The proc file will walk the lists with lock held,
reading it could cause a lock-up if the list grow too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
Minimum idle threads exit interval for dynamic taskqs.
Smaller values allow idle threads exit more often and potentially be


@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory.
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.
.
.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint
Controls the number of copies stored for DeDup Table
.Pq DDT
objects.
Reducing the number of copies to 1 from the previous default of 3
can reduce the write inflation caused by deduplication.
This assumes redundancy for this data is provided by the vdev layer.
If the DDT is damaged, space may be leaked
.Pq not freed
when the DDT cannot report the correct reference count.
.
.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Limit the amount we can prefetch with one call to this amount in bytes.
This helps to limit the amount of memory that can be used by prefetching.
@ -121,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching
into L2ARC.
If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
.
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
Controls whether only MFU metadata and data are cached from ARC into L2ARC.
This may be desired to avoid wasting space on L2ARC when reading/writing large
amounts of data that are not expected to be accessed more than once.
.Pp
The default is off,
The default is 0,
meaning both MRU and MFU data and metadata are cached.
When turning off this feature, some MRU buffers will still be present
in ARC and eventually cached on L2ARC.
When turning off this feature (setting it to 0), some MRU buffers will
still be present in ARC and eventually cached on L2ARC.
.No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
some prefetched buffers will be cached to L2ARC, and those might later
transition to MRU, in which case the
.Sy l2arc_mru_asize No arcstat will not be Sy 0 .
.Pp
Setting it to 1 means to L2 cache only MFU data and metadata.
.Pp
Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
only MFU data (i.e., MRU data are not cached).
This can be the right setting to cache as much metadata as possible even
under high data turnover.
.Pp
Regardless of
.Sy l2arc_noprefetch ,
some MFU buffers might be evicted from ARC,
@ -821,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for
eviction in response to one page allocation attempt.
Note that in practice, the kernel's shrinker can ask us to evict
up to about four times this for one allocation attempt.
To reduce OOM risk, this limit is applied for kswapd reclaims only.
.Pp
The default limit of
.Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages
@ -974,6 +992,88 @@ milliseconds until the operation completes.
.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
Enable prefetching dedup-ed blocks which are going to be freed.
.
.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
Maximum number of dedup log flush passes (iterations) each transaction.
.Pp
At the start of each transaction, OpenZFS will estimate how many entries it
needs to flush out to keep up with the change rate, taking the amount and time
taken to flush on previous txgs into account (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
It will spread this amount into a number of passes.
At each pass, it will use the amount already flushed and the total time taken
by flushing and by other IO to recompute how much it should do for the remainder
of the txg.
.Pp
Reducing the max number of passes will make flushing more aggressive, flushing
out more entries on each pass.
This can be faster, but also more likely to compete with other IO.
Increasing the max number of passes will put fewer entries onto each pass,
keeping the overhead of dedup changes to a minimum but possibly causing a large
number of changes to be dumped on the last pass, which can blow out the txg
sync time beyond
.Sy zfs_txg_timeout .
.
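A rough sketch of the pass-splitting idea described above; illustrative only, since the in-kernel logic also folds in the time spent on each pass:
/*
 * Spread 'target' entries over at most 'max_passes' flushes,
 * re-estimating the per-pass share from what is still outstanding.
 */
static unsigned
flush_in_passes(unsigned target, unsigned max_passes)
{
	unsigned flushed = 0;
	for (unsigned pass = 0; pass < max_passes && flushed < target; pass++) {
		unsigned left = target - flushed;
		/* ceiling split of the remainder over the remaining passes */
		unsigned chunk = (left + (max_passes - pass) - 1) /
		    (max_passes - pass);
		flushed += chunk;	/* stand-in for flushing 'chunk' entries */
	}
	return (flushed);
}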
.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
Minimum time to spend on dedup log flush each transaction.
.Pp
At least this long will be spent flushing dedup log entries each transaction,
up to
.Sy zfs_txg_timeout .
This occurs even if doing so would delay the transaction, that is, even if
other IO completes in less time.
.
.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
Flush at least this many entries each transaction.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction to
keep up with the ingest rate (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
This sets the minimum for that estimate.
Raising it can force OpenZFS to flush more aggressively, keeping the log small
and so reducing pool import times, but can make it less able to back off if
log flushing would compete with other IO too much.
.
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
Number of transactions to use to compute the flow rate.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction by
monitoring the number of entries changed (ingest rate), number of entries
flushed (flush rate) and time spent flushing (flush time rate) and combining
these into an overall "flow rate".
It will use an exponential weighted moving average over some number of recent
transactions to compute these rates.
This sets the number of transactions to compute these averages over.
Setting it higher can help to smooth out the flow rate in the face of spiky
workloads, but will take longer for the flow rate to adjust to a sustained
change in the ingress rate.
.
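The averaging described above is a standard exponential weighted moving average; a hedged illustration, since the exact kernel weighting is not spelled out in this change:
/* EWMA over roughly n_txgs transactions; alpha = 2/(n+1) is one common choice. */
static double
ewma_update(double avg, double sample, unsigned n_txgs)
{
	double alpha = 2.0 / (n_txgs + 1);
	return (alpha * sample + (1.0 - alpha) * avg);
}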
.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint
Max transactions to accumulate before starting to flush dedup logs.
.Pp
OpenZFS maintains two dedup logs, one receiving new changes, one flushing.
If there is nothing to flush, it will accumulate changes for no more than this
many transactions before switching the logs and starting to flush entries out.
.
.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64
Max memory to use for dedup logs.
.Pp
OpenZFS will spend no more than this much memory on maintaining the in-memory
dedup log.
Flushing will begin when around half this amount is being spent on logs.
The default value of
.Sy 0
will cause it to be set by
.Sy zfs_dedup_log_mem_max_percent
instead.
.
.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint
Max memory to use for dedup logs, as a percentage of total memory.
.Pp
If
.Sy zfs_dedup_log_mem_max
is not set, it will be initialised as a percentage of the total memory in the
system.
.
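A worked example of the derived cap, with illustrative numbers:
#include <stdio.h>
int
main(void)
{
	unsigned long long total = 64ULL << 30;		/* 64 GiB of RAM */
	unsigned long long cap = total * 1 / 100;	/* 1% => ~655 MiB */
	/* Flushing begins at about half the cap. */
	printf("cap=%llu flush-start=%llu\n", cap, cap / 2);
	return (0);
}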
.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
Start to delay each transaction once there is this amount of dirty data,
expressed as a percentage of


@ -17,8 +17,9 @@
.\" Copyright (c) 2019, Klara Inc.
.\" Copyright (c) 2019, Allan Jude
.\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
.\" Copyright (c) 2023, Klara Inc.
.\"
.Dd June 23, 2022
.Dd February 14, 2024
.Dt ZPOOL-FEATURES 7
.Os
.
@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the
.Sy enabled
state when all datasets that use this feature are destroyed.
.
.feature com.klarasystems fast_dedup yes
This feature allows more advanced deduplication features to be enabled on new
dedup tables.
.Pp
This feature will be
.Sy active
when the first deduplicated block is written after a new dedup table is created
(i.e., after a new pool creation, or a new checksum used on a dataset with
.Sy dedup
enabled).
It will be returned to the
.Sy enabled
state when all deduplicated blocks using it are freed.
.
.feature com.delphix extensible_dataset no
This feature allows more flexible use of internal ZFS data structures,
and exists for other features to depend on.


@ -0,0 +1,48 @@
.\"
.\" CDDL HEADER START
.\"
.\" The contents of this file are subject to the terms of the
.\" Common Development and Distribution License (the "License").
.\" You may not use this file except in compliance with the License.
.\"
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
.\" or http://www.opensolaris.org/os/licensing.
.\" See the License for the specific language governing permissions
.\" and limitations under the License.
.\"
.\" When distributing Covered Code, include this CDDL HEADER in each
.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
.\" If applicable, add the following below this CDDL HEADER, with the
.\" fields enclosed by brackets "[]" replaced with your own identifying
.\" information: Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" CDDL HEADER END
.\"
.\"
.\" Copyright (c) 2024, Klara Inc.
.\"
.Dd June 17, 2024
.Dt ZPOOL-DDTPRUNE 8
.Os
.
.Sh NAME
.Nm zpool-ddtprune
.Nd prune the oldest entries from the single-reference dedup table(s)
.Sh SYNOPSIS
.Nm zpool
.Cm ddtprune
.Fl d Ar days | Fl p Ar percentage
.Ar pool
.Sh DESCRIPTION
This command prunes older unique entries from the dedup table.
As a complement to the dedup quota feature,
.Sy ddtprune
allows removal of older non-duplicate entries to make room for
newer duplicate entries.
.Pp
The amount to prune can be based on a target percentage of the unique entries
or based on the age (i.e., every unique entry older than N days).
.
.Sh SEE ALSO
.Xr zdb 8 ,
.Xr zpool-status 8


@ -25,8 +25,10 @@
.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2024, Klara Inc.
.\" Copyright (c) 2024, Mateusz Piotrowski
.\"
.Dd May 31, 2021
.Dd June 21, 2023
.Dt ZPOOL-REGUID 8
.Os
.
@ -36,6 +38,7 @@
.Sh SYNOPSIS
.Nm zpool
.Cm reguid
.Op Fl g Ar guid
.Ar pool
.
.Sh DESCRIPTION
@ -43,6 +46,15 @@ Generates a new unique identifier for the pool.
You must ensure that all devices in this pool are online and healthy before
performing this action.
.
.Bl -tag -width Ds
.It Fl g Ar guid
Set the pool GUID to the provided value.
The GUID can be any 64-bit value accepted by
.Xr strtoull 3
in base 10.
.Nm
will return an error if the provided GUID is already in use.
.El
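The GUID argument is parsed with strtoull(3) in base 10; an equivalent standalone sketch of that validation:
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
int
main(int argc, char **argv)
{
	if (argc != 2)
		return (1);
	char *end;
	uint64_t guid = strtoull(argv[1], &end, 10);
	if (end == argv[1] || *end != '\0') {
		fprintf(stderr, "invalid GUID: %s\n", argv[1]);
		return (1);
	}
	printf("GUID %" PRIu64 "\n", guid);
	return (0);
}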
.Sh SEE ALSO
.Xr zpool-export 8 ,
.Xr zpool-import 8


@ -592,6 +592,7 @@ don't wait.
.Xr zpool-checkpoint 8 ,
.Xr zpool-clear 8 ,
.Xr zpool-create 8 ,
.Xr zpool-ddtprune 8 ,
.Xr zpool-destroy 8 ,
.Xr zpool-detach 8 ,
.Xr zpool-events 8 ,


@ -16,8 +16,8 @@ src = @abs_srcdir@
obj = @abs_builddir@
else
zfs_include = $(srctree)/include/zfs
icp_include = $(srctree)/$(src)/icp/include
zstd_include = $(srctree)/$(src)/zstd/include
icp_include = $(src)/icp/include
zstd_include = $(src)/zstd/include
ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
endif
@ -240,6 +240,7 @@ ZCOMMON_OBJS := \
zfs_fletcher_superscalar4.o \
zfs_namecheck.o \
zfs_prop.o \
zfs_valstr.o \
zpool_prop.o \
zprop_common.o
@ -322,6 +323,7 @@ ZFS_OBJS := \
dbuf.o \
dbuf_stats.o \
ddt.o \
ddt_log.o \
ddt_stats.o \
ddt_zap.o \
dmu.o \


@ -233,6 +233,7 @@ SRCS+= cityhash.c \
zfs_fletcher_superscalar.c \
zfs_namecheck.c \
zfs_prop.c \
zfs_valstr.c \
zpool_prop.c \
zprop_common.c
@ -252,6 +253,7 @@ SRCS+= abd.c \
dbuf.c \
dbuf_stats.c \
ddt.c \
ddt_log.c \
ddt_stats.c \
ddt_zap.c \
dmu.c \
@ -426,6 +428,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast
CFLAGS.abd.c= -Wno-cast-qual
CFLAGS.ddt.c= -Wno-cast-qual
CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.ddt_zap.c= -Wno-cast-qual
CFLAGS.dmu.c= -Wno-cast-qual
CFLAGS.dmu_traverse.c= -Wno-cast-qual


@ -95,14 +95,12 @@ struct {
*/
static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;
#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
&zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif
kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;
@ -250,7 +248,7 @@ abd_alloc_zero_scatter(void)
n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;


@ -124,7 +124,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
"ZFS livelist condense");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
"ZFS VDEV mirror");


@ -868,16 +868,16 @@ spl_init(void)
if ((rc = spl_tsd_init()))
goto out2;
if ((rc = spl_taskq_init()))
if ((rc = spl_proc_init()))
goto out3;
if ((rc = spl_kmem_cache_init()))
if ((rc = spl_kstat_init()))
goto out4;
if ((rc = spl_proc_init()))
if ((rc = spl_taskq_init()))
goto out5;
if ((rc = spl_kstat_init()))
if ((rc = spl_kmem_cache_init()))
goto out6;
if ((rc = spl_zlib_init()))
@ -891,13 +891,13 @@ spl_init(void)
out8:
spl_zlib_fini();
out7:
spl_kstat_fini();
out6:
spl_proc_fini();
out5:
spl_kmem_cache_fini();
out4:
out6:
spl_taskq_fini();
out5:
spl_kstat_fini();
out4:
spl_proc_fini();
out3:
spl_tsd_fini();
out2:
@ -913,10 +913,10 @@ spl_fini(void)
{
spl_zone_fini();
spl_zlib_fini();
spl_kstat_fini();
spl_proc_fini();
spl_kmem_cache_fini();
spl_taskq_fini();
spl_kstat_fini();
spl_proc_fini();
spl_tsd_fini();
spl_kvmem_fini();
spl_random_fini();


@ -31,7 +31,6 @@
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/vmem.h>
#include <sys/taskq.h>
#include <sys/proc.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
@ -63,8 +62,6 @@ static struct ctl_table_header *spl_kstat = NULL;
static struct proc_dir_entry *proc_spl = NULL;
static struct proc_dir_entry *proc_spl_kmem = NULL;
static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
static struct proc_dir_entry *proc_spl_taskq_all = NULL;
static struct proc_dir_entry *proc_spl_taskq = NULL;
struct proc_dir_entry *proc_spl_kstat = NULL;
#ifdef DEBUG_KMEM
@ -177,195 +174,6 @@ proc_dohostid(CONST_CTL_TABLE *table, int write,
return (0);
}
static void
taskq_seq_show_headers(struct seq_file *f)
{
seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
"taskq", "act", "nthr", "spwn", "maxt", "pri",
"mina", "maxa", "cura", "flags");
}
/* indices into the lheads array below */
#define LHEAD_PEND 0
#define LHEAD_PRIO 1
#define LHEAD_DELAY 2
#define LHEAD_WAIT 3
#define LHEAD_ACTIVE 4
#define LHEAD_SIZE 5
static unsigned int spl_max_show_tasks = 512;
/* CSTYLED */
module_param(spl_max_show_tasks, uint, 0644);
MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
static int
taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
{
taskq_t *tq = p;
taskq_thread_t *tqt = NULL;
spl_wait_queue_entry_t *wq;
struct task_struct *tsk;
taskq_ent_t *tqe;
char name[100];
struct list_head *lheads[LHEAD_SIZE], *lh;
static char *list_names[LHEAD_SIZE] =
{"pend", "prio", "delay", "wait", "active" };
int i, j, have_lheads = 0;
unsigned long wflags, flags;
spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
/* get the various lists and check whether they're empty */
lheads[LHEAD_PEND] = &tq->tq_pend_list;
lheads[LHEAD_PRIO] = &tq->tq_prio_list;
lheads[LHEAD_DELAY] = &tq->tq_delay_list;
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
#else
lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
#endif
lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
for (i = 0; i < LHEAD_SIZE; ++i) {
if (list_empty(lheads[i]))
lheads[i] = NULL;
else
++have_lheads;
}
/* early return in non-"all" mode if lists are all empty */
if (!allflag && !have_lheads) {
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
spin_unlock_irqrestore(&tq->tq_lock, flags);
return (0);
}
/* unlock the waitq quickly */
if (!lheads[LHEAD_WAIT])
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
/* show the base taskq contents */
snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
seq_printf(f, "%-25s ", name);
seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
tq->tq_nalloc, tq->tq_flags);
/* show the active list */
if (lheads[LHEAD_ACTIVE]) {
j = 0;
list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
if (j == 0)
seq_printf(f, "\t%s:",
list_names[LHEAD_ACTIVE]);
else if (j == 2) {
seq_printf(f, "\n\t ");
j = 0;
}
seq_printf(f, " [%d]%pf(%ps)",
tqt->tqt_thread->pid,
tqt->tqt_task->tqent_func,
tqt->tqt_task->tqent_arg);
++j;
}
seq_printf(f, "\n");
}
for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
if (lheads[i]) {
j = 0;
list_for_each(lh, lheads[i]) {
if (spl_max_show_tasks != 0 &&
j >= spl_max_show_tasks) {
seq_printf(f, "\n\t(truncated)");
break;
}
/* show the wait waitq list */
if (i == LHEAD_WAIT) {
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
wq = list_entry(lh,
spl_wait_queue_entry_t, entry);
#else
wq = list_entry(lh,
spl_wait_queue_entry_t, task_list);
#endif
if (j == 0)
seq_printf(f, "\t%s:",
list_names[i]);
else if (j % 8 == 0)
seq_printf(f, "\n\t ");
tsk = wq->private;
seq_printf(f, " %d", tsk->pid);
/* pend, prio and delay lists */
} else {
tqe = list_entry(lh, taskq_ent_t,
tqent_list);
if (j == 0)
seq_printf(f, "\t%s:",
list_names[i]);
else if (j % 2 == 0)
seq_printf(f, "\n\t ");
seq_printf(f, " %pf(%ps)",
tqe->tqent_func,
tqe->tqent_arg);
}
++j;
}
seq_printf(f, "\n");
}
if (lheads[LHEAD_WAIT])
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
spin_unlock_irqrestore(&tq->tq_lock, flags);
return (0);
}
static int
taskq_all_seq_show(struct seq_file *f, void *p)
{
return (taskq_seq_show_impl(f, p, B_TRUE));
}
static int
taskq_seq_show(struct seq_file *f, void *p)
{
return (taskq_seq_show_impl(f, p, B_FALSE));
}
static void *
taskq_seq_start(struct seq_file *f, loff_t *pos)
{
struct list_head *p;
loff_t n = *pos;
down_read(&tq_list_sem);
if (!n)
taskq_seq_show_headers(f);
p = tq_list.next;
while (n--) {
p = p->next;
if (p == &tq_list)
return (NULL);
}
return (list_entry(p, taskq_t, tq_taskqs));
}
static void *
taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
{
taskq_t *tq = p;
++*pos;
return ((tq->tq_taskqs.next == &tq_list) ?
NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
}
static void
slab_seq_show_headers(struct seq_file *f)
{
@ -501,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = {
#endif
};
static void
taskq_seq_stop(struct seq_file *f, void *v)
{
up_read(&tq_list_sem);
}
static const struct seq_operations taskq_all_seq_ops = {
.show = taskq_all_seq_show,
.start = taskq_seq_start,
.next = taskq_seq_next,
.stop = taskq_seq_stop,
};
static const struct seq_operations taskq_seq_ops = {
.show = taskq_seq_show,
.start = taskq_seq_start,
.next = taskq_seq_next,
.stop = taskq_seq_stop,
};
static int
proc_taskq_all_open(struct inode *inode, struct file *filp)
{
return (seq_open(filp, &taskq_all_seq_ops));
}
static int
proc_taskq_open(struct inode *inode, struct file *filp)
{
return (seq_open(filp, &taskq_seq_ops));
}
static const kstat_proc_op_t proc_taskq_all_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
.proc_open = proc_taskq_all_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
#else
.open = proc_taskq_all_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
#endif
};
static const kstat_proc_op_t proc_taskq_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
.proc_open = proc_taskq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
#else
.open = proc_taskq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
#endif
};
static struct ctl_table spl_kmem_table[] = {
#ifdef DEBUG_KMEM
{
@ -677,8 +425,6 @@ static void spl_proc_cleanup(void)
remove_proc_entry("kstat", proc_spl);
remove_proc_entry("slab", proc_spl_kmem);
remove_proc_entry("kmem", proc_spl);
remove_proc_entry("taskq-all", proc_spl);
remove_proc_entry("taskq", proc_spl);
remove_proc_entry("spl", NULL);
#ifndef HAVE_REGISTER_SYSCTL_TABLE
@ -761,20 +507,6 @@ spl_proc_init(void)
goto out;
}
proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
&proc_taskq_all_operations, NULL);
if (proc_spl_taskq_all == NULL) {
rc = -EUNATCH;
goto out;
}
proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
&proc_taskq_operations, NULL);
if (proc_spl_taskq == NULL) {
rc = -EUNATCH;
goto out;
}
proc_spl_kmem = proc_mkdir("kmem", proc_spl);
if (proc_spl_kmem == NULL) {
rc = -EUNATCH;


@ -22,16 +22,98 @@
*
* Solaris Porting Layer (SPL) Task Queue Implementation.
*/
/*
* Copyright (c) 2024, Klara Inc.
* Copyright (c) 2024, Syneto
*/
#include <sys/timer.h>
#include <sys/taskq.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/trace_spl.h>
#include <sys/time.h>
#include <sys/atomic.h>
#include <sys/kstat.h>
#ifdef HAVE_CPU_HOTPLUG
#include <linux/cpuhotplug.h>
#endif
typedef struct taskq_kstats {
/* static values, for completeness */
kstat_named_t tqks_threads_max;
kstat_named_t tqks_entry_pool_min;
kstat_named_t tqks_entry_pool_max;
/* gauges (inc/dec counters, current value) */
kstat_named_t tqks_threads_active;
kstat_named_t tqks_threads_idle;
kstat_named_t tqks_threads_total;
kstat_named_t tqks_tasks_pending;
kstat_named_t tqks_tasks_priority;
kstat_named_t tqks_tasks_total;
kstat_named_t tqks_tasks_delayed;
kstat_named_t tqks_entries_free;
/* counters (inc only, since taskq creation) */
kstat_named_t tqks_threads_created;
kstat_named_t tqks_threads_destroyed;
kstat_named_t tqks_tasks_dispatched;
kstat_named_t tqks_tasks_dispatched_delayed;
kstat_named_t tqks_tasks_executed_normal;
kstat_named_t tqks_tasks_executed_priority;
kstat_named_t tqks_tasks_executed;
kstat_named_t tqks_tasks_delayed_requeued;
kstat_named_t tqks_tasks_cancelled;
kstat_named_t tqks_thread_wakeups;
kstat_named_t tqks_thread_wakeups_nowork;
kstat_named_t tqks_thread_sleeps;
} taskq_kstats_t;
static taskq_kstats_t taskq_kstats_template = {
{ "threads_max", KSTAT_DATA_UINT64 },
{ "entry_pool_min", KSTAT_DATA_UINT64 },
{ "entry_pool_max", KSTAT_DATA_UINT64 },
{ "threads_active", KSTAT_DATA_UINT64 },
{ "threads_idle", KSTAT_DATA_UINT64 },
{ "threads_total", KSTAT_DATA_UINT64 },
{ "tasks_pending", KSTAT_DATA_UINT64 },
{ "tasks_priority", KSTAT_DATA_UINT64 },
{ "tasks_total", KSTAT_DATA_UINT64 },
{ "tasks_delayed", KSTAT_DATA_UINT64 },
{ "entries_free", KSTAT_DATA_UINT64 },
{ "threads_created", KSTAT_DATA_UINT64 },
{ "threads_destroyed", KSTAT_DATA_UINT64 },
{ "tasks_dispatched", KSTAT_DATA_UINT64 },
{ "tasks_dispatched_delayed", KSTAT_DATA_UINT64 },
{ "tasks_executed_normal", KSTAT_DATA_UINT64 },
{ "tasks_executed_priority", KSTAT_DATA_UINT64 },
{ "tasks_executed", KSTAT_DATA_UINT64 },
{ "tasks_delayed_requeued", KSTAT_DATA_UINT64 },
{ "tasks_cancelled", KSTAT_DATA_UINT64 },
{ "thread_wakeups", KSTAT_DATA_UINT64 },
{ "thread_wakeups_nowork", KSTAT_DATA_UINT64 },
{ "thread_sleeps", KSTAT_DATA_UINT64 },
};
#define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1)
#define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1)
#define _TQSTAT_MOD_LIST(mod, tq, t) do { \
switch (t->tqent_flags & TQENT_LIST_MASK) { \
case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\
case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \
case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \
case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \
} \
} while (0)
#define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
#define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)
#define TQENT_SET_LIST(t, l) \
t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l;
static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
@ -134,6 +216,7 @@ retry:
ASSERT(!timer_pending(&t->tqent_timer));
list_del_init(&t->tqent_list);
TQSTAT_DEC(tq, entries_free);
return (t);
}
@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t)
{
ASSERT(tq);
ASSERT(t);
ASSERT(list_empty(&t->tqent_list));
/* Wake tasks blocked in taskq_wait_id() */
wake_up_all(&t->tqent_waitq);
list_del_init(&t->tqent_list);
if (tq->tq_nalloc <= tq->tq_minalloc) {
t->tqent_id = TASKQID_INVALID;
t->tqent_func = NULL;
@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t)
t->tqent_flags = 0;
list_add_tail(&t->tqent_list, &tq->tq_free_list);
TQSTAT_INC(tq, entries_free);
} else {
task_free(tq, t);
}
@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t)
spin_unlock_irqrestore(&tq->tq_lock, flags);
wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_delayed_requeued);
}
static void
@ -534,7 +619,11 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
t = taskq_find(tq, id);
if (t && t != ERR_PTR(-EBUSY)) {
list_del_init(&t->tqent_list);
TQSTAT_DEC_LIST(tq, t);
TQSTAT_DEC(tq, tasks_total);
t->tqent_flags |= TQENT_FLAG_CANCEL;
TQSTAT_INC(tq, tasks_cancelled);
/*
* When canceling the lowest outstanding task id we
@ -604,13 +693,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
spin_lock(&t->tqent_lock);
/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
if (flags & TQ_NOQUEUE)
if (flags & TQ_NOQUEUE) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add(&t->tqent_list, &tq->tq_prio_list);
/* Queue to the priority list instead of the pending list */
else if (flags & TQ_FRONT)
} else if (flags & TQ_FRONT) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add_tail(&t->tqent_list, &tq->tq_prio_list);
else
} else {
TQENT_SET_LIST(t, TQENT_LIST_PENDING);
list_add_tail(&t->tqent_list, &tq->tq_pend_list);
}
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = rc = tq->tq_next_id;
tq->tq_next_id++;
@ -629,6 +724,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_dispatched);
/* Spawn additional taskq threads if required. */
if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
@ -662,6 +759,9 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
/* Queue to the delay list for subsequent execution */
list_add_tail(&t->tqent_list, &tq->tq_delay_list);
TQENT_SET_LIST(t, TQENT_LIST_DELAY);
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = rc = tq->tq_next_id;
tq->tq_next_id++;
@ -676,6 +776,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
spin_unlock(&t->tqent_lock);
TQSTAT_INC(tq, tasks_dispatched_delayed);
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
@ -724,10 +826,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
t->tqent_flags |= TQENT_FLAG_PREALLOC;
/* Queue to the priority list instead of the pending list */
if (flags & TQ_FRONT)
if (flags & TQ_FRONT) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add_tail(&t->tqent_list, &tq->tq_prio_list);
else
} else {
TQENT_SET_LIST(t, TQENT_LIST_PENDING);
list_add_tail(&t->tqent_list, &tq->tq_pend_list);
}
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = tq->tq_next_id;
tq->tq_next_id++;
@ -742,6 +849,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_dispatched);
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
@ -908,6 +1017,8 @@ taskq_thread(void *args)
wake_up(&tq->tq_wait_waitq);
set_current_state(TASK_INTERRUPTIBLE);
TQSTAT_INC(tq, threads_total);
while (!kthread_should_stop()) {
if (list_empty(&tq->tq_pend_list) &&
@ -919,9 +1030,15 @@ taskq_thread(void *args)
add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
spin_unlock_irqrestore(&tq->tq_lock, flags);
TQSTAT_INC(tq, thread_sleeps);
TQSTAT_INC(tq, threads_idle);
schedule();
seq_tasks = 0;
TQSTAT_DEC(tq, threads_idle);
TQSTAT_INC(tq, thread_wakeups);
spin_lock_irqsave_nested(&tq->tq_lock, flags,
tq->tq_lock_class);
remove_wait_queue(&tq->tq_work_waitq, &wait);
@ -931,6 +1048,8 @@ taskq_thread(void *args)
if ((t = taskq_next_ent(tq)) != NULL) {
list_del_init(&t->tqent_list);
TQSTAT_DEC_LIST(tq, t);
TQSTAT_DEC(tq, tasks_total);
/*
* A TQENT_FLAG_PREALLOC task may be reused or freed
@ -955,6 +1074,7 @@ taskq_thread(void *args)
tq->tq_nactive++;
spin_unlock_irqrestore(&tq->tq_lock, flags);
TQSTAT_INC(tq, threads_active);
DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
/* Perform the requested task */
@ -962,8 +1082,17 @@ taskq_thread(void *args)
DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
TQSTAT_DEC(tq, threads_active);
if ((t->tqent_flags & TQENT_LIST_MASK) ==
TQENT_LIST_PENDING)
TQSTAT_INC(tq, tasks_executed_normal);
else
TQSTAT_INC(tq, tasks_executed_priority);
TQSTAT_INC(tq, tasks_executed);
spin_lock_irqsave_nested(&tq->tq_lock, flags,
tq->tq_lock_class);
tq->tq_nactive--;
list_del_init(&tqt->tqt_active_list);
tqt->tqt_task = NULL;
@ -989,7 +1118,8 @@ taskq_thread(void *args)
tqt->tqt_id = TASKQID_INVALID;
tqt->tqt_flags = 0;
wake_up_all(&tq->tq_wait_waitq);
}
} else
TQSTAT_INC(tq, thread_wakeups_nowork);
set_current_state(TASK_INTERRUPTIBLE);
@ -998,6 +1128,10 @@ taskq_thread(void *args)
__set_current_state(TASK_RUNNING);
tq->tq_nthreads--;
list_del_init(&tqt->tqt_thread_list);
TQSTAT_DEC(tq, threads_total);
TQSTAT_INC(tq, threads_destroyed);
error:
kmem_free(tqt, sizeof (taskq_thread_t));
spin_unlock_irqrestore(&tq->tq_lock, flags);
@ -1037,9 +1171,156 @@ taskq_thread_create(taskq_t *tq)
wake_up_process(tqt->tqt_thread);
TQSTAT_INC(tq, threads_created);
return (tqt);
}
static void
taskq_stats_init(taskq_t *tq)
{
taskq_sums_t *tqs = &tq->tq_sums;
wmsum_init(&tqs->tqs_threads_active, 0);
wmsum_init(&tqs->tqs_threads_idle, 0);
wmsum_init(&tqs->tqs_threads_total, 0);
wmsum_init(&tqs->tqs_tasks_pending, 0);
wmsum_init(&tqs->tqs_tasks_priority, 0);
wmsum_init(&tqs->tqs_tasks_total, 0);
wmsum_init(&tqs->tqs_tasks_delayed, 0);
wmsum_init(&tqs->tqs_entries_free, 0);
wmsum_init(&tqs->tqs_threads_created, 0);
wmsum_init(&tqs->tqs_threads_destroyed, 0);
wmsum_init(&tqs->tqs_tasks_dispatched, 0);
wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0);
wmsum_init(&tqs->tqs_tasks_executed_normal, 0);
wmsum_init(&tqs->tqs_tasks_executed_priority, 0);
wmsum_init(&tqs->tqs_tasks_executed, 0);
wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0);
wmsum_init(&tqs->tqs_tasks_cancelled, 0);
wmsum_init(&tqs->tqs_thread_wakeups, 0);
wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0);
wmsum_init(&tqs->tqs_thread_sleeps, 0);
}
static void
taskq_stats_fini(taskq_t *tq)
{
taskq_sums_t *tqs = &tq->tq_sums;
wmsum_fini(&tqs->tqs_threads_active);
wmsum_fini(&tqs->tqs_threads_idle);
wmsum_fini(&tqs->tqs_threads_total);
wmsum_fini(&tqs->tqs_tasks_pending);
wmsum_fini(&tqs->tqs_tasks_priority);
wmsum_fini(&tqs->tqs_tasks_total);
wmsum_fini(&tqs->tqs_tasks_delayed);
wmsum_fini(&tqs->tqs_entries_free);
wmsum_fini(&tqs->tqs_threads_created);
wmsum_fini(&tqs->tqs_threads_destroyed);
wmsum_fini(&tqs->tqs_tasks_dispatched);
wmsum_fini(&tqs->tqs_tasks_dispatched_delayed);
wmsum_fini(&tqs->tqs_tasks_executed_normal);
wmsum_fini(&tqs->tqs_tasks_executed_priority);
wmsum_fini(&tqs->tqs_tasks_executed);
wmsum_fini(&tqs->tqs_tasks_delayed_requeued);
wmsum_fini(&tqs->tqs_tasks_cancelled);
wmsum_fini(&tqs->tqs_thread_wakeups);
wmsum_fini(&tqs->tqs_thread_wakeups_nowork);
wmsum_fini(&tqs->tqs_thread_sleeps);
}
static int
taskq_kstats_update(kstat_t *ksp, int rw)
{
if (rw == KSTAT_WRITE)
return (EACCES);
taskq_t *tq = ksp->ks_private;
taskq_kstats_t *tqks = ksp->ks_data;
tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;
taskq_sums_t *tqs = &tq->tq_sums;
tqks->tqks_threads_active.value.ui64 =
wmsum_value(&tqs->tqs_threads_active);
tqks->tqks_threads_idle.value.ui64 =
wmsum_value(&tqs->tqs_threads_idle);
tqks->tqks_threads_total.value.ui64 =
wmsum_value(&tqs->tqs_threads_total);
tqks->tqks_tasks_pending.value.ui64 =
wmsum_value(&tqs->tqs_tasks_pending);
tqks->tqks_tasks_priority.value.ui64 =
wmsum_value(&tqs->tqs_tasks_priority);
tqks->tqks_tasks_total.value.ui64 =
wmsum_value(&tqs->tqs_tasks_total);
tqks->tqks_tasks_delayed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_delayed);
tqks->tqks_entries_free.value.ui64 =
wmsum_value(&tqs->tqs_entries_free);
tqks->tqks_threads_created.value.ui64 =
wmsum_value(&tqs->tqs_threads_created);
tqks->tqks_threads_destroyed.value.ui64 =
wmsum_value(&tqs->tqs_threads_destroyed);
tqks->tqks_tasks_dispatched.value.ui64 =
wmsum_value(&tqs->tqs_tasks_dispatched);
tqks->tqks_tasks_dispatched_delayed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_dispatched_delayed);
tqks->tqks_tasks_executed_normal.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed_normal);
tqks->tqks_tasks_executed_priority.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed_priority);
tqks->tqks_tasks_executed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed);
tqks->tqks_tasks_delayed_requeued.value.ui64 =
wmsum_value(&tqs->tqs_tasks_delayed_requeued);
tqks->tqks_tasks_cancelled.value.ui64 =
wmsum_value(&tqs->tqs_tasks_cancelled);
tqks->tqks_thread_wakeups.value.ui64 =
wmsum_value(&tqs->tqs_thread_wakeups);
tqks->tqks_thread_wakeups_nowork.value.ui64 =
wmsum_value(&tqs->tqs_thread_wakeups_nowork);
tqks->tqks_thread_sleeps.value.ui64 =
wmsum_value(&tqs->tqs_thread_sleeps);
return (0);
}
static void
taskq_kstats_init(taskq_t *tq)
{
char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);
kstat_t *ksp = kstat_create("taskq", 0, name, "misc",
KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (ksp == NULL)
return;
ksp->ks_private = tq;
ksp->ks_update = taskq_kstats_update;
ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
kstat_install(ksp);
tq->tq_ksp = ksp;
}
static void
taskq_kstats_fini(taskq_t *tq)
{
if (tq->tq_ksp == NULL)
return;
kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t));
kstat_delete(tq->tq_ksp);
tq->tq_ksp = NULL;
}
taskq_t *
taskq_create(const char *name, int threads_arg, pri_t pri,
int minalloc, int maxalloc, uint_t flags)
@ -1104,6 +1385,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
init_waitqueue_head(&tq->tq_wait_waitq);
tq->tq_lock_class = TQ_LOCK_GENERAL;
INIT_LIST_HEAD(&tq->tq_taskqs);
taskq_stats_init(tq);
if (flags & TASKQ_PREPOPULATE) {
spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
@ -1137,14 +1419,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
if (rc) {
taskq_destroy(tq);
tq = NULL;
} else {
down_write(&tq_list_sem);
tq->tq_instance = taskq_find_by_name(name) + 1;
list_add_tail(&tq->tq_taskqs, &tq_list);
up_write(&tq_list_sem);
return (NULL);
}
down_write(&tq_list_sem);
tq->tq_instance = taskq_find_by_name(name) + 1;
list_add_tail(&tq->tq_taskqs, &tq_list);
up_write(&tq_list_sem);
/* Install kstats late, because the name includes tq_instance */
taskq_kstats_init(tq);
return (tq);
}
EXPORT_SYMBOL(taskq_create);
@ -1177,6 +1462,8 @@ taskq_destroy(taskq_t *tq)
taskq_wait(tq);
taskq_kstats_fini(tq);
/* remove taskq from global list used by the kstats */
down_write(&tq_list_sem);
list_del(&tq->tq_taskqs);
@ -1230,6 +1517,7 @@ taskq_destroy(taskq_t *tq)
spin_unlock_irqrestore(&tq->tq_lock, flags);
taskq_stats_fini(tq);
kmem_strfree(tq->tq_name);
kmem_free(tq, sizeof (taskq_t));
}
@ -1271,6 +1559,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri,
}
EXPORT_SYMBOL(taskq_create_synced);
static kstat_t *taskq_summary_ksp = NULL;
static int
spl_taskq_kstat_headers(char *buf, size_t size)
{
size_t n = snprintf(buf, size,
"%-20s | %-17s | %-23s\n"
"%-20s | %-17s | %-23s\n"
"%-20s | %-17s | %-23s\n",
"", "threads", "tasks on queue",
"taskq name", "tot [act idl] max", " pend [ norm high] dly",
"--------------------", "-----------------",
"-----------------------");
return (n >= size ? ENOMEM : 0);
}
static int
spl_taskq_kstat_data(char *buf, size_t size, void *data)
{
struct list_head *tql = NULL;
taskq_t *tq;
char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
char threads[25];
char tasks[30];
size_t n;
int err = 0;
down_read(&tq_list_sem);
list_for_each_prev(tql, &tq_list) {
tq = list_entry(tql, taskq_t, tq_taskqs);
mutex_enter(tq->tq_ksp->ks_lock);
taskq_kstats_update(tq->tq_ksp, KSTAT_READ);
taskq_kstats_t *tqks = tq->tq_ksp->ks_data;
snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
tq->tq_instance);
snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
tqks->tqks_threads_total.value.ui64,
tqks->tqks_threads_active.value.ui64,
tqks->tqks_threads_idle.value.ui64,
tqks->tqks_threads_max.value.ui64);
snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
tqks->tqks_tasks_total.value.ui64,
tqks->tqks_tasks_pending.value.ui64,
tqks->tqks_tasks_priority.value.ui64,
tqks->tqks_tasks_delayed.value.ui64);
mutex_exit(tq->tq_ksp->ks_lock);
n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
name, threads, tasks);
if (n >= size) {
err = ENOMEM;
break;
}
buf = &buf[n];
size -= n;
}
up_read(&tq_list_sem);
return (err);
}
static void
spl_taskq_kstat_init(void)
{
kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (ksp == NULL)
return;
ksp->ks_data = (void *)(uintptr_t)1;
ksp->ks_ndata = 1;
kstat_set_raw_ops(ksp, spl_taskq_kstat_headers,
spl_taskq_kstat_data, NULL);
kstat_install(ksp);
taskq_summary_ksp = ksp;
}
static void
spl_taskq_kstat_fini(void)
{
if (taskq_summary_ksp == NULL)
return;
kstat_delete(taskq_summary_ksp);
taskq_summary_ksp = NULL;
}
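/*
 * With the hooks above installed, each queue exports a
 * taskq.<name>.<instance> kstat and the summary table is exported as
 * the raw "taskq/summary" kstat; on Linux that is typically
 * /proc/spl/kstat/taskq/summary (path assumed from the usual SPL
 * kstat layout).
 */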
static unsigned int spl_taskq_kick = 0;
/*
@ -1451,12 +1833,16 @@ spl_taskq_init(void)
*/
dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
spl_taskq_kstat_init();
return (0);
}
void
spl_taskq_fini(void)
{
spl_taskq_kstat_fini();
taskq_destroy(dynamic_taskq);
dynamic_taskq = NULL;

View File

@ -186,6 +186,13 @@ issig(void)
schedule();
#endif
/*
* Dequeued SIGSTOP/SIGTSTP.
* Check if the process has another signal pending.
*/
if (signal_pending(current))
return (1);
return (0);
}

View File

@ -58,22 +58,16 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif
#ifdef _KERNEL
#if defined(MAX_ORDER)
#define ABD_MAX_ORDER (MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define ABD_MAX_ORDER (MAX_PAGE_ORDER)
#endif
#else
#define ABD_MAX_ORDER (1)
#endif
typedef struct abd_stats {
kstat_named_t abdstat_struct_size;
@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL;
struct page;
/*
* _KERNEL - Will point to ZERO_PAGE if it is available or it will be
* an allocated zero'd PAGESIZE buffer.
* Userspace - Will be an allocated zero'ed PAGESIZE buffer.
*
* abd_zero_page is assigned to each of the pages of abd_zero_scatter.
* abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
* point to ZERO_PAGE if it is available or it will be an allocated zero'd
* PAGESIZE buffer.
*/
static struct page *abd_zero_page = NULL;
@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd)
ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}
#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
/*
@ -509,7 +500,7 @@ abd_alloc_zero_scatter(void)
ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void)
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}
#else /* _KERNEL */
#ifndef PAGE_SHIFT
#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif
#define zfs_kmap_local(chunk) ((void *)chunk)
#define zfs_kunmap_local(addr) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
#define local_irq_restore(flags) do { (void)(flags); } while (0)
#define nth_page(pg, i) \
((struct page *)((void *)(pg) + (i) * PAGESIZE))
struct scatterlist {
struct page *page;
int length;
int end;
};
static void
sg_init_table(struct scatterlist *sg, int nr)
{
memset(sg, 0, nr * sizeof (struct scatterlist));
sg[nr - 1].end = 1;
}
/*
* This must be called if any of the sg_table allocation functions
* are called.
*/
static void
abd_free_sg_table(abd_t *abd)
{
int nents = ABD_SCATTER(abd).abd_nents;
vmem_free(ABD_SCATTER(abd).abd_sgl,
nents * sizeof (struct scatterlist));
}
#define for_each_sg(sgl, sg, nr, i) \
for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
unsigned int offset)
{
/* currently we don't use offset */
ASSERT(offset == 0);
sg->page = page;
sg->length = len;
}
static inline struct page *
sg_page(struct scatterlist *sg)
{
return (sg->page);
}
static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
if (sg->end)
return (NULL);
return (sg + 1);
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
unsigned nr_pages = abd_chunkcnt_for_bytes(size);
struct scatterlist *sg;
int i;
ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
abd_for_each_sg(abd, sg, nr_pages, i) {
struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
sg_set_page(sg, p, PAGESIZE, 0);
}
ABD_SCATTER(abd).abd_nents = nr_pages;
}
void
abd_free_chunks(abd_t *abd)
{
int i, n = ABD_SCATTER(abd).abd_nents;
struct scatterlist *sg;
abd_for_each_sg(abd, sg, n, i) {
struct page *p = nth_page(sg_page(sg), 0);
umem_free_aligned(p, PAGESIZE);
}
abd_free_sg_table(abd);
}
static void
abd_alloc_zero_scatter(void)
{
unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
struct scatterlist *sg;
int i;
abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
memset(abd_zero_page, 0, PAGESIZE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
}
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}
#endif /* _KERNEL */
boolean_t
abd_size_alloc_linear(size_t size)
{
@ -712,14 +575,10 @@ abd_free_zero_scatter(void)
abd_free_struct(abd_zero_scatter);
abd_zero_scatter = NULL;
ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
abd_unmark_zfs_page(abd_zero_page);
__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}
static int
@ -1014,8 +873,6 @@ abd_cache_reap_now(void)
{
}
#if defined(_KERNEL)
/*
* This is abd_iter_page(), the function underneath abd_iterate_page_func().
* It yields the next page struct and data offset and size within it, without
@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif /* _KERNEL */

View File

@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
* See also the comment above zfs_arc_shrinker_limit.
*/
int64_t can_free = btop(arc_evictable_memory());
int64_t limit = zfs_arc_shrinker_limit != 0 ?
zfs_arc_shrinker_limit : INT64_MAX;
return (MIN(can_free, limit));
if (current_is_kswapd() && zfs_arc_shrinker_limit)
can_free = MIN(can_free, zfs_arc_shrinker_limit);
return (can_free);
}
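/*
 * Net effect of the change above: direct reclaim now sees the full
 * evictable count; only kswapd is clamped by zfs_arc_shrinker_limit
 * (when it is nonzero).
 */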
static unsigned long

View File

@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
zfsvfs_t *snap_zfsvfs;
zfs_snapentry_t *se;
char *full_name, *full_path;
char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
NULL };
char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
NULL, NULL, NULL };
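/* -i (--internal-only) keeps mount(8) from invoking a mount.zfs helper */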
char *envp[] = { NULL };
int error;
struct path spath;
@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
* value from call_usermodehelper() will be (exitcode << 8 + signal).
*/
dprintf("mount; name=%s path=%s\n", full_name, full_path);
argv[5] = full_name;
argv[6] = full_path;
argv[6] = full_name;
argv[7] = full_path;
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
if (error) {
if (!(error & MOUNT_BUSY << 8)) {

View File

@ -292,6 +292,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
struct super_block *s;
objset_t *os;
boolean_t issnap = B_FALSE;
int err;
err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
@ -323,6 +324,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
if (zpl_enter(zfsvfs, FTAG) == 0) {
if (os != zfsvfs->z_os)
err = -SET_ERROR(EBUSY);
issnap = zfsvfs->z_issnap;
zpl_exit(zfsvfs, FTAG);
} else {
err = -SET_ERROR(EBUSY);
@ -346,7 +348,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
return (ERR_PTR(err));
}
s->s_flags |= SB_ACTIVE;
} else if ((flags ^ s->s_flags) & SB_RDONLY) {
} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
/*
* Skip the ro check for snapshots, since a snapshot is always
* read-only regardless of whether mount passed the ro flag.
*/
deactivate_locked_super(s);
return (ERR_PTR(-EBUSY));
}

View File

@ -1213,6 +1213,7 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits,
qlimits->io_opt = limits->zql_io_opt;
qlimits->physical_block_size = limits->zql_physical_block_size;
qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
qlimits->features =
@ -1251,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue;
zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
struct queue_limits qlimits;
zvol_queue_limits_convert(limits, &qlimits);
@ -1261,13 +1261,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
return (1);
}
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
blk_queue_set_write_cache(zso->zvo_queue, B_TRUE);
#endif
zso->zvo_disk = disk;
zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue;
#else
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
if (zso->zvo_queue == NULL)
@ -1361,7 +1358,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* request queue and generic disk structures for the block device.
*/
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
{
zvol_state_t *zv;
struct zvol_state_os *zso;
@ -1381,6 +1378,7 @@ zvol_alloc(dev_t dev, const char *name)
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
zv->zv_volmode = volmode;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next);
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
@ -1670,7 +1668,8 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
zv = zvol_alloc(MKDEV(zvol_major, minor), name);
zv = zvol_alloc(MKDEV(zvol_major, minor), name,
doi->doi_data_block_size);
if (zv == NULL) {
error = SET_ERROR(EAGAIN);
goto out_dmu_objset_disown;
@ -1680,7 +1679,6 @@ zvol_os_create_minor(const char *name)
if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY;
zv->zv_volblocksize = doi->doi_data_block_size;
zv->zv_volsize = volsize;
zv->zv_objset = os;

View File

@ -754,6 +754,12 @@ zpool_feature_init(void)
"Support for raidz expansion",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
zfeature_register(SPA_FEATURE_FAST_DEDUP,
"com.klarasystems:fast_dedup", "fast_dedup",
"Support for advanced deduplication",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
sfeatures);
zfs_mod_list_supported_free(sfeatures);
}

View File

@ -0,0 +1,277 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024, Klara Inc.
*/
#include <sys/fs/zfs.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/string.h>
#include <sys/debug.h>
#include "zfs_valstr.h"
/*
* Each bit in a bitfield has three possible string representations:
* - single char
* - two-char pair
* - full name
*/
typedef struct {
const char vb_bit;
const char vb_pair[2];
const char *vb_name;
} valstr_bit_t;
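/*
 * Usage sketch (illustrative, not part of this change): bit values
 * follow table order, so in the zio_flag table below SELF_HEAL is
 * bit 2 and RESILVER is bit 3:
 *
 *	char buf[64];
 *	zfs_valstr_zio_flag_pairs((1ULL << 2) | (1ULL << 3),
 *	    buf, sizeof (buf));		-> buf = "SH|RS"
 *	zfs_valstr_zio_flag((1ULL << 2) | (1ULL << 3),
 *	    buf, sizeof (buf));		-> buf = "SELF_HEAL RESILVER"
 */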
/*
* Emits a character for each bit in `bits`, up to the number of elements
* in the table. Set bits get the character in vb_bit, clear bits get a
* space. This results in all strings having the same width, for easier
* visual comparison.
*/
static size_t
valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems,
uint64_t bits, char *out, size_t outlen)
{
ASSERT(out);
size_t n = 0;
for (int b = 0; b < nelems; b++) {
if (n == outlen)
break;
uint64_t mask = (1ULL << b);
out[n++] = (bits & mask) ? table[b].vb_bit : ' ';
}
if (n < outlen)
out[n++] = '\0';
return (n);
}
/*
* Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and
* separated by a `|` character. This gives a concise representation of the
* whole value.
*/
static size_t
valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems,
uint64_t bits, char *out, size_t outlen)
{
ASSERT(out);
size_t n = 0;
for (int b = 0; b < nelems; b++) {
ASSERT3U(n, <=, outlen);
if (n == outlen)
break;
uint64_t mask = (1ULL << b);
if (bits & mask) {
size_t len = (n > 0) ? 3 : 2;
if (n > outlen-len)
break;
if (n > 0)
out[n++] = '|';
out[n++] = table[b].vb_pair[0];
out[n++] = table[b].vb_pair[1];
}
}
if (n < outlen)
out[n++] = '\0';
return (n);
}
/*
* Emits the full name for each bit set in `bits`, taken from vb_name, and
* separated by a space. This unambiguously shows the entire set of bits, but
* can get very long.
*/
static size_t
valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems,
uint64_t bits, char *out, size_t outlen)
{
ASSERT(out);
size_t n = 0;
for (int b = 0; b < nelems; b++) {
ASSERT3U(n, <=, outlen);
if (n == outlen)
break;
uint64_t mask = (1ULL << b);
if (bits & mask) {
size_t len = strlen(table[b].vb_name);
if (n > 0)
len++;
if (n > outlen-len)
break;
if (n > 0) {
out[n++] = ' ';
len--;
}
memcpy(&out[n], table[b].vb_name, len);
n += len;
}
}
if (n < outlen)
out[n++] = '\0';
return (n);
}
/*
* Emits the name of the given enum value in the table.
*/
static size_t
valstr_enum_str(const char **table, const size_t nelems,
int v, char *out, size_t outlen)
{
ASSERT(out);
ASSERT3U(v, <, nelems);
if (v >= nelems)
return (0);
return (MIN(strlcpy(out, table[v], outlen), outlen));
}
/*
* These macros create the string tables for the given name, and implement
* the public functions described in zfs_valstr.h.
*/
#define _VALSTR_BITFIELD_IMPL(name, ...) \
static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\
size_t \
zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \
{ \
return (valstr_bitfield_bits(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
\
size_t \
zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \
{ \
return (valstr_bitfield_pairs(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
\
size_t \
zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \
{ \
return (valstr_bitfield_str(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
#define _VALSTR_ENUM_IMPL(name, ...) \
static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \
size_t \
zfs_valstr_ ## name(int v, char *out, size_t outlen) \
{ \
return (valstr_enum_str(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \
} \
/* String tables */
/* ZIO flags: zio_flag_t, typically zio->io_flags */
/* BEGIN CSTYLED */
_VALSTR_BITFIELD_IMPL(zio_flag,
{ '.', "DA", "DONT_AGGREGATE" },
{ '.', "RP", "IO_REPAIR" },
{ '.', "SH", "SELF_HEAL" },
{ '.', "RS", "RESILVER" },
{ '.', "SC", "SCRUB" },
{ '.', "ST", "SCAN_THREAD" },
{ '.', "PH", "PHYSICAL" },
{ '.', "CF", "CANFAIL" },
{ '.', "SP", "SPECULATIVE" },
{ '.', "CW", "CONFIG_WRITER" },
{ '.', "DR", "DONT_RETRY" },
{ '?', "??", "[UNUSED 11]" },
{ '.', "ND", "NODATA" },
{ '.', "ID", "INDUCE_DAMAGE" },
{ '.', "AL", "IO_ALLOCATING" },
{ '.', "RE", "IO_RETRY" },
{ '.', "PR", "PROBE" },
{ '.', "TH", "TRYHARD" },
{ '.', "OP", "OPTIONAL" },
{ '.', "DQ", "DONT_QUEUE" },
{ '.', "DP", "DONT_PROPAGATE" },
{ '.', "BY", "IO_BYPASS" },
{ '.', "RW", "IO_REWRITE" },
{ '.', "CM", "RAW_COMPRESS" },
{ '.', "EN", "RAW_ENCRYPT" },
{ '.', "GG", "GANG_CHILD" },
{ '.', "DD", "DDT_CHILD" },
{ '.', "GF", "GODFATHER" },
{ '.', "NP", "NOPWRITE" },
{ '.', "EX", "REEXECUTED" },
{ '.', "DG", "DELEGATED" },
)
/* END CSTYLED */
/*
* ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or
* zio->io_pipeline.
*/
/* BEGIN CSTYLED */
_VALSTR_BITFIELD_IMPL(zio_stage,
{ 'O', "O ", "OPEN" },
{ 'I', "RI", "READ_BP_INIT" },
{ 'I', "WI", "WRITE_BP_INIT" },
{ 'I', "FI", "FREE_BP_INIT" },
{ 'A', "IA", "ISSUE_ASYNC" },
{ 'W', "WC", "WRITE_COMPRESS" },
{ 'E', "EN", "ENCRYPT" },
{ 'C', "CG", "CHECKSUM_GENERATE" },
{ 'N', "NW", "NOP_WRITE" },
{ 'B', "BF", "BRT_FREE" },
{ 'd', "dS", "DDT_READ_START" },
{ 'd', "dD", "DDT_READ_DONE" },
{ 'd', "dW", "DDT_WRITE" },
{ 'd', "dF", "DDT_FREE" },
{ 'G', "GA", "GANG_ASSEMBLE" },
{ 'G', "GI", "GANG_ISSUE" },
{ 'D', "DT", "DVA_THROTTLE" },
{ 'D', "DA", "DVA_ALLOCATE" },
{ 'D', "DF", "DVA_FREE" },
{ 'D', "DC", "DVA_CLAIM" },
{ 'R', "R ", "READY" },
{ 'V', "VS", "VDEV_IO_START" },
{ 'V', "VD", "VDEV_IO_DONE" },
{ 'V', "VA", "VDEV_IO_ASSESS" },
{ 'C', "CV", "CHECKSUM_VERIFY" },
{ 'X', "X ", "DONE" },
)
/* END CSTYLED */
/* ZIO priority: zio_priority_t, typically zio->io_priority */
/* BEGIN CSTYLED */
_VALSTR_ENUM_IMPL(zio_priority,
"SYNC_READ",
"SYNC_WRITE",
"ASYNC_READ",
"ASYNC_WRITE",
"SCRUB",
"REMOVAL",
"INITIALIZING",
"TRIM",
"REBUILD",
"[NUM_QUEUEABLE]",
"NOW",
)
/* END CSTYLED */
#undef _VALSTR_BITFIELD_IMPL
#undef _VALSTR_ENUM_IMPL

View File

@ -113,7 +113,7 @@ abd_verify(abd_t *abd)
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
@ -603,13 +603,11 @@ abd_get_zeros(size_t size)
}
/*
* Allocate a linear ABD structure for buf.
* Create a linear ABD for an existing buf.
*/
abd_t *
abd_get_from_buf(void *buf, size_t size)
static abd_t *
abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{
abd_t *abd = abd_alloc_struct(0);
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
/*
@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size)
return (abd);
}
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
abd_t *abd = abd_alloc_struct(0);
return (abd_get_from_buf_impl(abd, buf, size));
}
abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
abd_init_struct(abd);
return (abd_get_from_buf_impl(abd, buf, size));
}
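/*
 * A minimal sketch of the new stack-allocated form, as used by the
 * decompress call sites elsewhere in this change: no heap allocation
 * for the wrapper, and abd_free() tears down only the wrapping, not
 * buf itself:
 *
 *	abd_t dabd;
 *	abd_get_from_buf_struct(&dabd, buf, len);
 *	... pass &dabd to e.g. zio_decompress_data() ...
 *	abd_free(&dabd);
 */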
/*
* Get the raw buffer associated with a linear ABD.
*/

View File

@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
uint64_t csize;
uint64_t lsize = HDR_GET_LSIZE(hdr);
uint64_t psize = HDR_GET_PSIZE(hdr);
void *tmpbuf = NULL;
abd_t *abd = hdr->b_l1hdr.b_pabd;
boolean_t free_abd = B_FALSE;
ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_AUTHENTICATED(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3P(abd, !=, NULL);
/*
* The MAC is calculated on the compressed data that is stored on disk.
@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!HDR_COMPRESSION_ENABLED(hdr)) {
abd = NULL;
csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel);
ASSERT3P(tmpbuf, !=, NULL);
hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel);
ASSERT3P(abd, !=, NULL);
ASSERT3U(csize, <=, psize);
abd = abd_get_from_buf(tmpbuf, lsize);
abd_take_ownership_of_buf(abd, B_TRUE);
abd_zero_off(abd, csize, psize - csize);
free_abd = B_TRUE;
}
/*
@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
if (ret == 0)
arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
else if (ret != ENOENT)
goto error;
else if (ret == ENOENT)
ret = 0;
if (tmpbuf != NULL)
abd_free(abd);
return (0);
error:
if (tmpbuf != NULL)
if (free_abd)
abd_free(abd);
return (ret);
@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
{
int ret;
abd_t *cabd = NULL;
void *tmp = NULL;
boolean_t no_crypt = B_FALSE;
boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
* linear buffer and wrapping it in an abd later.
*/
cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
HDR_GET_LSIZE(hdr), &hdr->b_complevel);
if (ret != 0) {
abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
goto error;
}
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_pabd = cabd;
@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
/* Skip byteswapping and checksumming (already done) */
return (0);
} else {
abd_t dabd;
abd_get_from_buf_struct(&dabd, buf->b_data,
HDR_GET_LSIZE(hdr));
error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, buf->b_data,
hdr->b_l1hdr.b_pabd, &dabd,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
&hdr->b_complevel);
abd_free(&dabd);
/*
* Absent hardware errors or software bugs, this should
@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
!HDR_COMPRESSION_ENABLED(hdr)) {
abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
ARC_HDR_USE_RESERVE);
void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
HDR_GET_LSIZE(hdr), &hdr->b_complevel);
if (ret != 0) {
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
goto error;
}
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_pabd = cabd;
@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
}
if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
size_t bufsize = MAX(size, asize);
void *buf = zio_buf_alloc(bufsize);
uint64_t csize = zio_compress_data(compress, to_write, &buf,
cabd = abd_alloc_for_io(MAX(size, asize), ismd);
uint64_t csize = zio_compress_data(compress, to_write, &cabd,
size, hdr->b_complevel);
if (csize > psize) {
/*
@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
* psize. Even if it fits into asize, it does not
* matter, since checksum will never match on read.
*/
zio_buf_free(buf, bufsize);
abd_free(cabd);
return (SET_ERROR(EIO));
}
if (asize > csize)
memset((char *)buf + csize, 0, asize - csize);
to_write = cabd = abd_get_from_buf(buf, bufsize);
abd_take_ownership_of_buf(cabd, B_TRUE);
abd_zero_off(cabd, csize, asize - csize);
to_write = cabd;
}
if (HDR_ENCRYPTED(hdr)) {
@ -9158,12 +9146,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
*/
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
/*
* If pass == 1 or 3, we cache MRU metadata and data
* respectively.
* pass == 0: MFU meta
* pass == 1: MRU meta
* pass == 2: MFU data
* pass == 3: MRU data
*/
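/*
 * In short (new semantics): 0 caches MRU and MFU alike; 1 caches
 * MFU only; any value > 1 caches all metadata (MRU+MFU) but only
 * MFU data.
 */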
if (l2arc_mfuonly) {
if (l2arc_mfuonly == 1) {
if (pass == 1 || pass == 3)
continue;
} else if (l2arc_mfuonly > 1) {
if (pass == 3)
continue;
}
uint64_t passed_sz = 0;
@ -10179,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
{
int err = 0;
zio_cksum_t cksum;
abd_t *abd = NULL;
uint64_t asize;
ASSERT(this_lbp != NULL && next_lbp != NULL);
@ -10241,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
case ZIO_COMPRESS_OFF:
break;
case ZIO_COMPRESS_LZ4:
abd = abd_alloc_for_io(asize, B_TRUE);
case ZIO_COMPRESS_LZ4: {
abd_t *abd = abd_alloc_linear(asize, B_TRUE);
abd_copy_from_buf_off(abd, this_lb, 0, asize);
if ((err = zio_decompress_data(
abd_t dabd;
abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
err = zio_decompress_data(
L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
abd, &dabd, asize, sizeof (*this_lb), NULL);
abd_free(&dabd);
abd_free(abd);
if (err != 0) {
err = SET_ERROR(EINVAL);
goto cleanup;
}
break;
}
default:
err = SET_ERROR(EINVAL);
goto cleanup;
@ -10267,8 +10265,6 @@ cleanup:
l2arc_log_blk_fetch_abort(*next_io);
*next_io = NULL;
}
if (abd != NULL)
abd_free(abd);
return (err);
}
@ -10504,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
uint64_t psize, asize;
zio_t *wzio;
l2arc_lb_abd_buf_t *abd_buf;
uint8_t *tmpbuf = NULL;
abd_t *abd = NULL;
l2arc_lb_ptr_buf_t *lb_ptr_buf;
VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
@ -10527,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
/* try to compress the buffer */
psize = zio_compress_data(ZIO_COMPRESS_LZ4,
abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
abd_buf->abd, &abd, sizeof (*lb), 0);
/* a log block is never entirely zero */
ASSERT(psize != 0);
@ -10553,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
ZIO_CHECKSUM_FLETCHER_4);
if (asize < sizeof (*lb)) {
/* compression succeeded */
memset(tmpbuf + psize, 0, asize - psize);
abd_zero_off(abd, psize, asize - psize);
L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_LZ4);
} else {
/* compression failed */
memcpy(tmpbuf, lb, sizeof (*lb));
abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_OFF);
}
/* checksum what we're about to write */
fletcher_4_native(tmpbuf, asize, NULL,
abd_fletcher_4_native(abd, asize, NULL,
&l2dhdr->dh_start_lbps[0].lbp_cksum);
abd_free(abd_buf->abd);
/* perform the write itself */
abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
abd_buf->abd = abd;
wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);

View File

@ -142,8 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
uint8_t dstbuf[BPE_PAYLOAD_SIZE];
decode_embedded_bp_compressed(bp, dstbuf);
VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
dstbuf, buf, psize, buflen, NULL));
abd_t cabd, dabd;
abd_get_from_buf_struct(&cabd, dstbuf, psize);
abd_get_from_buf_struct(&dabd, buf, buflen);
VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd,
&dabd, psize, buflen, NULL));
abd_free(&dabd);
abd_free(&cabd);
} else {
ASSERT3U(lsize, ==, psize);
decode_embedded_bp_compressed(bp, buf);

View File

@ -204,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
void
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
{
if (dk->dk_kstats == NULL)
return;
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
char *ds_name;

File diff suppressed because it is too large

View File

@ -0,0 +1,778 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2023, Klara Inc.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
/*
* No more than this many txgs before swapping logs.
*/
uint_t zfs_dedup_log_txg_max = 8;
/*
* Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
* load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
*/
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;
static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;
#define DDT_LOG_ENTRY_FLAT_SIZE \
(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define DDT_LOG_ENTRY_TRAD_SIZE \
(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
#define DDT_LOG_ENTRY_SIZE(ddt) \
_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
void
ddt_log_init(void)
{
ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
/*
* Max memory for log AVL entries. At least 1M, because we need
* something (that's ~3800 entries per tree). They can say 100% if they
* want; it just means they're at the mercy of the txg flush limit.
*/
if (zfs_dedup_log_mem_max == 0) {
zfs_dedup_log_mem_max_percent =
MIN(zfs_dedup_log_mem_max_percent, 100);
zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
zfs_dedup_log_mem_max_percent / 100;
}
zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
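/*
 * Worked example (assumed numbers, not from this change): with 16 GiB
 * of physical memory and the default 1% cap, zfs_dedup_log_mem_max
 * comes to ~164 MiB, so each of the two log trees may grow to ~82 MiB
 * before a size-triggered swap (see ddt_log_swap()).
 */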
void
ddt_log_fini(void)
{
kmem_cache_destroy(ddt_log_entry_trad_cache);
kmem_cache_destroy(ddt_log_entry_flat_cache);
}
static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}
static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
dmu_buf_t *db;
VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
DLH_SET_VERSION(hdr, 1);
DLH_SET_FLAGS(hdr, ddl->ddl_flags);
hdr->dlh_length = ddl->ddl_length;
hdr->dlh_first_txg = ddl->ddl_first_txg;
hdr->dlh_checkpoint = ddl->ddl_checkpoint;
dmu_buf_rele(db, FTAG);
}
static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
ASSERT3U(ddt->ddt_dir_object, >, 0);
ASSERT3U(ddl->ddl_object, ==, 0);
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
sizeof (uint64_t), 1, &ddl->ddl_object, tx));
ddl->ddl_length = 0;
ddl->ddl_first_txg = tx->tx_txg;
ddt_log_update_header(ddt, ddl, tx);
}
static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
ASSERT3U(ddt->ddt_dir_object, >, 0);
if (ddl->ddl_object == 0)
return;
ASSERT0(ddl->ddl_length);
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
ddl->ddl_object = 0;
}
void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
static void
ddt_log_update_stats(ddt_t *ddt)
{
/*
* Log object stats. We count the number of live entries in the log
* tree, even if that is more than what is on disk, and even if the same
* entry is on both append and flush trees, because that's more what
* the user expects to see. This does mean the on-disk size is not
* really correlated with the number of entries, but I don't think
* that's reasonable to expect anyway.
*/
dmu_object_info_t doi;
uint64_t nblocks;
dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
nblocks = doi.doi_physical_blocks_512;
dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
nblocks += doi.doi_physical_blocks_512;
ddt_object_t *ddo = &ddt->ddt_log_stats;
ddo->ddo_count =
avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
ddo->ddo_dspace = nblocks << 9;
}
void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
ASSERT3U(nentries, >, 0);
ASSERT3P(dlu->dlu_dbp, ==, NULL);
if (ddt->ddt_log_active->ddl_object == 0)
ddt_log_create(ddt, tx);
/*
* We want to store as many entries as we can in a block, but never
* split an entry across block boundaries.
*/
size_t reclen = P2ALIGN_TYPED(
sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
ASSERT3U(reclen, <=, UINT16_MAX);
dlu->dlu_reclen = reclen;
VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
&dlu->dlu_dn));
dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
uint64_t nblocks = howmany(nentries,
dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
uint64_t offset = ddt->ddt_log_active->ddl_length;
uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
DMU_READ_NO_PREFETCH));
dlu->dlu_tx = tx;
dlu->dlu_block = dlu->dlu_offset = 0;
}
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
ddt_log_entry_t *ddle;
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
} else {
ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
}
return (ddle);
}
static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
/* Create the log tree entry from a live or stored entry */
avl_index_t where;
ddt_log_entry_t *ddle =
avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
if (ddle == NULL) {
ddle = ddt_log_alloc_entry(ddt);
ddle->ddle_key = ddlwe->ddlwe_key;
avl_insert(&ddl->ddl_tree, ddle, where);
}
ddle->ddle_type = ddlwe->ddlwe_type;
ddle->ddle_class = ddlwe->ddlwe_class;
memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}
void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
/* Get our block */
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
/*
* If this would take us past the end of the block, finish it and
* move to the next one.
*/
if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
ASSERT3U(dlu->dlu_offset, >, 0);
dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
dlu->dlu_block++;
dlu->dlu_offset = 0;
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
db = dlu->dlu_dbp[dlu->dlu_block];
}
/*
* If this is the first time touching the block, inform the DMU that
* we will fill it, and zero it out.
*/
if (dlu->dlu_offset == 0) {
dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
memset(db->db_data, 0, db->db_size);
}
/* Create the log record directly in the buffer */
ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
DLR_SET_TYPE(dlr, DLR_ENTRY);
DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
ddt_log_record_entry_t *dlre =
(ddt_log_record_entry_t *)&dlr->dlr_payload;
dlre->dlre_key = ddlwe->ddlwe_key;
memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
/* Advance offset for next record. */
dlu->dlu_offset += dlu->dlu_reclen;
}
void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
ASSERT3U(dlu->dlu_offset, >, 0);
/*
* Close out the last block. Whatever we haven't used will be zeroed,
* which matches DLR_INVALID, so we can detect this during load.
*/
dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
ddt->ddt_log_active->ddl_length +=
dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
dnode_rele(dlu->dlu_dn, FTAG);
ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
memset(dlu, 0, sizeof (ddt_log_update_t));
ddt_log_update_stats(ddt);
}
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
if (ddle == NULL)
return (B_FALSE);
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
return (B_TRUE);
}
boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
if (ddle == NULL)
return (B_FALSE);
ddt_lightweight_entry_t ddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
return (B_TRUE);
}
boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe)
{
ddt_log_entry_t *ddle =
avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
if (!ddle)
ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
if (!ddle)
return (B_FALSE);
if (ddlwe)
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
return (B_TRUE);
}
void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
ddt_log_t *ddl = ddt->ddt_log_flushing;
ASSERT3U(ddl->ddl_object, !=, 0);
#ifdef ZFS_DEBUG
/*
* There should not be any entries on the log tree before the given
* checkpoint. Assert that this is the case.
*/
ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
if (ddle != NULL)
VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
>, 0);
#endif
ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
ddl->ddl_checkpoint = ddlwe->ddlwe_key;
ddt_log_update_header(ddt, ddl, tx);
ddt_log_update_stats(ddt);
}
void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_t *ddl = ddt->ddt_log_flushing;
if (ddl->ddl_object == 0)
return;
ASSERT(avl_is_empty(&ddl->ddl_tree));
/* Eject the entire object */
dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
ddl->ddl_length = 0;
ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
ddt_log_update_header(ddt, ddl, tx);
ddt_log_update_stats(ddt);
}
boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
/* Swap the logs. The old flushing one must be empty */
VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
/*
* If there are still blocks on the flushing log, truncate it first.
* This can happen if there were entries on the flushing log that were
* removed in memory via ddt_lookup(); their vestigial remains are
* on disk.
*/
if (ddt->ddt_log_flushing->ddl_length > 0)
ddt_log_truncate(ddt, tx);
/*
* Swap policy. We swap the logs (and so begin flushing) when the
* active tree grows too large, or when we haven't swapped it in
* some amount of time, or if something has requested the logs be
* flushed ASAP (see ddt_walk_init()).
*/
/*
* The log tree is too large if the memory usage of its entries is over
* half of the memory limit. This effectively gives each log tree half
* the available memory.
*/
const boolean_t too_large =
(avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
const boolean_t too_old =
tx->tx_txg >=
(ddt->ddt_log_active->ddl_first_txg +
MAX(1, zfs_dedup_log_txg_max));
const boolean_t force =
ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
if (!(too_large || too_old || force))
return (B_FALSE);
ddt_log_t *swap = ddt->ddt_log_active;
ddt->ddt_log_active = ddt->ddt_log_flushing;
ddt->ddt_log_flushing = swap;
ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
ddt->ddt_log_active->ddl_flags &=
~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
ddt_log_update_stats(ddt);
return (B_TRUE);
}
static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
const ddt_key_t *checkpoint)
{
ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
ddt_log_record_entry_t *dlre =
(ddt_log_record_entry_t *)dlr->dlr_payload;
if (checkpoint != NULL &&
ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
/* Skip pre-checkpoint entries; they're already flushed. */
return;
}
ddt_lightweight_entry_t ddlwe;
ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
ddlwe.ddlwe_key = dlre->dlre_key;
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
ddt_log_update_entry(ddt, ddl, &ddlwe);
}
static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
void *cookie = NULL;
ddt_log_entry_t *ddle;
IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
while ((ddle =
avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
}
ASSERT(avl_is_empty(&ddl->ddl_tree));
}
static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
ASSERT3U(n, <, 2);
ddt_log_t *ddl = &ddt->ddt_log[n];
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
uint64_t obj;
int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
sizeof (uint64_t), 1, &obj);
if (err == ENOENT)
return (0);
if (err != 0)
return (err);
dnode_t *dn;
err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
if (err != 0)
return (err);
ddt_log_header_t hdr;
dmu_buf_t *db;
err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
dnode_rele(dn, FTAG);
return (err);
}
memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
dmu_buf_rele(db, FTAG);
if (DLH_GET_VERSION(&hdr) != 1) {
dnode_rele(dn, FTAG);
zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
"unknown version=%llu", spa_name(ddt->ddt_spa), name,
(u_longlong_t)DLH_GET_VERSION(&hdr));
return (SET_ERROR(EINVAL));
}
ddt_key_t *checkpoint = NULL;
if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
/*
* If the log has a checkpoint, then we can ignore any entries
* that have already been flushed.
*/
ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
checkpoint = &hdr.dlh_checkpoint;
}
if (hdr.dlh_length > 0) {
dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
ZIO_PRIORITY_SYNC_READ);
for (uint64_t offset = 0; offset < hdr.dlh_length;
offset += dn->dn_datablksz) {
err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
DMU_READ_PREFETCH);
if (err != 0) {
dnode_rele(dn, FTAG);
ddt_log_empty(ddt, ddl);
return (err);
}
uint64_t boffset = 0;
while (boffset < db->db_size) {
ddt_log_record_t *dlr =
(ddt_log_record_t *)(db->db_data + boffset);
/* Partially-filled block, skip the rest */
if (DLR_GET_TYPE(dlr) == DLR_INVALID)
break;
switch (DLR_GET_TYPE(dlr)) {
case DLR_ENTRY:
ddt_log_load_entry(ddt, ddl, dlr,
checkpoint);
break;
default:
dmu_buf_rele(db, FTAG);
dnode_rele(dn, FTAG);
ddt_log_empty(ddt, ddl);
return (SET_ERROR(EINVAL));
}
boffset += DLR_GET_RECLEN(dlr);
}
dmu_buf_rele(db, FTAG);
}
}
dnode_rele(dn, FTAG);
ddl->ddl_object = obj;
ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
ddl->ddl_length = hdr.dlh_length;
ddl->ddl_first_txg = hdr.dlh_first_txg;
if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
ddt->ddt_log_flushing = ddl;
else
ddt->ddt_log_active = ddl;
return (0);
}
int
ddt_log_load(ddt_t *ddt)
{
int err;
if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
/*
* The DDT is going to be freed again in a moment, so there's
* no point loading the log; it'll just slow down import.
*/
return (0);
}
ASSERT0(ddt->ddt_log[0].ddl_object);
ASSERT0(ddt->ddt_log[1].ddl_object);
if (ddt->ddt_dir_object == 0) {
/*
* If we're configured but the containing dir doesn't exist
* yet, then the log object can't possibly exist either.
*/
ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
return (SET_ERROR(ENOENT));
}
if ((err = ddt_log_load_one(ddt, 0)) != 0)
return (err);
if ((err = ddt_log_load_one(ddt, 1)) != 0)
return (err);
VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
/*
* We have two finalisation tasks:
*
* - rebuild the histogram. We do this at the end rather than while
* we're loading so we don't need to uncount and recount entries that
* appear multiple times in the log.
*
* - remove entries from the flushing tree that are on both trees. This
* happens when ddt_lookup() rehydrates an entry from the flushing
* tree, as ddt_log_remove_key() removes the entry from the in-memory
* tree but doesn't remove it from disk.
*/
/*
* We don't technically need a config lock here, since there shouldn't
* be pool config changes during DDT load. dva_get_dsize_sync() via
* ddt_stat_generate() is expecting it though, and it won't hurt
* anything, so we take it.
*/
spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
ddt_log_entry_t *ae = avl_first(al);
ddt_log_entry_t *fe = avl_first(fl);
while (ae != NULL || fe != NULL) {
ddt_log_entry_t *ddle;
if (ae == NULL) {
/* active exhausted, take flushing */
ddle = fe;
fe = AVL_NEXT(fl, fe);
} else if (fe == NULL) {
/* flushing exhausted, take active */
ddle = ae;
ae = AVL_NEXT(al, ae);
} else {
/* compare active and flushing */
int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
if (c < 0) {
/* active behind, take and advance */
ddle = ae;
ae = AVL_NEXT(al, ae);
} else if (c > 0) {
/* flushing behind, take and advance */
ddle = fe;
fe = AVL_NEXT(fl, fe);
} else {
/* match. remove from flushing, take active */
ddle = fe;
fe = AVL_NEXT(fl, fe);
avl_remove(fl, ddle);
ddle = ae;
ae = AVL_NEXT(al, ae);
}
}
ddt_lightweight_entry_t ddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
}
spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
ddt_log_update_stats(ddt);
return (0);
}
void
ddt_log_alloc(ddt_t *ddt)
{
ASSERT3P(ddt->ddt_log_active, ==, NULL);
ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
ddt->ddt_log_active = &ddt->ddt_log[0];
ddt->ddt_log_flushing = &ddt->ddt_log[1];
ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}
void
ddt_log_free(ddt_t *ddt)
{
ddt_log_empty(ddt, &ddt->ddt_log[0]);
ddt_log_empty(ddt, &ddt->ddt_log[1]);
avl_destroy(&ddt->ddt_log[0].ddl_tree);
avl_destroy(&ddt->ddt_log[1].ddl_tree);
}
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
"Max transactions before starting to flush dedup logs");
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
"Max memory for dedup logs");
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
"Max memory for dedup logs, as % of total memory");

View File

@ -33,27 +33,32 @@
#include <sys/ddt_impl.h>
static void
ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
ddt_stat_t *dds)
{
spa_t *spa = ddt->ddt_spa;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
uint64_t lsize = DDK_GET_LSIZE(ddk);
uint64_t psize = DDK_GET_PSIZE(ddk);
uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key);
uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key);
memset(dds, 0, sizeof (*dds));
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
uint64_t dsize = 0;
uint64_t refcnt = ddp->ddp_refcnt;
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddp->ddp_phys_birth == 0)
if (ddt_phys_birth(ddp, v) == 0)
continue;
int ndvas = DDK_GET_CRYPT(&dde->dde_key) ?
SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
int ndvas = ddt_phys_dva_count(ddp, v,
DDK_GET_CRYPT(&ddlwe->ddlwe_key));
const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
uint64_t dsize = 0;
for (int d = 0; d < ndvas; d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
dsize += dva_get_dsize_sync(spa, &dvas[d]);
uint64_t refcnt = ddt_phys_refcnt(ddp, v);
dds->dds_blocks += 1;
dds->dds_lsize += lsize;
@ -67,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
}
}
void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
static void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src)
{
const uint64_t *s = (const uint64_t *)src;
uint64_t *d = (uint64_t *)dst;
uint64_t *d_end = (uint64_t *)(dst + 1);
dst->dds_blocks += src->dds_blocks;
dst->dds_lsize += src->dds_lsize;
dst->dds_psize += src->dds_psize;
dst->dds_dsize += src->dds_dsize;
dst->dds_ref_blocks += src->dds_ref_blocks;
dst->dds_ref_lsize += src->dds_ref_lsize;
dst->dds_ref_psize += src->dds_ref_psize;
dst->dds_ref_dsize += src->dds_ref_dsize;
}
ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
static void
ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src)
{
/* This caught more during development than you might expect... */
ASSERT3U(dst->dds_blocks, >=, src->dds_blocks);
ASSERT3U(dst->dds_lsize, >=, src->dds_lsize);
ASSERT3U(dst->dds_psize, >=, src->dds_psize);
ASSERT3U(dst->dds_dsize, >=, src->dds_dsize);
ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks);
ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize);
ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize);
ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize);
for (int i = 0; i < d_end - d; i++)
d[i] += (s[i] ^ neg) - neg;
dst->dds_blocks -= src->dds_blocks;
dst->dds_lsize -= src->dds_lsize;
dst->dds_psize -= src->dds_psize;
dst->dds_dsize -= src->dds_dsize;
dst->dds_ref_blocks -= src->dds_ref_blocks;
dst->dds_ref_lsize -= src->dds_ref_lsize;
dst->dds_ref_psize -= src->dds_ref_psize;
dst->dds_ref_dsize -= src->dds_ref_dsize;
}
void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe)
{
ddt_stat_t dds;
ddt_histogram_t *ddh;
int bucket;
ddt_stat_generate(ddt, dde, &dds);
ddt_stat_generate(ddt, ddlwe, &dds);
bucket = highbit64(dds.dds_ref_blocks) - 1;
ASSERT3U(bucket, >=, 0);
if (bucket < 0)
return;
ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
ddt_stat_add(&ddh->ddh_stat[bucket], &dds);
}
ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
void
ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe)
{
ddt_stat_t dds;
int bucket;
ddt_stat_generate(ddt, ddlwe, &dds);
bucket = highbit64(dds.dds_ref_blocks) - 1;
if (bucket < 0)
return;
ddt_stat_sub(&ddh->ddh_stat[bucket], &dds);
}
void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
for (int h = 0; h < 64; h++)
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]);
}
void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{
memset(dds, 0, sizeof (*dds));
for (int h = 0; h < 64; h++)
ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
ddt_stat_add(dds, &ddh->ddh_stat[h]);
}
boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh)
{
const uint64_t *s = (const uint64_t *)ddh;
const uint64_t *s_end = (const uint64_t *)(ddh + 1);
for (int h = 0; h < 64; h++) {
const ddt_stat_t *dds = &ddh->ddh_stat[h];
while (s < s_end)
if (*s++ != 0)
return (B_FALSE);
if (dds->dds_blocks == 0 &&
dds->dds_lsize == 0 &&
dds->dds_psize == 0 &&
dds->dds_dsize == 0 &&
dds->dds_ref_blocks == 0 &&
dds->dds_ref_lsize == 0 &&
dds->dds_ref_psize == 0 &&
dds->dds_ref_dsize == 0)
continue;
return (B_FALSE);
}
return (B_TRUE);
}
@ -170,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
ddo_total->ddo_mspace += ddo->ddo_mspace;
}
}
ddt_object_t *ddo = &ddt->ddt_log_stats;
ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace;
}
/*
@ -207,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
&ddt->ddt_histogram_cache[type][class]);
}
}
ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
}
}
@ -217,7 +276,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
ddt_get_dedup_histogram(spa, ddh_total);
ddt_histogram_stat(dds_total, ddh_total);
ddt_histogram_total(dds_total, ddh_total);
kmem_free(ddh_total, sizeof (ddt_histogram_t));
}

View File

@ -22,6 +22,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
* Copyright (c) 2023, Klara Inc.
*/
#include <sys/zfs_context.h>
@ -51,8 +52,13 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */
c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1,
ci->ci_level);
/* Call compress function directly to avoid hole detection. */
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, (void *)src, s_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
abd_free(&dabd);
abd_free(&sabd);
if (c_len == s_len) {
cpfunc = ZIO_COMPRESS_OFF;
@ -71,12 +77,18 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
uchar_t version = *src++;
int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
zio_compress_info_t *ci = &zio_compress_table[cpfunc];
if (ci->ci_decompress != NULL)
(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
else
if (zio_compress_table[cpfunc].ci_decompress == NULL) {
memcpy(dst, src, d_len);
return;
}
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, src, s_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
abd_free(&dabd);
abd_free(&sabd);
if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
(ZFS_HOST_BYTEORDER != 0))
@ -108,7 +120,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
static int
ddt_zap_lookup(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize)
const ddt_key_t *ddk, void *phys, size_t psize)
{
uchar_t *cbuf;
uint64_t one, csize;
@ -155,7 +167,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)
static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx)
const void *phys, size_t psize, dmu_tx_t *tx)
{
const size_t cbuf_size = psize + 1;
@ -181,7 +193,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
ddt_phys_t *phys, size_t psize)
void *phys, size_t psize)
{
zap_cursor_t zc;
zap_attribute_t za;

View File

@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * 1024 * 1024;
uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
#endif
/*
* Override copies= for dedup state objects. 0 means the traditional
* behaviour (i.e. the default for the containing objset, which is 3 for
* the MOS).
*/
uint_t dmu_ddt_copies = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
{DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
case ZFS_REDUNDANT_METADATA_NONE:
break;
}
if (dmu_ddt_copies > 0) {
/*
* If this tuneable is set, and this is a write for a
* dedup entry store (zap or log), then we treat it
* much like ZFS_REDUNDANT_METADATA_MOST on a
* regular dataset: this many copies, and one more for
* "higher" indirect blocks. This specific exception is
* necessary because dedup objects are stored in the
* MOS, which always has the highest possible copies.
*/
dmu_object_type_t stype =
dn ? dn->dn_storage_type : DMU_OT_NONE;
if (stype == DMU_OT_NONE)
stype = type;
if (stype == DMU_OT_DDT_ZAP) {
copies = dmu_ddt_copies;
if (level >=
zfs_redundant_metadata_most_ditto_level)
copies++;
}
}
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
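
To make the interaction of the two tunables concrete, here is an illustrative, self-contained restatement of the copies selection above. The standalone function and its parameter names are hypothetical; in the source this logic is inline in dmu_write_policy():

/*
 * Example: with ddt_copies = 2 and ditto_level = 2, a DDT ZAP leaf
 * block (level 0) is written with 2 copies, while a level-2 or
 * higher indirect block gets 3.
 */
static unsigned int
ddt_zap_copies(unsigned int ddt_copies, unsigned int ditto_level,
    int level)
{
	unsigned int copies = ddt_copies;	/* assumes ddt_copies > 0 */

	if (level >= (int)ditto_level)
		copies++;	/* one extra copy for higher indirects */
	return (copies);
}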
@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
"Limit one prefetch call to this size");
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
"Override copies= for dedup objects");

View File

@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
abd_t *dabd = abd_alloc_linear(
drrw->drr_logical_size, B_FALSE);
err = zio_decompress_data(drrw->drr_compressiontype,
abd, abd_to_buf(dabd), abd_get_size(abd),
abd, dabd, abd_get_size(abd),
abd_get_size(dabd), NULL);
if (err != 0) {
@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
/* Recompress the data */
abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
B_FALSE);
void *buf = abd_to_buf(cabd);
uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
abd, &buf, abd_get_size(abd),
abd, &cabd, abd_get_size(abd),
rwa->os->os_complevel);
abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
/* Swap in newly compressed data into the abd */
@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
err = zio_decompress_data(
drrw->drr_compressiontype,
abd, abd_to_buf(decomp_abd),
abd, decomp_abd,
abd_get_size(abd),
abd_get_size(decomp_abd), NULL);

View File

@ -2425,8 +2425,14 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
compressed_size = gzip_compress(packed, compressed,
/* Call compress function directly to avoid hole detection. */
abd_t pabd, cabd;
abd_get_from_buf_struct(&pabd, packed, packed_size);
abd_get_from_buf_struct(&cabd, compressed, packed_size);
compressed_size = zfs_gzip_compress(&pabd, &cabd,
packed_size, packed_size, 6);
abd_free(&cabd);
abd_free(&pabd);
zio_cksum_t cksum;
fletcher_4_native_varsize(compressed, compressed_size, &cksum);

View File

@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
zap_cursor_fini(&zc);
}
ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
spa_scan_stat_init(spa);
vdev_scan_stat_init(spa->spa_root_vdev);
@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
spa_history_log_internal(spa, "scan setup", tx,
@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
txg_sync_waiting(scn->scn_dp) ||
NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
spa_shutting_down(scn->scn_dp->dp_spa) ||
(zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
(zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
!ddt_walk_ready(scn->scn_dp->dp_spa)) {
if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
dprintf("suspending at first available bookmark "
"%llx/%llx/%llx/%llx\n",
@ -2929,11 +2934,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx)
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
(void) tx;
const ddt_key_t *ddk = &dde->dde_key;
ddt_phys_t *ddp = dde->dde_phys;
const ddt_key_t *ddk = &ddlwe->ddlwe_key;
blkptr_t bp;
zbookmark_phys_t zb = { 0 };
@ -2954,11 +2958,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
if (scn->scn_done_txg != 0)
return;
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
continue;
ddt_bp_create(checksum, ddk, ddp, &bp);
ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);
scn->scn_visited_this_txg++;
scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
@ -3002,11 +3008,11 @@ static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
ddt_entry_t dde = {{{{0}}}};
ddt_lightweight_entry_t ddlwe = {0};
int error;
uint64_t n = 0;
while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
ddt_t *ddt;
if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
@ -3021,16 +3027,28 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
ASSERT(avl_first(&ddt->ddt_tree) == NULL);
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
n++;
if (dsl_scan_check_suspend(scn, NULL))
break;
}
zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
"suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
if (error == EAGAIN) {
dsl_scan_check_suspend(scn, NULL);
error = 0;
zfs_dbgmsg("waiting for ddt to become ready for scan "
"on %s with class_max = %u; suspending=%u",
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);
} else
zfs_dbgmsg("scanned %llu ddt entries on %s with "
"class_max = %u; suspending=%u", (longlong_t)n,
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);
ASSERT(error == 0 || error == ENOENT);
ASSERT(error != ENOENT ||

View File

@ -47,8 +47,9 @@ typedef uLongf zlen_t;
#endif
size_t
gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
int ret;
zlen_t dstlen = d_len;
@ -82,8 +83,9 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return ((size_t)dstlen);
}
int
gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
(void) n;
zlen_t dstlen = d_len;
@ -103,3 +105,6 @@ gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (0);
}
ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress)
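
The ZFS_COMPRESS_WRAP_DECL/ZFS_DECOMPRESS_WRAP_DECL macros generate the abd-taking entry points around these static _buf workers. A hedged sketch of the shape such a generated compressor could take, built only from borrow/return calls that exist in the OpenZFS abd API; this illustrates the pattern and is not the macro's actual expansion from zio_compress.h:

size_t
zfs_gzip_compress_sketch(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level)
{
	/* Borrow linear views of the abds for the buffer-based worker. */
	void *s_buf = abd_borrow_buf_copy(src, s_len);	/* copies data in */
	void *d_buf = abd_borrow_buf(dst, d_len);
	size_t c_len = zfs_gzip_compress_buf(s_buf, d_buf, s_len,
	    d_len, level);

	abd_return_buf(src, s_buf, s_len);	/* source left unchanged */
	abd_return_buf_copy(dst, d_buf, d_len);	/* copy result back out */
	return (c_len);
}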

View File

@ -53,8 +53,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
static void *lz4_alloc(int flags);
static void lz4_free(void *ctx);
size_t
lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
static size_t
zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
(void) n;
@ -81,8 +81,8 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
return (bufsiz + sizeof (bufsiz));
}
int
lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
static int
zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
(void) n;
@ -101,6 +101,9 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
d_start, bufsiz, d_len) < 0);
}
ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress)
/*
* LZ4 API Description:
*

Some files were not shown because too many files have changed in this diff.