diff --git a/sys/contrib/openzfs/.mailmap b/sys/contrib/openzfs/.mailmap index 5aa1eef464d9..64e02681e37d 100644 --- a/sys/contrib/openzfs/.mailmap +++ b/sys/contrib/openzfs/.mailmap @@ -70,6 +70,7 @@ Rob Norris Rob Norris Sam Lunt Sanjeev Bagewadi +Sebastian Wuerl Stoiko Ivanov Tamas TEVESZ WHR @@ -78,6 +79,7 @@ Youzhong Yang # Signed-off-by: overriding Author: Ryan +Sietse Qiuhao Chen Yuxin Wang Zhenlei Huang diff --git a/sys/contrib/openzfs/AUTHORS b/sys/contrib/openzfs/AUTHORS index 6a5cc088e651..b4342f6912ae 100644 --- a/sys/contrib/openzfs/AUTHORS +++ b/sys/contrib/openzfs/AUTHORS @@ -423,6 +423,7 @@ CONTRIBUTORS: Mathieu Velten Matt Fiddaman Matthew Ahrens + Matthew Heller Matthew Thode Matthias Blankertz Matt Johnston @@ -562,6 +563,7 @@ CONTRIBUTORS: Scot W. Stevenson Sean Eric Fagan Sebastian Gottschall + Sebastian Wuerl Sebastien Roy Sen Haerens Serapheim Dimitropoulos @@ -574,6 +576,7 @@ CONTRIBUTORS: Shawn Bayern Shengqi Chen Shen Yan + Sietse Simon Guest Simon Klinkert Sowrabha Gopal @@ -629,6 +632,7 @@ CONTRIBUTORS: Trevor Bautista Trey Dockendorf Troels Nørgaard + tstabrawa Tulsi Jain Turbo Fredriksson Tyler J. Stachecki diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 185cca4a44d4..dc19ac37b355 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.11 +Linux-Maximum: 6.12 Linux-Minimum: 4.18 diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary index c24d400fa39a..72381d266e64 100755 --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/arc_summary @@ -662,10 +662,7 @@ def section_arc(kstats_dict): print() print('ARC hash breakdown:') - prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max'])) - prt_i2('Elements current:', - f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']), - f_hits(arc_stats['hash_elements'])) + prt_i1('Elements:', f_hits(arc_stats['hash_elements'])) prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 89d19b675fdd..ae88e9edc17d 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -2119,9 +2119,6 @@ dump_brt(spa_t *spa) return; } - brt_t *brt = spa->spa_brt; - VERIFY(brt); - char count[32], used[32], saved[32]; zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); @@ -2132,11 +2129,8 @@ dump_brt(spa_t *spa) if (dump_opt['T'] < 2) return; - for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; - if (brtvd == NULL) - continue; - + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; if (!brtvd->bv_initiated) { printf("BRT: vdev %" PRIu64 ": empty\n", vdevid); continue; @@ -2160,20 +2154,21 @@ dump_brt(spa_t *spa) if (!do_histo) printf("\n%-16s %-10s\n", "DVA", "REFCNT"); - for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; - if (brtvd == NULL || !brtvd->bv_initiated) + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + if (!brtvd->bv_initiated) continue; uint64_t counts[64] = {}; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); - for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries); + for (zap_cursor_init(&zc, spa->spa_meta_objset, + brtvd->bv_mos_entries); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t refcnt; - VERIFY0(zap_lookup_uint64(brt->brt_mos, + VERIFY0(zap_lookup_uint64(spa->spa_meta_objset, brtvd->bv_mos_entries, (const uint64_t *)za->za_name, 1, za->za_integer_length, za->za_num_integers, @@ -8227,14 +8222,11 @@ dump_mos_leaks(spa_t *spa) } } - if (spa->spa_brt != NULL) { - brt_t *brt = spa->spa_brt; - for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; - if (brtvd != NULL && brtvd->bv_initiated) { - mos_obj_refd(brtvd->bv_mos_brtvdev); - mos_obj_refd(brtvd->bv_mos_entries); - } + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + if (brtvd->bv_initiated) { + mos_obj_refd(brtvd->bv_mos_brtvdev); + mos_obj_refd(brtvd->bv_mos_entries); } } diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c index 1ef5c631a438..6f994b68a127 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c @@ -445,8 +445,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * its a loopback event from spa_async_remove(). Just * ignore it. */ - if (vs->vs_state == VDEV_STATE_REMOVED && - state == VDEV_STATE_REMOVED) + if ((vs->vs_state == VDEV_STATE_REMOVED && state == + VDEV_STATE_REMOVED) || vs->vs_state == VDEV_STATE_OFFLINE) return; /* Remove the vdev since device is unplugged */ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h index 9eb424dd0373..615f97351ec4 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h @@ -201,7 +201,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%lld " #OP " %lld) " STR "\n", \ (long long)(_verify3_left), \ (long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ @@ -213,7 +213,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%llu " #OP " %llu) " STR "\n", \ (unsigned long long)(_verify3_left), \ (unsigned long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h index 8df28071252d..c7ade2564757 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode.h @@ -98,11 +98,9 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync) { if (vm_object_mightbedirty(vp->v_object)) { int flags = sync ? OBJPC_SYNC : 0; - vn_lock(vp, LK_SHARED | LK_RETRY); zfs_vmobject_wlock(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, flags); zfs_vmobject_wunlock(vp->v_object); - VOP_UNLOCK(vp); } } #endif diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h index f041dde34fc8..38cc57ae0ca7 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h @@ -205,7 +205,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%lld " #OP " %lld) " STR "\n", \ (long long)(_verify3_left), \ (long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ @@ -217,7 +217,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%llu " #OP " %llu) " STR "\n", \ (unsigned long long)(_verify3_left), \ (unsigned long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \ diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h index 883c07b4ff3d..5148905c93d8 100644 --- a/sys/contrib/openzfs/include/sys/arc.h +++ b/sys/contrib/openzfs/include/sys/arc.h @@ -347,6 +347,7 @@ void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); +void l2arc_spa_rebuild_stop(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index 01693d72dda8..b2839bdf1485 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -942,6 +942,7 @@ typedef struct arc_sums { wmsum_t arcstat_evict_l2_eligible_mru; wmsum_t arcstat_evict_l2_ineligible; wmsum_t arcstat_evict_l2_skip; + wmsum_t arcstat_hash_elements; wmsum_t arcstat_hash_collisions; wmsum_t arcstat_hash_chains; aggsum_t arcstat_size; diff --git a/sys/contrib/openzfs/include/sys/brt_impl.h b/sys/contrib/openzfs/include/sys/brt_impl.h index 9cc06fbb2c3a..168d81f17b72 100644 --- a/sys/contrib/openzfs/include/sys/brt_impl.h +++ b/sys/contrib/openzfs/include/sys/brt_impl.h @@ -86,28 +86,38 @@ typedef struct brt_vdev_phys { uint64_t bvp_savedspace; } brt_vdev_phys_t; -typedef struct brt_vdev { +struct brt_vdev { + /* + * Pending changes from open contexts. + */ + kmutex_t bv_pending_lock; + avl_tree_t bv_pending_tree[TXG_SIZE]; + /* + * Protects bv_mos_*. + */ + krwlock_t bv_mos_entries_lock ____cacheline_aligned; + /* + * Protects all the fields starting from bv_initiated. + */ + krwlock_t bv_lock ____cacheline_aligned; /* * VDEV id. */ - uint64_t bv_vdevid; - /* - * Is the structure initiated? - * (bv_entcount and bv_bitmap are allocated?) - */ - boolean_t bv_initiated; + uint64_t bv_vdevid ____cacheline_aligned; /* * Object number in the MOS for the entcount array and brt_vdev_phys. */ uint64_t bv_mos_brtvdev; /* - * Object number in the MOS for the entries table. + * Object number in the MOS and dnode for the entries table. */ uint64_t bv_mos_entries; + dnode_t *bv_mos_entries_dnode; /* - * Entries to sync. + * Is the structure initiated? + * (bv_entcount and bv_bitmap are allocated?) */ - avl_tree_t bv_tree; + boolean_t bv_initiated; /* * Does the bv_entcount[] array needs byte swapping? */ @@ -120,6 +130,26 @@ typedef struct brt_vdev { * This is the array with BRT entry count per BRT_RANGESIZE. */ uint16_t *bv_entcount; + /* + * bv_entcount[] potentially can be a bit too big to sychronize it all + * when we just changed few entcounts. The fields below allow us to + * track updates to bv_entcount[] array since the last sync. + * A single bit in the bv_bitmap represents as many entcounts as can + * fit into a single BRT_BLOCKSIZE. + * For example we have 65536 entcounts in the bv_entcount array + * (so the whole array is 128kB). We updated bv_entcount[2] and + * bv_entcount[5]. In that case only first bit in the bv_bitmap will + * be set and we will write only first BRT_BLOCKSIZE out of 128kB. + */ + ulong_t *bv_bitmap; + /* + * bv_entcount[] needs updating on disk. + */ + boolean_t bv_entcount_dirty; + /* + * brt_vdev_phys needs updating on disk. + */ + boolean_t bv_meta_dirty; /* * Sum of all bv_entcount[]s. */ @@ -133,65 +163,27 @@ typedef struct brt_vdev { */ uint64_t bv_savedspace; /* - * brt_vdev_phys needs updating on disk. + * Entries to sync. */ - boolean_t bv_meta_dirty; - /* - * bv_entcount[] needs updating on disk. - */ - boolean_t bv_entcount_dirty; - /* - * bv_entcount[] potentially can be a bit too big to sychronize it all - * when we just changed few entcounts. The fields below allow us to - * track updates to bv_entcount[] array since the last sync. - * A single bit in the bv_bitmap represents as many entcounts as can - * fit into a single BRT_BLOCKSIZE. - * For example we have 65536 entcounts in the bv_entcount array - * (so the whole array is 128kB). We updated bv_entcount[2] and - * bv_entcount[5]. In that case only first bit in the bv_bitmap will - * be set and we will write only first BRT_BLOCKSIZE out of 128kB. - */ - ulong_t *bv_bitmap; - uint64_t bv_nblocks; -} brt_vdev_t; + avl_tree_t bv_tree; +}; -/* - * In-core brt - */ -typedef struct brt { - krwlock_t brt_lock; - spa_t *brt_spa; -#define brt_mos brt_spa->spa_meta_objset - uint64_t brt_rangesize; - uint64_t brt_usedspace; - uint64_t brt_savedspace; - avl_tree_t brt_pending_tree[TXG_SIZE]; - kmutex_t brt_pending_lock[TXG_SIZE]; - /* Sum of all entries across all bv_trees. */ - uint64_t brt_nentries; - brt_vdev_t *brt_vdevs; - uint64_t brt_nvdevs; -} brt_t; - -/* Size of bre_offset / sizeof (uint64_t). */ +/* Size of offset / sizeof (uint64_t). */ #define BRT_KEY_WORDS (1) +#define BRE_OFFSET(bre) (DVA_GET_OFFSET(&(bre)->bre_bp.blk_dva[0])) + /* * In-core brt entry. - * On-disk we use bre_offset as the key and bre_refcount as the value. + * On-disk we use ZAP with offset as the key and count as the value. */ typedef struct brt_entry { - uint64_t bre_offset; - uint64_t bre_refcount; avl_node_t bre_node; + blkptr_t bre_bp; + uint64_t bre_count; + uint64_t bre_pcount; } brt_entry_t; -typedef struct brt_pending_entry { - blkptr_t bpe_bp; - int bpe_count; - avl_node_t bpe_node; -} brt_pending_entry_t; - #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index ca30b60c0af7..52601921fc3c 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -53,6 +53,7 @@ extern "C" { /* * Forward references that lots of things need. */ +typedef struct brt_vdev brt_vdev_t; typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 7811abbb9ce3..d1da87105103 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -412,8 +412,12 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + uint64_t spa_rdspace; /* raw (non-dedup) --//-- */ boolean_t spa_active_ddt_prune; /* ddt prune process active */ - struct brt *spa_brt; /* in-core BRT */ + brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */ + uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */ + uint64_t spa_brt_rangesize; /* pool's BRT range size */ + krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ diff --git a/sys/contrib/openzfs/include/sys/zap.h b/sys/contrib/openzfs/include/sys/zap.h index 53166e094a72..c8d24b1100be 100644 --- a/sys/contrib/openzfs/include/sys/zap.h +++ b/sys/contrib/openzfs/include/sys/zap.h @@ -223,11 +223,15 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, boolean_t *normalization_conflictp); int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name); int zap_prefetch_object(objset_t *os, uint64_t zapobj); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); +int zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints); int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); @@ -236,9 +240,6 @@ int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp); -int zap_count_write_by_dnode(dnode_t *dn, const char *name, - int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite); - /* * Create an attribute with the given name and value. * diff --git a/sys/contrib/openzfs/man/man8/zpool-remove.8 b/sys/contrib/openzfs/man/man8/zpool-remove.8 index b5cc6e4fc57e..00216b65a8d7 100644 --- a/sys/contrib/openzfs/man/man8/zpool-remove.8 +++ b/sys/contrib/openzfs/man/man8/zpool-remove.8 @@ -109,7 +109,7 @@ Stops and cancels an in-progress removal of a top-level vdev. .El . .Sh EXAMPLES -.\" These are, respectively, examples 14 from zpool.8 +.\" These are, respectively, examples 15 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device @@ -142,9 +142,43 @@ The command to remove the mirrored log .Ar mirror-2 No is : .Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp +At this point, the log device no longer exists +(both sides of the mirror have been removed): +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 + mirror-1 ONLINE 0 0 0 + sdc ONLINE 0 0 0 + sdd ONLINE 0 0 0 +.Ed +.Pp The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 +.Pp +After +.Ar mirror-1 No has been evacuated, the pool remains redundant, but +the total amount of space is reduced: +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 +.Ed . .Sh SEE ALSO .Xr zpool-add 8 , diff --git a/sys/contrib/openzfs/man/man8/zpool.8 b/sys/contrib/openzfs/man/man8/zpool.8 index 02a258f66708..b54a92f96151 100644 --- a/sys/contrib/openzfs/man/man8/zpool.8 +++ b/sys/contrib/openzfs/man/man8/zpool.8 @@ -405,9 +405,43 @@ The command to remove the mirrored log .Ar mirror-2 No is : .Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp +At this point, the log device no longer exists +(both sides of the mirror have been removed): +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 + mirror-1 ONLINE 0 0 0 + sdc ONLINE 0 0 0 + sdd ONLINE 0 0 0 +.Ed +.Pp The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 +.Pp +After +.Ar mirror-1 No has been evacuated, the pool remains redundant, but +the total amount of space is reduced: +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 +.Ed . .Ss Example 16 : No Displaying expanded space on a device The following command displays the detailed information for the pool diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index d0535cd7f737..d0f07929d22f 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -291,8 +291,12 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, case F_SEEK_HOLE: { off = *(offset_t *)data; + error = vn_lock(vp, LK_SHARED); + if (error) + return (error); /* offset parameter is in/out */ error = zfs_holey(VTOZ(vp), com, &off); + VOP_UNLOCK(vp); if (error) return (error); *(offset_t *)data = off; @@ -452,8 +456,10 @@ mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) if (!vm_page_wired(pp) && pp->valid == 0 && vm_page_busy_tryupgrade(pp)) vm_page_free(pp); - else + else { + vm_page_deactivate_noreuse(pp); vm_page_sunbusy(pp); + } zfs_vmobject_wunlock(obj); } } else { @@ -3928,6 +3934,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0) return (zfs_vm_pagerret_error); + object = ma[0]->object; start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); @@ -3936,33 +3943,47 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * Note that we need to handle the case of the block size growing. */ for (;;) { + uint64_t len; + blksz = zp->z_blksz; + len = roundup(end, blksz) - rounddown(start, blksz); + lr = zfs_rangelock_tryenter(&zp->z_rangelock, - rounddown(start, blksz), - roundup(end, blksz) - rounddown(start, blksz), RL_READER); + rounddown(start, blksz), len, RL_READER); if (lr == NULL) { - if (rahead != NULL) { - *rahead = 0; - rahead = NULL; + /* + * Avoid a deadlock with update_pages(). We need to + * hold the range lock when copying from the DMU, so + * give up the busy lock to allow update_pages() to + * proceed. We might need to allocate new pages, which + * isn't quite right since this allocation isn't subject + * to the page fault handler's OOM logic, but this is + * the best we can do for now. + */ + for (int i = 0; i < count; i++) { + ASSERT(vm_page_none_valid(ma[i])); + vm_page_xunbusy(ma[i]); } - if (rbehind != NULL) { - *rbehind = 0; - rbehind = NULL; - } - break; + + lr = zfs_rangelock_enter(&zp->z_rangelock, + rounddown(start, blksz), len, RL_READER); + + zfs_vmobject_wlock(object); + (void) vm_page_grab_pages(object, OFF_TO_IDX(start), + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, + ma, count); + zfs_vmobject_wunlock(object); } if (blksz == zp->z_blksz) break; zfs_rangelock_exit(lr); } - object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { - if (lr != NULL) - zfs_rangelock_exit(lr); + zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (zfs_vm_pagerret_bad); } @@ -3987,11 +4008,33 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. */ - error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, - &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + for (int i = 0; i < count; i++) { + int dummypgsin, count1, j, last_size; - if (lr != NULL) - zfs_rangelock_exit(lr); + if (vm_page_any_valid(ma[i])) { + ASSERT(vm_page_all_valid(ma[i])); + continue; + } + for (j = i + 1; j < count; j++) { + if (vm_page_any_valid(ma[j])) { + ASSERT(vm_page_all_valid(ma[j])); + break; + } + } + count1 = j - i; + dummypgsin = 0; + last_size = j == count ? + MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE; + error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1, + i == 0 ? &pgsin_b : &dummypgsin, + j == count ? &pgsin_a : &dummypgsin, + last_size); + if (error != 0) + break; + i += count1 - 1; + } + + zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE); @@ -6159,7 +6202,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) } else { #if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \ __FreeBSD_version >= 1400086 - vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false, + vn_lock_pair(invp, false, LK_SHARED, outvp, false, LK_EXCLUSIVE); #else vn_lock_pair(invp, false, outvp, false); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 287f5f36f9dd..b97b701b7460 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -375,7 +375,18 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg) struct super_block *sb = (struct super_block *)arg; int objects = 0; - (void) -zfs_prune(sb, nr_to_scan, &objects); + /* + * deactivate_locked_super calls shrinker_free and only then + * sops->kill_sb cb, resulting in UAF on umount when trying to reach + * for the shrinker functions in zpl_prune_sb of in-umount dataset. + * Increment if s_active is not zero, but don't prune if it is - + * umount could be underway. + */ + if (atomic_inc_not_zero(&sb->s_active)) { + (void) -zfs_prune(sb, nr_to_scan, &objects); + atomic_dec(&sb->s_active); + } + } const struct super_operations zpl_super_operations = { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 2396690b40fd..47aa6417068d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1176,7 +1176,7 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, limits->zql_max_segment_size = UINT_MAX; } - limits->zql_io_opt = zv->zv_volblocksize; + limits->zql_io_opt = DMU_MAX_ACCESS / 2; limits->zql_physical_block_size = zv->zv_volblocksize; limits->zql_max_discard_sectors = diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 76dc0b19139d..fa7baac04b7b 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1074,12 +1074,9 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); - ARCSTAT_MAX(arcstat_hash_chain_max, i); } - uint64_t he = atomic_inc_64_nv( - &arc_stats.arcstat_hash_elements.value.ui64); - ARCSTAT_MAX(arcstat_hash_elements_max, he); + ARCSTAT_BUMP(arcstat_hash_elements); return (NULL); } @@ -1103,8 +1100,7 @@ buf_hash_remove(arc_buf_hdr_t *hdr) arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ - atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); - + ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); @@ -7008,6 +7004,9 @@ arc_kstat_update(kstat_t *ksp, int rw) wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); + as->arcstat_hash_elements.value.ui64 = + as->arcstat_hash_elements_max.value.ui64 = + wmsum_value(&arc_sums.arcstat_hash_elements); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = @@ -7432,6 +7431,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); + wmsum_init(&arc_sums.arcstat_hash_elements, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); @@ -7590,6 +7590,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); + wmsum_fini(&arc_sums.arcstat_hash_elements); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); @@ -9287,6 +9288,14 @@ skip: hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* @@ -9298,12 +9307,6 @@ skip: } list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | - ARC_FLAG_L2_WRITING); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - l2arc_hdr_arcstats_increment(hdr); boolean_t commit = l2arc_log_blk_insert(dev, hdr); mutex_exit(hash_lock); @@ -9333,7 +9336,6 @@ skip: write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - vdev_space_update(dev->l2ad_vdev, asize, 0, 0); if (commit) { /* l2ad_hand will be adjusted inside. */ @@ -9844,6 +9846,37 @@ l2arc_spa_rebuild_start(spa_t *spa) } } +void +l2arc_spa_rebuild_stop(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); + + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) + continue; + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_cancel = B_TRUE; + mutex_exit(&l2arc_rebuild_thr_lock); + } + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) + continue; + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild_began == B_TRUE) { + while (dev->l2ad_rebuild == B_TRUE) { + cv_wait(&l2arc_rebuild_thr_cv, + &l2arc_rebuild_thr_lock); + } + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + /* * Main entry point for L2ARC rebuilding. */ @@ -9852,12 +9885,12 @@ l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; - VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; + cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); @@ -10008,8 +10041,6 @@ l2arc_rebuild(l2arc_dev_t *dev) for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { - dev->l2ad_rebuild = B_FALSE; - cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; @@ -10585,6 +10616,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) (void) zio_nowait(wzio); dev->l2ad_hand += asize; + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. @@ -10598,7 +10631,6 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); - vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c index ea8c0735c4b7..9afee4e208ec 100644 --- a/sys/contrib/openzfs/module/zfs/brt.c +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -243,7 +243,6 @@ */ static kmem_cache_t *brt_entry_cache; -static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. @@ -266,14 +265,11 @@ static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; typedef struct brt_stats { - kstat_named_t brt_addref_entry_in_memory; kstat_named_t brt_addref_entry_not_on_disk; kstat_named_t brt_addref_entry_on_disk; - kstat_named_t brt_addref_entry_read_lost_race; kstat_named_t brt_decref_entry_in_memory; kstat_named_t brt_decref_entry_loaded_from_disk; kstat_named_t brt_decref_entry_not_in_memory; - kstat_named_t brt_decref_entry_not_on_disk; kstat_named_t brt_decref_entry_read_lost_race; kstat_named_t brt_decref_entry_still_referenced; kstat_named_t brt_decref_free_data_later; @@ -282,14 +278,11 @@ typedef struct brt_stats { } brt_stats_t; static brt_stats_t brt_stats = { - { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, - { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, - { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, { "decref_free_data_later", KSTAT_DATA_UINT64 }, @@ -298,14 +291,11 @@ static brt_stats_t brt_stats = { }; struct { - wmsum_t brt_addref_entry_in_memory; wmsum_t brt_addref_entry_not_on_disk; wmsum_t brt_addref_entry_on_disk; - wmsum_t brt_addref_entry_read_lost_race; wmsum_t brt_decref_entry_in_memory; wmsum_t brt_decref_entry_loaded_from_disk; wmsum_t brt_decref_entry_not_in_memory; - wmsum_t brt_decref_entry_not_on_disk; wmsum_t brt_decref_entry_read_lost_race; wmsum_t brt_decref_entry_still_referenced; wmsum_t brt_decref_free_data_later; @@ -316,24 +306,24 @@ struct { #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) static int brt_entry_compare(const void *x1, const void *x2); -static int brt_pending_entry_compare(const void *x1, const void *x2); +static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); static void -brt_rlock(brt_t *brt) +brt_rlock(spa_t *spa) { - rw_enter(&brt->brt_lock, RW_READER); + rw_enter(&spa->spa_brt_lock, RW_READER); } static void -brt_wlock(brt_t *brt) +brt_wlock(spa_t *spa) { - rw_enter(&brt->brt_lock, RW_WRITER); + rw_enter(&spa->spa_brt_lock, RW_WRITER); } static void -brt_unlock(brt_t *brt) +brt_unlock(spa_t *spa) { - rw_exit(&brt->brt_lock); + rw_exit(&spa->spa_brt_lock); } static uint16_t @@ -394,14 +384,15 @@ brt_vdev_dump(brt_vdev_t *brtvd) { uint64_t idx; + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " - "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", (u_longlong_t)brtvd->bv_vdevid, brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, (u_longlong_t)brtvd->bv_size, (u_longlong_t)brtvd->bv_totalcount, - (u_longlong_t)brtvd->bv_nblocks, - (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + (u_longlong_t)nblocks, + (size_t)BT_SIZEOFMAP(nblocks)); if (brtvd->bv_totalcount > 0) { zfs_dbgmsg(" entcounts:"); for (idx = 0; idx < brtvd->bv_size; idx++) { @@ -415,51 +406,56 @@ brt_vdev_dump(brt_vdev_t *brtvd) if (brtvd->bv_entcount_dirty) { char *bitmap; - bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); - for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); + for (idx = 0; idx < nblocks; idx++) { bitmap[idx] = BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } bitmap[idx] = '\0'; zfs_dbgmsg(" dirty: %s", bitmap); - kmem_free(bitmap, brtvd->bv_nblocks + 1); + kmem_free(bitmap, nblocks + 1); } } #endif static brt_vdev_t * -brt_vdev(brt_t *brt, uint64_t vdevid) +brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) { - brt_vdev_t *brtvd; + brt_vdev_t *brtvd = NULL; - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - - if (vdevid < brt->brt_nvdevs) { - brtvd = &brt->brt_vdevs[vdevid]; - } else { - brtvd = NULL; + brt_rlock(spa); + if (vdevid < spa->spa_brt_nvdevs) { + brtvd = spa->spa_brt_vdevs[vdevid]; + } else if (alloc) { + /* New VDEV was added. */ + brt_unlock(spa); + brt_wlock(spa); + if (vdevid >= spa->spa_brt_nvdevs) + brt_vdevs_expand(spa, vdevid + 1); + brtvd = spa->spa_brt_vdevs[vdevid]; } - + brt_unlock(spa); return (brtvd); } static void -brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); ASSERT0(brtvd->bv_mos_brtvdev); ASSERT0(brtvd->bv_mos_entries); - ASSERT(brtvd->bv_entcount != NULL); - ASSERT(brtvd->bv_size > 0); - ASSERT(brtvd->bv_bitmap != NULL); - ASSERT(brtvd->bv_nblocks > 0); - brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, + uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); - VERIFY(brtvd->bv_mos_entries != 0); + VERIFY(mos_entries != 0); + VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, + &brtvd->bv_mos_entries_dnode)); + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); + brtvd->bv_mos_entries = mos_entries; + rw_exit(&brtvd->bv_mos_entries_lock); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -468,7 +464,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) * We will keep array size (bv_size) and cummulative count for all * bv_entcount[]s (bv_totalcount) in the bonus buffer. */ - brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, + brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); VERIFY(brtvd->bv_mos_brtvdev != 0); @@ -477,27 +473,27 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); - VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); - spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void -brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) +brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) { vdev_t *vd; uint16_t *entcount; ulong_t *bitmap; - uint64_t nblocks, size; + uint64_t nblocks, onblocks, size; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); - spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); - vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); - size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; - spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, brtvd->bv_vdevid); + size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; + spa_config_exit(spa, SCL_VDEV, FTAG); entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); @@ -505,38 +501,33 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) if (!brtvd->bv_initiated) { ASSERT0(brtvd->bv_size); - ASSERT(brtvd->bv_entcount == NULL); - ASSERT(brtvd->bv_bitmap == NULL); - ASSERT0(brtvd->bv_nblocks); - - avl_create(&brtvd->bv_tree, brt_entry_compare, - sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + ASSERT0P(brtvd->bv_entcount); + ASSERT0P(brtvd->bv_bitmap); } else { ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_bitmap != NULL); - ASSERT(brtvd->bv_nblocks > 0); /* * TODO: Allow vdev shrinking. We only need to implement * shrinking the on-disk BRT VDEV object. - * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, - * size, tx); + * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + * offset, size, tx); */ ASSERT3U(brtvd->bv_size, <=, size); memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); - memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), - BT_SIZEOFMAP(brtvd->bv_nblocks))); vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); - kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), + BT_SIZEOFMAP(onblocks))); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); } brtvd->bv_size = size; brtvd->bv_entcount = entcount; brtvd->bv_bitmap = bitmap; - brtvd->bv_nblocks = nblocks; if (!brtvd->bv_initiated) { brtvd->bv_need_byteswap = FALSE; brtvd->bv_initiated = TRUE; @@ -545,36 +536,29 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) } } -static void -brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) +static int +brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) { - char name[64]; dmu_buf_t *db; brt_vdev_phys_t *bvphys; int error; - snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, - (u_longlong_t)brtvd->bv_vdevid); - error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); - if (error != 0) - return; + ASSERT(!brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); - error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); - ASSERT0(error); + error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + FTAG, &db); if (error != 0) - return; + return (error); bvphys = db->db_data; - if (brt->brt_rangesize == 0) { - brt->brt_rangesize = bvphys->bvp_rangesize; + if (spa->spa_brt_rangesize == 0) { + spa->spa_brt_rangesize = bvphys->bvp_rangesize; } else { - ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); + ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); } - ASSERT(!brtvd->bv_initiated); - brt_vdev_realloc(brt, brtvd); + brt_vdev_realloc(spa, brtvd); /* TODO: We don't support VDEV shrinking. */ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); @@ -582,163 +566,176 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) /* * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. */ - error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), brtvd->bv_entcount, DMU_READ_NO_PREFETCH); - ASSERT0(error); + if (error != 0) + return (error); + ASSERT(bvphys->bvp_mos_entries != 0); + VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, + &brtvd->bv_mos_entries_dnode)); + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; - ASSERT(brtvd->bv_mos_entries != 0); + rw_exit(&brtvd->bv_mos_entries_lock); brtvd->bv_need_byteswap = (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); brtvd->bv_totalcount = bvphys->bvp_totalcount; brtvd->bv_usedspace = bvphys->bvp_usedspace; brtvd->bv_savedspace = bvphys->bvp_savedspace; - brt->brt_usedspace += brtvd->bv_usedspace; - brt->brt_savedspace += brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); - BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", - name, (u_longlong_t)brtvd->bv_mos_brtvdev, + BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu", + (u_longlong_t)brtvd->bv_vdevid, + (u_longlong_t)brtvd->bv_mos_brtvdev, (u_longlong_t)brtvd->bv_mos_entries); + return (0); } static void -brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) +brt_vdev_dealloc(brt_vdev_t *brtvd) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd->bv_initiated); + ASSERT0(avl_numnodes(&brtvd->bv_tree)); vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; - kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); brtvd->bv_bitmap = NULL; - ASSERT0(avl_numnodes(&brtvd->bv_tree)); - avl_destroy(&brtvd->bv_tree); brtvd->bv_size = 0; - brtvd->bv_nblocks = 0; brtvd->bv_initiated = FALSE; BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); } static void -brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; uint64_t count; - dmu_buf_t *db; - brt_vdev_phys_t *bvphys; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(brtvd->bv_mos_entries != 0); + ASSERT0(brtvd->bv_totalcount); + ASSERT0(brtvd->bv_usedspace); + ASSERT0(brtvd->bv_savedspace); - VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); - VERIFY0(count); - VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); - BRT_DEBUG("MOS entries destroyed, object=%llu", - (u_longlong_t)brtvd->bv_mos_entries); + uint64_t mos_entries = brtvd->bv_mos_entries; + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = 0; + rw_exit(&brtvd->bv_mos_entries_lock); + dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); + brtvd->bv_mos_entries_dnode = NULL; + ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); + ASSERT0(count); + VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); + BRT_DEBUG("MOS entries destroyed, object=%llu", + (u_longlong_t)mos_entries); - VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); - bvphys = db->db_data; - ASSERT0(bvphys->bvp_totalcount); - ASSERT0(bvphys->bvp_usedspace); - ASSERT0(bvphys->bvp_savedspace); - dmu_buf_rele(db, FTAG); - - VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); + VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + tx)); BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); brtvd->bv_mos_brtvdev = 0; + brtvd->bv_entcount_dirty = FALSE; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); - VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, tx)); BRT_DEBUG("Pool directory object removed, object=%s", name); - brt_vdev_dealloc(brt, brtvd); + brtvd->bv_meta_dirty = FALSE; - spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); + rw_enter(&brtvd->bv_lock, RW_WRITER); + brt_vdev_dealloc(brtvd); + rw_exit(&brtvd->bv_lock); + + spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void -brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) +brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) { - brt_vdev_t *brtvd, *vdevs; - uint64_t vdevid; + brt_vdev_t **vdevs; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT3U(nvdevs, >, brt->brt_nvdevs); + ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); + ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs); - vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); - if (brt->brt_nvdevs > 0) { - ASSERT(brt->brt_vdevs != NULL); + if (nvdevs == spa->spa_brt_nvdevs) + return; - memcpy(vdevs, brt->brt_vdevs, - sizeof (brt_vdev_t) * brt->brt_nvdevs); - kmem_free(brt->brt_vdevs, - sizeof (brt_vdev_t) * brt->brt_nvdevs); + vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); + if (spa->spa_brt_nvdevs > 0) { + ASSERT(spa->spa_brt_vdevs != NULL); + + memcpy(vdevs, spa->spa_brt_vdevs, + sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); + kmem_free(spa->spa_brt_vdevs, + sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); } - for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { - brtvd = &vdevs[vdevid]; + spa->spa_brt_vdevs = vdevs; + for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { + brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); + rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); brtvd->bv_vdevid = vdevid; brtvd->bv_initiated = FALSE; + rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); + avl_create(&brtvd->bv_tree, brt_entry_compare, + sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + for (int i = 0; i < TXG_SIZE; i++) { + avl_create(&brtvd->bv_pending_tree[i], + brt_entry_compare, sizeof (brt_entry_t), + offsetof(brt_entry_t, bre_node)); + } + mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); + spa->spa_brt_vdevs[vdevid] = brtvd; } BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", - (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); - - brt->brt_vdevs = vdevs; - brt->brt_nvdevs = nvdevs; + (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); + spa->spa_brt_nvdevs = nvdevs; } static boolean_t -brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) +brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset) { - uint64_t idx; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - - idx = bre->bre_offset / brt->brt_rangesize; - if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { + uint64_t idx = offset / spa->spa_brt_rangesize; + if (idx < brtvd->bv_size) { /* VDEV wasn't expanded. */ return (brt_vdev_entcount_get(brtvd, idx) > 0); } - return (FALSE); } static void -brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, - uint64_t dsize) +brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize, uint64_t count) { uint64_t idx; - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd != NULL); - ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_initiated); - brt->brt_savedspace += dsize; - brtvd->bv_savedspace += dsize; + brtvd->bv_savedspace += dsize * count; brtvd->bv_meta_dirty = TRUE; - if (bre->bre_refcount > 1) { + if (bre->bre_count > 0) return; - } - brt->brt_usedspace += dsize; brtvd->bv_usedspace += dsize; - idx = bre->bre_offset / brt->brt_rangesize; + idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; if (idx >= brtvd->bv_size) { /* VDEV has been expanded. */ - brt_vdev_realloc(brt, brtvd); + rw_enter(&brtvd->bv_lock, RW_WRITER); + brt_vdev_realloc(spa, brtvd); + rw_exit(&brtvd->bv_lock); } ASSERT3U(idx, <, brtvd->bv_size); @@ -748,35 +745,26 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); - -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_BRT) - brt_vdev_dump(brtvd); -#endif } static void -brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, +brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd != NULL); - ASSERT(brtvd->bv_entcount != NULL); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); + ASSERT(brtvd->bv_initiated); - brt->brt_savedspace -= dsize; brtvd->bv_savedspace -= dsize; brtvd->bv_meta_dirty = TRUE; - if (bre->bre_refcount > 0) { + if (bre->bre_count > 0) return; - } - brt->brt_usedspace -= dsize; brtvd->bv_usedspace -= dsize; - idx = bre->bre_offset / brt->brt_rangesize; + idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; ASSERT3U(idx, <, brtvd->bv_size); ASSERT(brtvd->bv_totalcount > 0); @@ -785,15 +773,10 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); - -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_BRT) - brt_vdev_dump(brtvd); -#endif } static void -brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; @@ -802,16 +785,18 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(dmu_tx_is_syncing(tx)); - VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + FTAG, &db)); if (brtvd->bv_entcount_dirty) { /* * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. */ - dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), brtvd->bv_entcount, tx); - memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); brtvd->bv_entcount_dirty = FALSE; } @@ -825,7 +810,7 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; } bvphys->bvp_totalcount = brtvd->bv_totalcount; - bvphys->bvp_rangesize = brt->brt_rangesize; + bvphys->bvp_rangesize = spa->spa_brt_rangesize; bvphys->bvp_usedspace = brtvd->bv_usedspace; bvphys->bvp_savedspace = brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); @@ -834,114 +819,48 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) } static void -brt_vdevs_alloc(brt_t *brt, boolean_t load) +brt_vdevs_free(spa_t *spa) { - brt_vdev_t *brtvd; - uint64_t vdevid; - - brt_wlock(brt); - - brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); - - if (load) { - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; - ASSERT(brtvd->bv_entcount == NULL); - - brt_vdev_load(brt, brtvd); - } - } - - if (brt->brt_rangesize == 0) { - brt->brt_rangesize = BRT_RANGESIZE; - } - - brt_unlock(brt); -} - -static void -brt_vdevs_free(brt_t *brt) -{ - brt_vdev_t *brtvd; - uint64_t vdevid; - - brt_wlock(brt); - - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; + if (spa->spa_brt_vdevs == 0) + return; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + rw_enter(&brtvd->bv_lock, RW_WRITER); if (brtvd->bv_initiated) - brt_vdev_dealloc(brt, brtvd); + brt_vdev_dealloc(brtvd); + rw_exit(&brtvd->bv_lock); + rw_destroy(&brtvd->bv_lock); + if (brtvd->bv_mos_entries != 0) + dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); + rw_destroy(&brtvd->bv_mos_entries_lock); + avl_destroy(&brtvd->bv_tree); + for (int i = 0; i < TXG_SIZE; i++) + avl_destroy(&brtvd->bv_pending_tree[i]); + mutex_destroy(&brtvd->bv_pending_lock); + kmem_free(brtvd, sizeof (*brtvd)); } - kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); - - brt_unlock(brt); + kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * + spa->spa_brt_nvdevs); } static void brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) { - bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); - bre->bre_refcount = 0; + bre->bre_bp = *bp; + bre->bre_count = 0; + bre->bre_pcount = 0; *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); } static int -brt_entry_compare(const void *x1, const void *x2) +brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) { - const brt_entry_t *bre1 = x1; - const brt_entry_t *bre2 = x2; + uint64_t off = BRE_OFFSET(bre); - return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); -} - -static int -brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) -{ - uint64_t mos_entries; - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - - if (!brt_vdev_lookup(brt, brtvd, bre)) - return (SET_ERROR(ENOENT)); - - /* - * Remember mos_entries object number. After we reacquire the BRT lock, - * the brtvd pointer may be invalid. - */ - mos_entries = brtvd->bv_mos_entries; - if (mos_entries == 0) - return (SET_ERROR(ENOENT)); - - brt_unlock(brt); - - error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); - - brt_wlock(brt); - - return (error); -} - -static void -brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) -{ - brt_vdev_t *brtvd; - uint64_t mos_entries = 0; - - brt_rlock(brt); - brtvd = brt_vdev(brt, vdevid); - if (brtvd != NULL) - mos_entries = brtvd->bv_mos_entries; - brt_unlock(brt); - - if (mos_entries == 0) - return; - - (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); + return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); } /* @@ -952,72 +871,66 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; - brt_entry_t bre_search; - boolean_t mayexists = FALSE; - uint64_t vdevid; - brt_entry_fill(bp, &bre_search, &vdevid); + if (spa->spa_brt_nvdevs == 0) + return (B_FALSE); - brt_rlock(brt); + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); + if (brtvd == NULL || !brtvd->bv_initiated) + return (FALSE); - brtvd = brt_vdev(brt, vdevid); - if (brtvd != NULL && brtvd->bv_initiated) { - if (!avl_is_empty(&brtvd->bv_tree) || - brt_vdev_lookup(brt, brtvd, &bre_search)) { - mayexists = TRUE; - } - } - - brt_unlock(brt); - - return (mayexists); + /* + * We don't need locks here, since bv_entcount pointer must be + * stable at this point, and we don't care about false positive + * races here, while false negative should be impossible, since + * all brt_vdev_addref() have already completed by this point. + */ + uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); + return (brt_vdev_lookup(spa, brtvd, off)); } uint64_t brt_get_dspace(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_nvdevs == 0) return (0); - return (brt->brt_savedspace); + brt_rlock(spa); + uint64_t s = 0; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) + s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; + brt_unlock(spa); + return (s); } uint64_t brt_get_used(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_nvdevs == 0) return (0); - return (brt->brt_usedspace); + brt_rlock(spa); + uint64_t s = 0; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) + s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; + brt_unlock(spa); + return (s); } uint64_t brt_get_saved(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) - return (0); - - return (brt->brt_savedspace); + return (brt_get_dspace(spa)); } uint64_t brt_get_ratio(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt->brt_usedspace == 0) + uint64_t used = brt_get_used(spa); + if (used == 0) return (100); - - return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / - brt->brt_usedspace); + return ((used + brt_get_saved(spa)) * 100 / used); } static int @@ -1028,22 +941,16 @@ brt_kstats_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) return (EACCES); - bs->brt_addref_entry_in_memory.value.ui64 = - wmsum_value(&brt_sums.brt_addref_entry_in_memory); bs->brt_addref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); bs->brt_addref_entry_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_on_disk); - bs->brt_addref_entry_read_lost_race.value.ui64 = - wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); bs->brt_decref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_in_memory); bs->brt_decref_entry_loaded_from_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); bs->brt_decref_entry_not_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); - bs->brt_decref_entry_not_on_disk.value.ui64 = - wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); bs->brt_decref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); bs->brt_decref_entry_still_referenced.value.ui64 = @@ -1062,14 +969,11 @@ static void brt_stat_init(void) { - wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); - wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); - wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); wmsum_init(&brt_sums.brt_decref_free_data_later, 0); @@ -1093,14 +997,11 @@ brt_stat_fini(void) brt_ksp = NULL; } - wmsum_fini(&brt_sums.brt_addref_entry_in_memory); wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_on_disk); - wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); - wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); wmsum_fini(&brt_sums.brt_decref_free_data_later); @@ -1113,8 +1014,6 @@ brt_init(void) { brt_entry_cache = kmem_cache_create("brt_entry_cache", sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", - sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_stat_init(); } @@ -1125,105 +1024,12 @@ brt_fini(void) brt_stat_fini(); kmem_cache_destroy(brt_entry_cache); - kmem_cache_destroy(brt_pending_entry_cache); -} - -static brt_entry_t * -brt_entry_alloc(const brt_entry_t *bre_init) -{ - brt_entry_t *bre; - - bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); - bre->bre_offset = bre_init->bre_offset; - bre->bre_refcount = bre_init->bre_refcount; - - return (bre); -} - -static void -brt_entry_free(brt_entry_t *bre) -{ - - kmem_cache_free(brt_entry_cache, bre); -} - -static void -brt_entry_addref(brt_t *brt, const blkptr_t *bp) -{ - brt_vdev_t *brtvd; - brt_entry_t *bre, *racebre; - brt_entry_t bre_search; - avl_index_t where; - uint64_t vdevid; - int error; - - ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); - - brt_entry_fill(bp, &bre_search, &vdevid); - - brt_wlock(brt); - - brtvd = brt_vdev(brt, vdevid); - if (brtvd == NULL) { - ASSERT3U(vdevid, >=, brt->brt_nvdevs); - - /* New VDEV was added. */ - brt_vdevs_expand(brt, vdevid + 1); - brtvd = brt_vdev(brt, vdevid); - } - ASSERT(brtvd != NULL); - if (!brtvd->bv_initiated) - brt_vdev_realloc(brt, brtvd); - - bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); - if (bre != NULL) { - BRTSTAT_BUMP(brt_addref_entry_in_memory); - } else { - /* - * brt_entry_lookup() may drop the BRT (read) lock and - * reacquire it (write). - */ - error = brt_entry_lookup(brt, brtvd, &bre_search); - /* bre_search now contains correct bre_refcount */ - ASSERT(error == 0 || error == ENOENT); - if (error == 0) - BRTSTAT_BUMP(brt_addref_entry_on_disk); - else - BRTSTAT_BUMP(brt_addref_entry_not_on_disk); - /* - * When the BRT lock was dropped, brt_vdevs[] may have been - * expanded and reallocated, we need to update brtvd's pointer. - */ - brtvd = brt_vdev(brt, vdevid); - ASSERT(brtvd != NULL); - - racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); - if (racebre == NULL) { - bre = brt_entry_alloc(&bre_search); - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - avl_insert(&brtvd->bv_tree, bre, where); - brt->brt_nentries++; - } else { - /* - * The entry was added when the BRT lock was dropped in - * brt_entry_lookup(). - */ - BRTSTAT_BUMP(brt_addref_entry_read_lost_race); - bre = racebre; - } - } - bre->bre_refcount++; - brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); - - brt_unlock(brt); } /* Return TRUE if block should be freed immediately. */ boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; @@ -1232,11 +1038,11 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) brt_entry_fill(bp, &bre_search, &vdevid); - brt_wlock(brt); - - brtvd = brt_vdev(brt, vdevid); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); + rw_enter(&brtvd->bv_lock, RW_WRITER); + ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_decref_entry_in_memory); @@ -1244,67 +1050,49 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) } else { BRTSTAT_BUMP(brt_decref_entry_not_in_memory); } + rw_exit(&brtvd->bv_lock); - /* - * brt_entry_lookup() may drop the BRT lock and reacquire it. - */ - error = brt_entry_lookup(brt, brtvd, &bre_search); - /* bre_search now contains correct bre_refcount */ - ASSERT(error == 0 || error == ENOENT); - /* - * When the BRT lock was dropped, brt_vdevs[] may have been expanded - * and reallocated, we need to update brtvd's pointer. - */ - brtvd = brt_vdev(brt, vdevid); - ASSERT(brtvd != NULL); - + error = brt_entry_lookup(brtvd, &bre_search); + /* bre_search now contains correct bre_count */ if (error == ENOENT) { - BRTSTAT_BUMP(brt_decref_entry_not_on_disk); - bre = NULL; - goto out; + BRTSTAT_BUMP(brt_decref_no_entry); + return (B_TRUE); } + ASSERT0(error); + rw_enter(&brtvd->bv_lock, RW_WRITER); racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre != NULL) { - /* - * The entry was added when the BRT lock was dropped in - * brt_entry_lookup(). - */ + /* The entry was added when the lock was dropped. */ BRTSTAT_BUMP(brt_decref_entry_read_lost_race); bre = racebre; goto out; } BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); - bre = brt_entry_alloc(&bre_search); - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); + bre->bre_bp = bre_search.bre_bp; + bre->bre_count = bre_search.bre_count; + bre->bre_pcount = 0; avl_insert(&brtvd->bv_tree, bre, where); - brt->brt_nentries++; out: - if (bre == NULL) { - /* - * This is a free of a regular (not cloned) block. - */ - brt_unlock(brt); - BRTSTAT_BUMP(brt_decref_no_entry); - return (B_TRUE); - } - if (bre->bre_refcount == 0) { - brt_unlock(brt); + if (bre->bre_count == 0) { + rw_exit(&brtvd->bv_lock); BRTSTAT_BUMP(brt_decref_free_data_now); return (B_TRUE); } - ASSERT(bre->bre_refcount > 0); - bre->bre_refcount--; - if (bre->bre_refcount == 0) + bre->bre_pcount--; + ASSERT(bre->bre_count > 0); + bre->bre_count--; + if (bre->bre_count == 0) BRTSTAT_BUMP(brt_decref_free_data_later); else BRTSTAT_BUMP(brt_decref_entry_still_referenced); - brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); - brt_unlock(brt); + rw_exit(&brtvd->bv_lock); return (B_FALSE); } @@ -1312,222 +1100,259 @@ out: uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; brt_entry_t bre_search, *bre; uint64_t vdevid, refcnt; int error; brt_entry_fill(bp, &bre_search, &vdevid); - brt_rlock(brt); - - brtvd = brt_vdev(brt, vdevid); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); + rw_enter(&brtvd->bv_lock, RW_READER); + ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { - error = brt_entry_lookup(brt, brtvd, &bre_search); - ASSERT(error == 0 || error == ENOENT); - if (error == ENOENT) + rw_exit(&brtvd->bv_lock); + error = brt_entry_lookup(brtvd, &bre_search); + if (error == ENOENT) { refcnt = 0; - else - refcnt = bre_search.bre_refcount; - } else - refcnt = bre->bre_refcount; + } else { + ASSERT0(error); + refcnt = bre_search.bre_count; + } + } else { + refcnt = bre->bre_count; + rw_exit(&brtvd->bv_lock); + } - brt_unlock(brt); return (refcnt); } static void -brt_prefetch(brt_t *brt, const blkptr_t *bp) +brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) { - brt_entry_t bre; - uint64_t vdevid; - - ASSERT(bp != NULL); - - if (!brt_zap_prefetch) + if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) return; - brt_entry_fill(bp, &bre, &vdevid); - - brt_entry_prefetch(brt, vdevid, &bre); + uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); + rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); + if (brtvd->bv_mos_entries != 0) { + (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS); + } + rw_exit(&brtvd->bv_mos_entries_lock); } static int -brt_pending_entry_compare(const void *x1, const void *x2) +brt_entry_compare(const void *x1, const void *x2) { - const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; - const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; - int cmp; + const brt_entry_t *bre1 = x1, *bre2 = x2; + const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp; - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); - if (unlikely(cmp == 0)) { - cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); - } - } - - return (cmp); + return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0]))); } void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { - brt_t *brt; - avl_tree_t *pending_tree; - kmutex_t *pending_lock; - brt_pending_entry_t *bpe, *newbpe; + brt_entry_t *bre, *newbre; avl_index_t where; uint64_t txg; - brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); - newbpe->bpe_bp = *bp; - newbpe->bpe_count = 1; + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); + avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; - mutex_enter(pending_lock); + newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); + newbre->bre_bp = *bp; + newbre->bre_count = 0; + newbre->bre_pcount = 1; - bpe = avl_find(pending_tree, newbpe, &where); - if (bpe == NULL) { - avl_insert(pending_tree, newbpe, where); - newbpe = NULL; + mutex_enter(&brtvd->bv_pending_lock); + bre = avl_find(pending_tree, newbre, &where); + if (bre == NULL) { + avl_insert(pending_tree, newbre, where); + newbre = NULL; } else { - bpe->bpe_count++; + bre->bre_pcount++; } + mutex_exit(&brtvd->bv_pending_lock); - mutex_exit(pending_lock); - - if (newbpe != NULL) { - ASSERT(bpe != NULL); - ASSERT(bpe != newbpe); - kmem_cache_free(brt_pending_entry_cache, newbpe); + if (newbre != NULL) { + ASSERT(bre != NULL); + ASSERT(bre != newbre); + kmem_cache_free(brt_entry_cache, newbre); } else { - ASSERT(bpe == NULL); + ASSERT0P(bre); /* Prefetch BRT entry for the syncing context. */ - brt_prefetch(brt, bp); + brt_prefetch(brtvd, bp); } } void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { - brt_t *brt; - avl_tree_t *pending_tree; - kmutex_t *pending_lock; - brt_pending_entry_t *bpe, bpe_search; + brt_entry_t *bre, bre_search; uint64_t txg; - brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - bpe_search.bpe_bp = *bp; + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); + ASSERT(brtvd != NULL); + avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; - mutex_enter(pending_lock); + bre_search.bre_bp = *bp; - bpe = avl_find(pending_tree, &bpe_search, NULL); - /* I believe we should always find bpe when this function is called. */ - if (bpe != NULL) { - ASSERT(bpe->bpe_count > 0); + mutex_enter(&brtvd->bv_pending_lock); + bre = avl_find(pending_tree, &bre_search, NULL); + ASSERT(bre != NULL); + ASSERT(bre->bre_pcount > 0); + bre->bre_pcount--; + if (bre->bre_pcount == 0) + avl_remove(pending_tree, bre); + else + bre = NULL; + mutex_exit(&brtvd->bv_pending_lock); - bpe->bpe_count--; - if (bpe->bpe_count == 0) { - avl_remove(pending_tree, bpe); - kmem_cache_free(brt_pending_entry_cache, bpe); + if (bre) + kmem_cache_free(brt_entry_cache, bre); +} + +static void +brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) +{ + brt_entry_t *bre, *nbre; + + /* + * We are in syncing context, so no other bv_pending_tree accesses + * are possible for the TXG. So we don't need bv_pending_lock. + */ + ASSERT(avl_is_empty(&brtvd->bv_tree)); + avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]); + + for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) { + nbre = AVL_NEXT(&brtvd->bv_tree, bre); + + /* + * If the block has DEDUP bit set, it means that it + * already exists in the DEDUP table, so we can just + * use that instead of creating new entry in the BRT. + */ + if (BP_GET_DEDUP(&bre->bre_bp)) { + while (bre->bre_pcount > 0) { + if (!ddt_addref(spa, &bre->bre_bp)) + break; + bre->bre_pcount--; + } + if (bre->bre_pcount == 0) { + avl_remove(&brtvd->bv_tree, bre); + kmem_cache_free(brt_entry_cache, bre); + continue; + } + } + + /* + * Unless we know that the block is definitely not in ZAP, + * try to get its reference count from there. + */ + uint64_t off = BRE_OFFSET(bre); + if (brtvd->bv_mos_entries != 0 && + brt_vdev_lookup(spa, brtvd, off)) { + int error = zap_lookup_uint64_by_dnode( + brtvd->bv_mos_entries_dnode, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count); + if (error == 0) { + BRTSTAT_BUMP(brt_addref_entry_on_disk); + } else { + ASSERT3U(error, ==, ENOENT); + BRTSTAT_BUMP(brt_addref_entry_not_on_disk); + } } } - mutex_exit(pending_lock); + /* + * If all the cloned blocks we had were handled by DDT, we don't need + * to initiate the vdev. + */ + if (avl_is_empty(&brtvd->bv_tree)) + return; + + if (!brtvd->bv_initiated) { + rw_enter(&brtvd->bv_lock, RW_WRITER); + brt_vdev_realloc(spa, brtvd); + rw_exit(&brtvd->bv_lock); + } + + /* + * Convert pending references into proper ones. This has to be a + * separate loop, since entcount modifications would cause false + * positives for brt_vdev_lookup() on following iterations. + */ + for (bre = avl_first(&brtvd->bv_tree); bre; + bre = AVL_NEXT(&brtvd->bv_tree, bre)) { + brt_vdev_addref(spa, brtvd, bre, + bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount); + bre->bre_count += bre->bre_pcount; + } } void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt = spa->spa_brt; - brt_pending_entry_t *bpe; - avl_tree_t *pending_tree; - void *c; - ASSERT3U(txg, !=, 0); + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); - /* - * We are in syncing context, so no other brt_pending_tree accesses - * are possible for the TXG. Don't need to acquire brt_pending_lock. - */ - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + brt_pending_apply_vdev(spa, brtvd, txg); - c = NULL; - while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { - boolean_t added_to_ddt; - - for (int i = 0; i < bpe->bpe_count; i++) { - /* - * If the block has DEDUP bit set, it means that it - * already exists in the DEDUP table, so we can just - * use that instead of creating new entry in - * the BRT table. - */ - if (BP_GET_DEDUP(&bpe->bpe_bp)) { - added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); - } else { - added_to_ddt = B_FALSE; - } - if (!added_to_ddt) - brt_entry_addref(brt, &bpe->bpe_bp); - } - - kmem_cache_free(brt_pending_entry_cache, bpe); + brt_rlock(spa); } + brt_unlock(spa); } static void brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - if (bre->bre_refcount == 0) { - int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + uint64_t off = BRE_OFFSET(bre); + + if (bre->bre_pcount == 0) { + /* The net change is zero, nothing to do in ZAP. */ + } else if (bre->bre_count == 0) { + int error = zap_remove_uint64_by_dnode(dn, &off, BRT_KEY_WORDS, tx); VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, - BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), - &bre->bre_refcount, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count, tx)); } } static void -brt_sync_table(brt_t *brt, dmu_tx_t *tx) +brt_sync_table(spa_t *spa, dmu_tx_t *tx) { - brt_vdev_t *brtvd; brt_entry_t *bre; - dnode_t *dn; - uint64_t vdevid; - void *c; - brt_wlock(brt); - - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; - - if (!brtvd->bv_initiated) - continue; + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); if (!brtvd->bv_meta_dirty) { ASSERT(!brtvd->bv_entcount_dirty); ASSERT0(avl_numnodes(&brtvd->bv_tree)); + brt_rlock(spa); continue; } @@ -1535,132 +1360,117 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) avl_numnodes(&brtvd->bv_tree) != 0); if (brtvd->bv_mos_brtvdev == 0) - brt_vdev_create(brt, brtvd, tx); + brt_vdev_create(spa, brtvd, tx); - VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, - FTAG, &dn)); - - c = NULL; + void *c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(dn, bre, tx); - brt_entry_free(bre); - ASSERT(brt->brt_nentries > 0); - brt->brt_nentries--; + brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); + kmem_cache_free(brt_entry_cache, bre); } - dnode_rele(dn, FTAG); - - brt_vdev_sync(brt, brtvd, tx); - +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); +#endif if (brtvd->bv_totalcount == 0) - brt_vdev_destroy(brt, brtvd, tx); + brt_vdev_destroy(spa, brtvd, tx); + else + brt_vdev_sync(spa, brtvd, tx); + brt_rlock(spa); } - - ASSERT0(brt->brt_nentries); - - brt_unlock(brt); + brt_unlock(spa); } void brt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; - brt_t *brt; + uint64_t vdevid; - ASSERT(spa_syncing_txg(spa) == txg); + ASSERT3U(spa_syncing_txg(spa), ==, txg); - brt = spa->spa_brt; - brt_rlock(brt); - if (brt->brt_nentries == 0) { - /* No changes. */ - brt_unlock(brt); + brt_rlock(spa); + for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) + break; + } + if (vdevid >= spa->spa_brt_nvdevs) { + brt_unlock(spa); return; } - brt_unlock(brt); + brt_unlock(spa); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - brt_sync_table(brt, tx); - + brt_sync_table(spa, tx); dmu_tx_commit(tx); } -static void -brt_table_alloc(brt_t *brt) -{ - - for (int i = 0; i < TXG_SIZE; i++) { - avl_create(&brt->brt_pending_tree[i], - brt_pending_entry_compare, - sizeof (brt_pending_entry_t), - offsetof(brt_pending_entry_t, bpe_node)); - mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, - NULL); - } -} - -static void -brt_table_free(brt_t *brt) -{ - - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); - - avl_destroy(&brt->brt_pending_tree[i]); - mutex_destroy(&brt->brt_pending_lock[i]); - } -} - static void brt_alloc(spa_t *spa) { - brt_t *brt; - - ASSERT(spa->spa_brt == NULL); - - brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); - rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); - brt->brt_spa = spa; - brt->brt_rangesize = 0; - brt->brt_nentries = 0; - brt->brt_vdevs = NULL; - brt->brt_nvdevs = 0; - brt_table_alloc(brt); - - spa->spa_brt = brt; + rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); + spa->spa_brt_vdevs = NULL; + spa->spa_brt_nvdevs = 0; + spa->spa_brt_rangesize = 0; } void brt_create(spa_t *spa) { - brt_alloc(spa); - brt_vdevs_alloc(spa->spa_brt, B_FALSE); + spa->spa_brt_rangesize = BRT_RANGESIZE; } int brt_load(spa_t *spa) { + int error = 0; brt_alloc(spa); - brt_vdevs_alloc(spa->spa_brt, B_TRUE); + brt_wlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children; + vdevid++) { + char name[64]; + uint64_t mos_brtvdev; - return (0); + /* Look if this vdev had active block cloning. */ + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)vdevid); + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, + &mos_brtvdev); + if (error == ENOENT) { + error = 0; + continue; + } + if (error != 0) + break; + + /* If it did, then allocate them all and load this one. */ + brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + rw_enter(&brtvd->bv_lock, RW_WRITER); + brtvd->bv_mos_brtvdev = mos_brtvdev; + error = brt_vdev_load(spa, brtvd); + rw_exit(&brtvd->bv_lock); + if (error != 0) + break; + } + + if (spa->spa_brt_rangesize == 0) + spa->spa_brt_rangesize = BRT_RANGESIZE; + brt_unlock(spa); + return (error); } void brt_unload(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_rangesize == 0) return; - - brt_vdevs_free(brt); - brt_table_free(brt); - rw_destroy(&brt->brt_lock); - kmem_free(brt, sizeof (*brt)); - spa->spa_brt = NULL; + brt_vdevs_free(spa); + rw_destroy(&spa->spa_brt_lock); + spa->spa_brt_rangesize = 0; } /* BEGIN CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index b1419d96f4ef..cbd07d19a7f9 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -89,7 +89,6 @@ typedef struct dbuf_stats { kstat_named_t hash_misses; kstat_named_t hash_collisions; kstat_named_t hash_elements; - kstat_named_t hash_elements_max; /* * Number of sublists containing more than one dbuf in the dbuf * hash table. Keep track of the longest hash chain. @@ -134,7 +133,6 @@ dbuf_stats_t dbuf_stats = { { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, @@ -154,6 +152,7 @@ struct { wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; + wmsum_t hash_elements; wmsum_t hash_chains; wmsum_t hash_insert_race; wmsum_t metadata_cache_count; @@ -432,8 +431,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); - uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); - DBUF_STAT_MAX(hash_elements_max, he); + DBUF_STAT_BUMP(hash_elements); return (NULL); } @@ -506,7 +504,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); + DBUF_STAT_BUMPDOWN(hash_elements); } typedef enum { @@ -903,6 +901,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) wmsum_value(&dbuf_sums.hash_misses); ds->hash_collisions.value.ui64 = wmsum_value(&dbuf_sums.hash_collisions); + ds->hash_elements.value.ui64 = + wmsum_value(&dbuf_sums.hash_elements); ds->hash_chains.value.ui64 = wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = @@ -1004,6 +1004,7 @@ dbuf_init(void) wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); + wmsum_init(&dbuf_sums.hash_elements, 0); wmsum_init(&dbuf_sums.hash_chains, 0); wmsum_init(&dbuf_sums.hash_insert_race, 0); wmsum_init(&dbuf_sums.metadata_cache_count, 0); @@ -1077,6 +1078,7 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); + wmsum_fini(&dbuf_sums.hash_elements); wmsum_fini(&dbuf_sums.hash_chains); wmsum_fini(&dbuf_sums.hash_insert_race); wmsum_fini(&dbuf_sums.metadata_cache_count); @@ -2578,8 +2580,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * We are freeing a block that we cloned in the same * transaction group. */ - brt_pending_remove(dmu_objset_spa(db->db_objset), - &dr->dt.dl.dr_overridden_by, tx); + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + brt_pending_remove(dmu_objset_spa(db->db_objset), + bp, tx); + } } dnode_t *dn = dr->dr_dnode; diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index 2248f644bee7..4712addf81be 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -68,6 +68,7 @@ #include #include #include +#include /* * The SPA supports block sizes up to 16MB. However, very large blocks @@ -289,8 +290,26 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); - dsl_free(tx->tx_pool, tx->tx_txg, bp); + /* + * Put blocks that would create IO on the pool's deadlist for + * dsl_process_async_destroys() to find. This is to prevent + * zio_free() from creating a ZIO_TYPE_FREE IO for them, which + * are very heavy and can lead to out-of-memory conditions if + * something tries to free millions of blocks on the same txg. + */ + boolean_t defer = spa_version(spa) >= SPA_VERSION_DEADLISTS && + (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || + brt_maybe_exists(spa, bp)); + + if (defer) { + dprintf_bp(bp, "putting on free list: %s", ""); + bpobj_enqueue(&ds->ds_dir->dd_pool->dp_free_bpobj, + bp, B_FALSE, tx); + } else { + dprintf_bp(bp, "freeing ds=%llu", + (u_longlong_t)ds->ds_object); + dsl_free(tx->tx_pool, tx->tx_txg, bp); + } mutex_enter(&ds->ds_lock); ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || @@ -298,9 +317,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, delta = parent_delta(ds, -used); dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); + dsl_dir_diduse_transfer_space(ds->ds_dir, delta, -compressed, -uncompressed, -used, DD_USED_REFRSRV, DD_USED_HEAD, tx); + + if (defer) + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, compressed, uncompressed, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 6b8c7ee93daa..bf3b0d143db4 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -2081,6 +2081,7 @@ spa_unload(spa_t *spa) vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); + l2arc_spa_rebuild_stop(spa); } } @@ -7115,6 +7116,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); + l2arc_spa_rebuild_stop(spa); /* * We want this to be reflected on every label, diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index f486513fcaf9..32542e7ce701 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -1870,13 +1870,7 @@ spa_get_slop_space(spa_t *spa) if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); - /* - * spa_get_dspace() includes the space only logically "used" by - * deduplicated data, so since it's not useful to reserve more - * space with more deduplicated data, we subtract that out here. - */ - space = - spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa); + space = spa->spa_rdspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* @@ -1912,8 +1906,7 @@ spa_get_checkpoint_space(spa_t *spa) void spa_update_dspace(spa_t *spa) { - spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); + spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa)); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that @@ -1933,9 +1926,11 @@ spa_update_dspace(spa_t *spa) * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ - ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace); - spa->spa_dspace -= spa->spa_nonallocating_dspace; + ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace); + spa->spa_rdspace -= spa->spa_nonallocating_dspace; } + spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) + + brt_get_dspace(spa); } /* diff --git a/sys/contrib/openzfs/module/zfs/zap_leaf.c b/sys/contrib/openzfs/module/zfs/zap_leaf.c index 032aca92695e..e396523a94b2 100644 --- a/sys/contrib/openzfs/module/zfs/zap_leaf.c +++ b/sys/contrib/openzfs/module/zfs/zap_leaf.c @@ -248,20 +248,63 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, return (chunk_head); } -static void -zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) +/* + * Non-destructively copy array between leaves. + */ +static uint16_t +zap_leaf_array_copy(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) { - uint16_t chunk = *chunkp; - - *chunkp = CHAIN_END; + uint16_t new_chunk; + uint16_t *nchunkp = &new_chunk; while (chunk != CHAIN_END) { - uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; - ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, - ZAP_CHUNK_ARRAY); - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + uint16_t nchunk = zap_leaf_chunk_alloc(nl); + + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + struct zap_leaf_array *nla = + &ZAP_LEAF_CHUNK(nl, nchunk).l_array; + ASSERT3U(la->la_type, ==, ZAP_CHUNK_ARRAY); + + *nla = *la; /* structure assignment */ + + chunk = la->la_next; + *nchunkp = nchunk; + nchunkp = &nla->la_next; } + *nchunkp = CHAIN_END; + return (new_chunk); +} + +/* + * Free array. Unlike trivial loop of zap_leaf_chunk_free() this does + * not reverse order of chunks in the free list, reducing fragmentation. + */ +static void +zap_leaf_array_free(zap_leaf_t *l, uint16_t chunk) +{ + struct zap_leaf_header *hdr = &zap_leaf_phys(l)->l_hdr; + uint16_t *tailp = &hdr->lh_freelist; + uint16_t oldfree = *tailp; + + while (chunk != CHAIN_END) { + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + zap_leaf_chunk_t *c = &ZAP_LEAF_CHUNK(l, chunk); + ASSERT3U(c->l_array.la_type, ==, ZAP_CHUNK_ARRAY); + + *tailp = chunk; + chunk = c->l_array.la_next; + + c->l_free.lf_type = ZAP_CHUNK_FREE; + memset(c->l_free.lf_pad, 0, sizeof (c->l_free.lf_pad)); + tailp = &c->l_free.lf_next; + + ASSERT3U(hdr->lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); + hdr->lh_nfree++; + } + + *tailp = oldfree; } /* array_len and buf_len are in integers, not bytes */ @@ -515,7 +558,7 @@ zap_entry_update(zap_entry_handle_t *zeh, if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) return (SET_ERROR(EAGAIN)); - zap_leaf_array_free(l, &le->le_value_chunk); + zap_leaf_array_free(l, le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; @@ -534,10 +577,11 @@ zap_entry_remove(zap_entry_handle_t *zeh) struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - zap_leaf_array_free(l, &le->le_name_chunk); - zap_leaf_array_free(l, &le->le_value_chunk); - *zeh->zeh_chunkp = le->le_next; + + /* Free in opposite order to reduce fragmentation. */ + zap_leaf_array_free(l, le->le_value_chunk); + zap_leaf_array_free(l, le->le_name_chunk); zap_leaf_chunk_free(l, entry_chunk); zap_leaf_phys(l)->l_hdr.lh_nentries--; @@ -701,34 +745,6 @@ zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) return (chunkp); } -static uint16_t -zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) -{ - uint16_t new_chunk; - uint16_t *nchunkp = &new_chunk; - - while (chunk != CHAIN_END) { - uint16_t nchunk = zap_leaf_chunk_alloc(nl); - struct zap_leaf_array *nla = - &ZAP_LEAF_CHUNK(nl, nchunk).l_array; - struct zap_leaf_array *la = - &ZAP_LEAF_CHUNK(l, chunk).l_array; - uint_t nextchunk = la->la_next; - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); - - *nla = *la; /* structure assignment */ - - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; - *nchunkp = nchunk; - nchunkp = &nla->la_next; - } - *nchunkp = CHAIN_END; - return (new_chunk); -} - static void zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { @@ -741,10 +757,12 @@ zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) (void) zap_leaf_rehash_entry(nl, nle, chunk); - nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); - nle->le_value_chunk = - zap_leaf_transfer_array(l, le->le_value_chunk, nl); + nle->le_name_chunk = zap_leaf_array_copy(l, le->le_name_chunk, nl); + nle->le_value_chunk = zap_leaf_array_copy(l, le->le_value_chunk, nl); + /* Free in opposite order to reduce fragmentation. */ + zap_leaf_array_free(l, le->le_value_chunk); + zap_leaf_array_free(l, le->le_name_chunk); zap_leaf_chunk_free(l, entry); zap_leaf_phys(l)->l_hdr.lh_nentries--; diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index 12938022e976..dfe309aa551f 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -1227,6 +1227,21 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, return (err); } +static int +zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints) +{ + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (0); +} + int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) @@ -1237,13 +1252,37 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); + err = zap_prefetch_uint64_impl(zap, key, key_numints); + /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_prefetch_uint64_impl(zap, key, key_numints); + /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +static int +zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } - fzap_prefetch(zn); + int err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); @@ -1259,16 +1298,25 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } + err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size, + num_integers, buf); + /* zap_lookup_uint64_impl() calls zap_unlockdir() */ + return (err); +} - err = fzap_lookup(zn, integer_size, num_integers, buf, - NULL, 0, NULL); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); +int +zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size, + num_integers, buf); + /* zap_lookup_uint64_impl() calls zap_unlockdir() */ return (err); } diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index a5daf73d59ba..f4d7e57542a1 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -2192,31 +2192,20 @@ zio_delay_interrupt(zio_t *zio) } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; - clock_t expire_at_tick = ddi_get_lbolt() + - NSEC_TO_TICK(diff); + int ticks = MAX(1, NSEC_TO_TICK(diff)); + clock_t expire_at_tick = ddi_get_lbolt() + ticks; DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); - if (NSEC_TO_TICK(diff) == 0) { - /* Our delay is less than a jiffy - just spin */ - zfs_sleep_until(zio->io_target_timestamp); - zio_interrupt(zio); - } else { + tid = taskq_dispatch_delay(system_taskq, zio_interrupt, + zio, TQ_NOSLEEP, expire_at_tick); + if (tid == TASKQID_INVALID) { /* - * Use taskq_dispatch_delay() in the place of - * OpenZFS's timeout_generic(). + * Couldn't allocate a task. Just finish the + * zio without a delay. */ - tid = taskq_dispatch_delay(system_taskq, - zio_interrupt, zio, TQ_NOSLEEP, - expire_at_tick); - if (tid == TASKQID_INVALID) { - /* - * Couldn't allocate a task. Just - * finish the zio without a delay. - */ - zio_interrupt(zio); - } + zio_interrupt(zio); } } return; diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index ce6772a40c8b..0d2fda8d5270 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -160,6 +160,12 @@ abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, abd_fletcher_4_impl(abd, size, &acd); } +/* + * Checksum vectors. + * + * Note: you cannot change the name string for these functions, as they are + * embedded in on-disk data in some places (eg dedup table names). + */ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c index 10c482573862..1a0178eb2830 100644 --- a/sys/contrib/openzfs/module/zfs/zio_compress.c +++ b/sys/contrib/openzfs/module/zfs/zio_compress.c @@ -44,10 +44,6 @@ static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. - * - * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS. - * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE - * PART OF THE ON-DISK FORMAT. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"inherit", 0, NULL, NULL, NULL}, diff --git a/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in b/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in index cd85dd28cf56..6735c4a67ec5 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in @@ -32,6 +32,7 @@ Requires(post): gcc, make, perl, diffutils %if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version}%{?openEuler} Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Requires(post): kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 +Conflicts: kernel-devel < @ZFS_META_KVER_MIN@, kernel-devel > @ZFS_META_KVER_MAX@.999 Obsoletes: spl-dkms <= %{version} %endif Provides: %{module}-kmod = %{version} diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/getversion.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/getversion.c index 1e026b92d17d..3626d1e968a3 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/getversion.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/getversion.c @@ -19,9 +19,13 @@ */ #include +#ifdef _KERNEL +#include +#else +#include +#endif #include #include -#include #include #include #include diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh index f8aa1c875c60..08ed5717b9da 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh @@ -41,9 +41,11 @@ log_must zfs set compress=zle $TESTDSTFS for prop in "${sync_prop_vals[@]}"; do log_must zfs set sync=$prop $TESTSRCFS + # 15*8=120, which is greater than 113, so we are sure the data won't + # be embedded into BP. # 32767*8=262136, which is larger than a single default recordsize of # 131072. - FILESIZE=$(random_int_between 1 32767) + FILESIZE=$(random_int_between 15 32767) FILESIZE=$((FILESIZE * 8)) bclone_test random $FILESIZE false $TESTSRCDIR $TESTSRCDIR done @@ -52,9 +54,11 @@ for srcprop in "${sync_prop_vals[@]}"; do log_must zfs set sync=$srcprop $TESTSRCFS for dstprop in "${sync_prop_vals[@]}"; do log_must zfs set sync=$dstprop $TESTDSTFS + # 15*8=120, which is greater than 113, so we are sure the data won't + # be embedded into BP. # 32767*8=262136, which is larger than a single default recordsize of # 131072. - FILESIZE=$(random_int_between 1 32767) + FILESIZE=$(random_int_between 15 32767) FILESIZE=$((FILESIZE * 8)) bclone_test random $FILESIZE false $TESTSRCDIR $TESTDSTDIR done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh index 4fb900c73cf6..7c44e800c16a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -69,15 +69,16 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" # Check no ONLINE slow vdevs are show. Then mark IOs greater than - # 160ms slow, delay IOs 320ms to vdev6, check slow IOs. + # 750ms slow, delay IOs 1000ms to vdev6, check slow IOs. log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE" - log_must set_tunable64 ZIO_SLOW_IO_MS 160 - log_must zinject -d $TESTDIR/vdev6 -D320:100 $TESTPOOL2 + log_must set_tunable64 ZIO_SLOW_IO_MS 750 + log_must zinject -d $TESTDIR/vdev6 -D1000:100 $TESTPOOL2 log_must mkfile 1048576 /$TESTPOOL2/testfile sync_pool $TESTPOOL2 log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must zinject -c all # Check vdev6 slow IOs are only shown when requested with -s. log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" @@ -95,10 +96,9 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" - log_must zinject -c all log_must zpool status -es $TESTPOOL2 - zpool destroy $TESTPOOL2 + log_must zpool destroy $TESTPOOL2 done log_pass "Verify zpool status -e shows only unhealthy vdevs" diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index fc6d0a0839ba..1959b7cab014 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -792,7 +792,7 @@ /* #undef ZFS_DEVICE_MINOR */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.3.99-64-FreeBSD_g1c9a4c8cb" +#define ZFS_META_ALIAS "zfs-2.3.99-92-FreeBSD_gd0a91b9f8" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -801,7 +801,7 @@ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ -#define ZFS_META_KVER_MAX "6.11" +#define ZFS_META_KVER_MAX "6.12" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "4.18" @@ -822,7 +822,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "64-FreeBSD_g1c9a4c8cb" +#define ZFS_META_RELEASE "92-FreeBSD_gd0a91b9f8" /* Define the project version. */ #define ZFS_META_VERSION "2.3.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 0b144c128952..a2122a773a82 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.3.99-64-g1c9a4c8cb" +#define ZFS_META_GITREV "zfs-2.3.99-92-gd0a91b9f8"