zfs: merge openzfs/zfs@d0a91b9f8

Notable upstream pull request merges:
 #16643 -multiple Change rangelock handling in FreeBSD's zfs_getpages()
 #16697 46c4f2ce0 dsl_dataset: put IO-inducing frees on the pool deadlist
 #16740 -multiple BRT: Rework structures and locks to be per-vdev
 #16743 a60ed3822 L2ARC: Move different stats updates earlier
 #16758 8dc452d90 Fix some nits in zfs_getpages()
 #16759 534688948 Remove hash_elements_max accounting from DBUF and ARC
 #16766 9a81484e3 ZAP: Reduce leaf array and free chunks fragmentation
 #16773 457f8b76e BRT: More optimizations after per-vdev splitting
 #16782 0ca82c568 L2ARC: Stop rebuild before setting spa_final_txg
 #16785 d76d79fd2 zio: Avoid sleeping in the I/O path
 #16791 ae1d11882 BRT: Clear bv_entcount_dirty on destroy
 #16796 b3b0ce64d FreeBSD: Lock vnode in zfs_ioctl()
 #16797 d0a91b9f8 FreeBSD: Reduce copy_file_range() source lock to shared

Obtained from:	OpenZFS
OpenZFS commit:	d0a91b9f88
Author:	Martin Matuska
Date:	2024-11-24 10:04:11 +01:00
Commit:	718519f4ef

37 changed files with 958 additions and 909 deletions


@ -70,6 +70,7 @@ Rob Norris <robn@despairlabs.com>
Rob Norris <rob.norris@klarasystems.com>
Sam Lunt <samuel.j.lunt@gmail.com>
Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Sebastian Wuerl <s.wuerl@mailbox.org>
Stoiko Ivanov <github@nomore.at>
Tamas TEVESZ <ice@extreme.hu>
WHR <msl0000023508@gmail.com>
@ -78,6 +79,7 @@ Youzhong Yang <youzhong@gmail.com>
# Signed-off-by: overriding Author:
Ryan <errornointernet@envs.net> <error.nointernet@gmail.com>
Sietse <sietse@wizdom.nu> <uglymotha@wizdom.nu>
Qiuhao Chen <chenqiuhao1997@gmail.com> <haohao0924@126.com>
Yuxin Wang <yuxinwang9999@gmail.com> <Bi11gates9999@gmail.com>
Zhenlei Huang <zlei@FreeBSD.org> <zlei.huang@gmail.com>


@ -423,6 +423,7 @@ CONTRIBUTORS:
Mathieu Velten <matmaul@gmail.com>
Matt Fiddaman <github@m.fiddaman.uk>
Matthew Ahrens <matt@delphix.com>
Matthew Heller <matthew.f.heller@gmail.com>
Matthew Thode <mthode@mthode.org>
Matthias Blankertz <matthias@blankertz.org>
Matt Johnston <matt@fugro-fsi.com.au>
@ -562,6 +563,7 @@ CONTRIBUTORS:
Scot W. Stevenson <scot.stevenson@gmail.com>
Sean Eric Fagan <sef@ixsystems.com>
Sebastian Gottschall <s.gottschall@dd-wrt.com>
Sebastian Wuerl <s.wuerl@mailbox.org>
Sebastien Roy <seb@delphix.com>
Sen Haerens <sen@senhaerens.be>
Serapheim Dimitropoulos <serapheim@delphix.com>
@ -574,6 +576,7 @@ CONTRIBUTORS:
Shawn Bayern <sbayern@law.fsu.edu>
Shengqi Chen <harry-chen@outlook.com>
Shen Yan <shenyanxxxy@qq.com>
Sietse <sietse@wizdom.nu>
Simon Guest <simon.guest@tesujimath.org>
Simon Klinkert <simon.klinkert@gmail.com>
Sowrabha Gopal <sowrabha.gopal@delphix.com>
@ -629,6 +632,7 @@ CONTRIBUTORS:
Trevor Bautista <trevrb@trevrb.net>
Trey Dockendorf <treydock@gmail.com>
Troels Nørgaard <tnn@tradeshift.com>
tstabrawa <tstabrawa@users.noreply.github.com>
Tulsi Jain <tulsi.jain@delphix.com>
Turbo Fredriksson <turbo@bayour.com>
Tyler J. Stachecki <stachecki.tyler@gmail.com>


@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
Linux-Maximum: 6.11
Linux-Maximum: 6.12
Linux-Minimum: 4.18


@ -662,10 +662,7 @@ def section_arc(kstats_dict):
print()
print('ARC hash breakdown:')
prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max']))
prt_i2('Elements current:',
f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']),
f_hits(arc_stats['hash_elements']))
prt_i1('Elements:', f_hits(arc_stats['hash_elements']))
prt_i1('Collisions:', f_hits(arc_stats['hash_collisions']))
prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max']))


@ -2119,9 +2119,6 @@ dump_brt(spa_t *spa)
return;
}
brt_t *brt = spa->spa_brt;
VERIFY(brt);
char count[32], used[32], saved[32];
zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
@ -2132,11 +2129,8 @@ dump_brt(spa_t *spa)
if (dump_opt['T'] < 2)
return;
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
if (brtvd == NULL)
continue;
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
if (!brtvd->bv_initiated) {
printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
continue;
@ -2160,20 +2154,21 @@ dump_brt(spa_t *spa)
if (!do_histo)
printf("\n%-16s %-10s\n", "DVA", "REFCNT");
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
if (brtvd == NULL || !brtvd->bv_initiated)
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
if (!brtvd->bv_initiated)
continue;
uint64_t counts[64] = {};
zap_cursor_t zc;
zap_attribute_t *za = zap_attribute_alloc();
for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
for (zap_cursor_init(&zc, spa->spa_meta_objset,
brtvd->bv_mos_entries);
zap_cursor_retrieve(&zc, za) == 0;
zap_cursor_advance(&zc)) {
uint64_t refcnt;
VERIFY0(zap_lookup_uint64(brt->brt_mos,
VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
brtvd->bv_mos_entries,
(const uint64_t *)za->za_name, 1,
za->za_integer_length, za->za_num_integers,
@ -8227,14 +8222,11 @@ dump_mos_leaks(spa_t *spa)
}
}
if (spa->spa_brt != NULL) {
brt_t *brt = spa->spa_brt;
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
if (brtvd != NULL && brtvd->bv_initiated) {
mos_obj_refd(brtvd->bv_mos_brtvdev);
mos_obj_refd(brtvd->bv_mos_entries);
}
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
if (brtvd->bv_initiated) {
mos_obj_refd(brtvd->bv_mos_brtvdev);
mos_obj_refd(brtvd->bv_mos_entries);
}
}


@ -445,8 +445,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
* its a loopback event from spa_async_remove(). Just
* ignore it.
*/
if (vs->vs_state == VDEV_STATE_REMOVED &&
state == VDEV_STATE_REMOVED)
if ((vs->vs_state == VDEV_STATE_REMOVED && state ==
VDEV_STATE_REMOVED) || vs->vs_state == VDEV_STATE_OFFLINE)
return;
/* Remove the vdev since device is unplugged */


@ -201,7 +201,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
"failed (%lld " #OP " %lld) " STR "\n", \
(long long)(_verify3_left), \
(long long)(_verify3_right), \
__VA_ARGS); \
__VA_ARGS__); \
} while (0)
#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \
@ -213,7 +213,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
"failed (%llu " #OP " %llu) " STR "\n", \
(unsigned long long)(_verify3_left), \
(unsigned long long)(_verify3_right), \
__VA_ARGS); \
__VA_ARGS__); \
} while (0)
#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \


@ -98,11 +98,9 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync)
{
if (vm_object_mightbedirty(vp->v_object)) {
int flags = sync ? OBJPC_SYNC : 0;
vn_lock(vp, LK_SHARED | LK_RETRY);
zfs_vmobject_wlock(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, flags);
zfs_vmobject_wunlock(vp->v_object);
VOP_UNLOCK(vp);
}
}
#endif


@ -205,7 +205,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
"failed (%lld " #OP " %lld) " STR "\n", \
(long long)(_verify3_left), \
(long long)(_verify3_right), \
__VA_ARGS); \
__VA_ARGS__); \
} while (0)
#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \
@ -217,7 +217,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
"failed (%llu " #OP " %llu) " STR "\n", \
(unsigned long long)(_verify3_left), \
(unsigned long long)(_verify3_right), \
__VA_ARGS); \
__VA_ARGS__); \
} while (0)
#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \
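
The hunk above and the identical FreeBSD SPL one earlier fix the same build bug: __VA_ARGS without the trailing underscores is not the variadic-macro keyword, just an ordinary undeclared identifier, so any expansion of these macros with a format string failed to compile. A minimal standalone illustration, using hypothetical CHECK_MSG macros rather than the SPL ones:

        #include <stdio.h>

        /* Hypothetical stand-ins for the VERIFY3*F macros. */
        #define CHECK_MSG_BAD(COND, STR, ...) do {                      \
                if (!(COND))                                            \
                        printf("check failed: " STR "\n", __VA_ARGS);   \
        } while (0)
        /*
         * Any use of CHECK_MSG_BAD fails to compile: '__VA_ARGS' is an
         * undeclared identifier, only noticed once the macro expands.
         */

        #define CHECK_MSG_OK(COND, STR, ...) do {                       \
                if (!(COND))                                            \
                        printf("check failed: " STR "\n", __VA_ARGS__); \
        } while (0)

        int
        main(void)
        {
                CHECK_MSG_OK(1 == 2, "x=%d", 42); /* check failed: x=42 */
                return (0);
        }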


@ -347,6 +347,7 @@ void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
void l2arc_spa_rebuild_start(spa_t *spa);
void l2arc_spa_rebuild_stop(spa_t *spa);
#ifndef _KERNEL
extern boolean_t arc_watch;


@ -942,6 +942,7 @@ typedef struct arc_sums {
wmsum_t arcstat_evict_l2_eligible_mru;
wmsum_t arcstat_evict_l2_ineligible;
wmsum_t arcstat_evict_l2_skip;
wmsum_t arcstat_hash_elements;
wmsum_t arcstat_hash_collisions;
wmsum_t arcstat_hash_chains;
aggsum_t arcstat_size;


@ -86,28 +86,38 @@ typedef struct brt_vdev_phys {
uint64_t bvp_savedspace;
} brt_vdev_phys_t;
typedef struct brt_vdev {
struct brt_vdev {
/*
* Pending changes from open contexts.
*/
kmutex_t bv_pending_lock;
avl_tree_t bv_pending_tree[TXG_SIZE];
/*
* Protects bv_mos_*.
*/
krwlock_t bv_mos_entries_lock ____cacheline_aligned;
/*
* Protects all the fields starting from bv_initiated.
*/
krwlock_t bv_lock ____cacheline_aligned;
/*
* VDEV id.
*/
uint64_t bv_vdevid;
/*
* Is the structure initiated?
* (bv_entcount and bv_bitmap are allocated?)
*/
boolean_t bv_initiated;
uint64_t bv_vdevid ____cacheline_aligned;
/*
* Object number in the MOS for the entcount array and brt_vdev_phys.
*/
uint64_t bv_mos_brtvdev;
/*
* Object number in the MOS for the entries table.
* Object number in the MOS and dnode for the entries table.
*/
uint64_t bv_mos_entries;
dnode_t *bv_mos_entries_dnode;
/*
* Entries to sync.
* Is the structure initiated?
* (bv_entcount and bv_bitmap are allocated?)
*/
avl_tree_t bv_tree;
boolean_t bv_initiated;
/*
* Does the bv_entcount[] array needs byte swapping?
*/
@ -120,6 +130,26 @@ typedef struct brt_vdev {
* This is the array with BRT entry count per BRT_RANGESIZE.
*/
uint16_t *bv_entcount;
/*
* bv_entcount[] potentially can be a bit too big to sychronize it all
* when we just changed few entcounts. The fields below allow us to
* track updates to bv_entcount[] array since the last sync.
* A single bit in the bv_bitmap represents as many entcounts as can
* fit into a single BRT_BLOCKSIZE.
* For example we have 65536 entcounts in the bv_entcount array
* (so the whole array is 128kB). We updated bv_entcount[2] and
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
*/
ulong_t *bv_bitmap;
/*
* bv_entcount[] needs updating on disk.
*/
boolean_t bv_entcount_dirty;
/*
* brt_vdev_phys needs updating on disk.
*/
boolean_t bv_meta_dirty;
/*
* Sum of all bv_entcount[]s.
*/
@ -133,65 +163,27 @@ typedef struct brt_vdev {
*/
uint64_t bv_savedspace;
/*
* brt_vdev_phys needs updating on disk.
* Entries to sync.
*/
boolean_t bv_meta_dirty;
/*
* bv_entcount[] needs updating on disk.
*/
boolean_t bv_entcount_dirty;
/*
* bv_entcount[] potentially can be a bit too big to sychronize it all
* when we just changed few entcounts. The fields below allow us to
* track updates to bv_entcount[] array since the last sync.
* A single bit in the bv_bitmap represents as many entcounts as can
* fit into a single BRT_BLOCKSIZE.
* For example we have 65536 entcounts in the bv_entcount array
* (so the whole array is 128kB). We updated bv_entcount[2] and
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
*/
ulong_t *bv_bitmap;
uint64_t bv_nblocks;
} brt_vdev_t;
avl_tree_t bv_tree;
};
/*
* In-core brt
*/
typedef struct brt {
krwlock_t brt_lock;
spa_t *brt_spa;
#define brt_mos brt_spa->spa_meta_objset
uint64_t brt_rangesize;
uint64_t brt_usedspace;
uint64_t brt_savedspace;
avl_tree_t brt_pending_tree[TXG_SIZE];
kmutex_t brt_pending_lock[TXG_SIZE];
/* Sum of all entries across all bv_trees. */
uint64_t brt_nentries;
brt_vdev_t *brt_vdevs;
uint64_t brt_nvdevs;
} brt_t;
/* Size of bre_offset / sizeof (uint64_t). */
/* Size of offset / sizeof (uint64_t). */
#define BRT_KEY_WORDS (1)
#define BRE_OFFSET(bre) (DVA_GET_OFFSET(&(bre)->bre_bp.blk_dva[0]))
/*
* In-core brt entry.
* On-disk we use bre_offset as the key and bre_refcount as the value.
* On-disk we use ZAP with offset as the key and count as the value.
*/
typedef struct brt_entry {
uint64_t bre_offset;
uint64_t bre_refcount;
avl_node_t bre_node;
blkptr_t bre_bp;
uint64_t bre_count;
uint64_t bre_pcount;
} brt_entry_t;
typedef struct brt_pending_entry {
blkptr_t bpe_bp;
int bpe_count;
avl_node_t bpe_node;
} brt_pending_entry_t;
#ifdef __cplusplus
}
#endif
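
The relocated comment above carries the key arithmetic: one bv_bitmap bit covers as many entcounts as fit in one BRT_BLOCKSIZE, so sync only rewrites the touched blocks of the 128 KiB entcount array. A worked userspace sketch of that indexing; the 32 KiB BRT_BLOCKSIZE value and the helper name are assumptions for illustration:

        #include <stdio.h>
        #include <stdint.h>
        #include <limits.h>

        /* Assumed block size; one bitmap bit covers this many bytes. */
        #define BRT_BLOCKSIZE   (32 * 1024)
        #define ENTS_PER_BIT    (BRT_BLOCKSIZE / sizeof (uint16_t))

        static unsigned long bv_bitmap[4];

        static void
        mark_entcount_dirty(uint64_t idx)
        {
                uint64_t bit = idx / ENTS_PER_BIT;

                bv_bitmap[bit / (CHAR_BIT * sizeof (unsigned long))] |=
                    1UL << (bit % (CHAR_BIT * sizeof (unsigned long)));
        }

        int
        main(void)
        {
                /*
                 * 65536 entcounts = 128 KiB array = 4 blocks = 4 bits.
                 * Touching entcounts 2 and 5 dirties only bit 0, so
                 * sync rewrites just the first BRT_BLOCKSIZE.
                 */
                mark_entcount_dirty(2);
                mark_entcount_dirty(5);
                printf("bitmap[0] = %#lx\n", bv_bitmap[0]); /* 0x1 */
                return (0);
        }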


@ -53,6 +53,7 @@ extern "C" {
/*
* Forward references that lots of things need.
*/
typedef struct brt_vdev brt_vdev_t;
typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;


@ -412,8 +412,12 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
uint64_t spa_rdspace; /* raw (non-dedup) --//-- */
boolean_t spa_active_ddt_prune; /* ddt prune process active */
struct brt *spa_brt; /* in-core BRT */
brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */
uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */
uint64_t spa_brt_rangesize; /* pool's BRT range size */
krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */


@ -223,11 +223,15 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
boolean_t *normalization_conflictp);
int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
int zap_prefetch_object(objset_t *os, uint64_t zapobj);
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints);
int zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints);
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
@ -236,9 +240,6 @@ int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp);
int zap_count_write_by_dnode(dnode_t *dn, const char *name,
int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite);
/*
* Create an attribute with the given name and value.
*
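
The new *_by_dnode variants let a caller that already holds the dnode (as the reworked BRT now does via bv_mos_entries_dnode) skip the objset/object lookup on every access. A hedged sketch of a refcount lookup through the new entry point; the wrapper name is hypothetical, the one-word offset key follows the BRT usage elsewhere in this diff, and the fragment builds only in-tree:

        #include <sys/zap.h>    /* OpenZFS in-tree build only */

        static int
        brt_refcount_peek(dnode_t *entries_dn, uint64_t offset,
            uint64_t *refcntp)
        {
                /* BRT_KEY_WORDS == 1: the key is the offset word. */
                return (zap_lookup_uint64_by_dnode(entries_dn, &offset, 1,
                    sizeof (*refcntp), 1, refcntp));
        }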


@ -109,7 +109,7 @@ Stops and cancels an in-progress removal of a top-level vdev.
.El
.
.Sh EXAMPLES
.\" These are, respectively, examples 14 from zpool.8
.\" These are, respectively, examples 15 from zpool.8
.\" Make sure to update them bidirectionally
.Ss Example 1 : No Removing a Mirrored top-level (Log or Data) Device
The following commands remove the mirrored log device
@ -142,9 +142,43 @@ The command to remove the mirrored log
.Ar mirror-2 No is :
.Dl # Nm zpool Cm remove Ar tank mirror-2
.Pp
At this point, the log device no longer exists
(both sides of the mirror have been removed):
.Bd -literal -compact -offset Ds
pool: tank
state: ONLINE
scan: none requested
config:
NAME STATE READ WRITE CKSUM
tank ONLINE 0 0 0
mirror-0 ONLINE 0 0 0
sda ONLINE 0 0 0
sdb ONLINE 0 0 0
mirror-1 ONLINE 0 0 0
sdc ONLINE 0 0 0
sdd ONLINE 0 0 0
.Ed
.Pp
The command to remove the mirrored data
.Ar mirror-1 No is :
.Dl # Nm zpool Cm remove Ar tank mirror-1
.Pp
After
.Ar mirror-1 No has been evacuated, the pool remains redundant, but
the total amount of space is reduced:
.Bd -literal -compact -offset Ds
pool: tank
state: ONLINE
scan: none requested
config:
NAME STATE READ WRITE CKSUM
tank ONLINE 0 0 0
mirror-0 ONLINE 0 0 0
sda ONLINE 0 0 0
sdb ONLINE 0 0 0
.Ed
.
.Sh SEE ALSO
.Xr zpool-add 8 ,


@ -405,9 +405,43 @@ The command to remove the mirrored log
.Ar mirror-2 No is :
.Dl # Nm zpool Cm remove Ar tank mirror-2
.Pp
At this point, the log device no longer exists
(both sides of the mirror have been removed):
.Bd -literal -compact -offset Ds
pool: tank
state: ONLINE
scan: none requested
config:
NAME STATE READ WRITE CKSUM
tank ONLINE 0 0 0
mirror-0 ONLINE 0 0 0
sda ONLINE 0 0 0
sdb ONLINE 0 0 0
mirror-1 ONLINE 0 0 0
sdc ONLINE 0 0 0
sdd ONLINE 0 0 0
.Ed
.Pp
The command to remove the mirrored data
.Ar mirror-1 No is :
.Dl # Nm zpool Cm remove Ar tank mirror-1
.Pp
After
.Ar mirror-1 No has been evacuated, the pool remains redundant, but
the total amount of space is reduced:
.Bd -literal -compact -offset Ds
pool: tank
state: ONLINE
scan: none requested
config:
NAME STATE READ WRITE CKSUM
tank ONLINE 0 0 0
mirror-0 ONLINE 0 0 0
sda ONLINE 0 0 0
sdb ONLINE 0 0 0
.Ed
.
.Ss Example 16 : No Displaying expanded space on a device
The following command displays the detailed information for the pool


@ -291,8 +291,12 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
case F_SEEK_HOLE:
{
off = *(offset_t *)data;
error = vn_lock(vp, LK_SHARED);
if (error)
return (error);
/* offset parameter is in/out */
error = zfs_holey(VTOZ(vp), com, &off);
VOP_UNLOCK(vp);
if (error)
return (error);
*(offset_t *)data = off;
@ -452,8 +456,10 @@ mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
if (!vm_page_wired(pp) && pp->valid == 0 &&
vm_page_busy_tryupgrade(pp))
vm_page_free(pp);
else
else {
vm_page_deactivate_noreuse(pp);
vm_page_sunbusy(pp);
}
zfs_vmobject_wunlock(obj);
}
} else {
@ -3928,6 +3934,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
return (zfs_vm_pagerret_error);
object = ma[0]->object;
start = IDX_TO_OFF(ma[0]->pindex);
end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
@ -3936,33 +3943,47 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
* Note that we need to handle the case of the block size growing.
*/
for (;;) {
uint64_t len;
blksz = zp->z_blksz;
len = roundup(end, blksz) - rounddown(start, blksz);
lr = zfs_rangelock_tryenter(&zp->z_rangelock,
rounddown(start, blksz),
roundup(end, blksz) - rounddown(start, blksz), RL_READER);
rounddown(start, blksz), len, RL_READER);
if (lr == NULL) {
if (rahead != NULL) {
*rahead = 0;
rahead = NULL;
/*
* Avoid a deadlock with update_pages(). We need to
* hold the range lock when copying from the DMU, so
* give up the busy lock to allow update_pages() to
* proceed. We might need to allocate new pages, which
* isn't quite right since this allocation isn't subject
* to the page fault handler's OOM logic, but this is
* the best we can do for now.
*/
for (int i = 0; i < count; i++) {
ASSERT(vm_page_none_valid(ma[i]));
vm_page_xunbusy(ma[i]);
}
if (rbehind != NULL) {
*rbehind = 0;
rbehind = NULL;
}
break;
lr = zfs_rangelock_enter(&zp->z_rangelock,
rounddown(start, blksz), len, RL_READER);
zfs_vmobject_wlock(object);
(void) vm_page_grab_pages(object, OFF_TO_IDX(start),
VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO,
ma, count);
zfs_vmobject_wunlock(object);
}
if (blksz == zp->z_blksz)
break;
zfs_rangelock_exit(lr);
}
object = ma[0]->object;
zfs_vmobject_wlock(object);
obj_size = object->un_pager.vnp.vnp_size;
zfs_vmobject_wunlock(object);
if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
if (lr != NULL)
zfs_rangelock_exit(lr);
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
return (zfs_vm_pagerret_bad);
}
@ -3987,11 +4008,33 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
* ZFS will panic if we request DMU to read beyond the end of the last
* allocated block.
*/
error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b,
&pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE));
for (int i = 0; i < count; i++) {
int dummypgsin, count1, j, last_size;
if (lr != NULL)
zfs_rangelock_exit(lr);
if (vm_page_any_valid(ma[i])) {
ASSERT(vm_page_all_valid(ma[i]));
continue;
}
for (j = i + 1; j < count; j++) {
if (vm_page_any_valid(ma[j])) {
ASSERT(vm_page_all_valid(ma[j]));
break;
}
}
count1 = j - i;
dummypgsin = 0;
last_size = j == count ?
MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE;
error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1,
i == 0 ? &pgsin_b : &dummypgsin,
j == count ? &pgsin_a : &dummypgsin,
last_size);
if (error != 0)
break;
i += count1 - 1;
}
zfs_rangelock_exit(lr);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);
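
The long comment added above describes an optimistic-locking retry: try the range lock without sleeping; on failure, unbusy the pages the lock holder (update_pages()) may itself be waiting on, block on the range lock, re-grab the pages, and in every case re-check z_blksz before trusting the locked range. A self-contained userspace model of that shape, with pthread stand-ins instead of ZFS primitives:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t range_lock = PTHREAD_RWLOCK_INITIALIZER;
        static unsigned long blksz = 4096;      /* may grow under the lock */

        static void unbusy_pages(void) { }  /* stand-in: vm_page_xunbusy() */
        static void regrab_pages(void) { }  /* stand-in: vm_page_grab_pages() */

        static void
        lock_range_for_stable_blksz(void)
        {
                for (;;) {
                        unsigned long seen = blksz;

                        if (pthread_rwlock_tryrdlock(&range_lock) != 0) {
                                /*
                                 * The lock holder may be blocked on our
                                 * busy pages: release them before
                                 * blocking, reacquire afterwards.
                                 */
                                unbusy_pages();
                                pthread_rwlock_rdlock(&range_lock);
                                regrab_pages();
                        }
                        if (seen == blksz)
                                return;         /* range sized correctly */
                        pthread_rwlock_unlock(&range_lock); /* grew: retry */
                }
        }

        int
        main(void)
        {
                lock_range_for_stable_blksz();
                pthread_rwlock_unlock(&range_lock);
                printf("locked with blksz=%lu\n", blksz);
                return (0);
        }
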
@ -6159,7 +6202,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
} else {
#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
__FreeBSD_version >= 1400086
vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
vn_lock_pair(invp, false, LK_SHARED, outvp, false,
LK_EXCLUSIVE);
#else
vn_lock_pair(invp, false, outvp, false);


@ -375,7 +375,18 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
struct super_block *sb = (struct super_block *)arg;
int objects = 0;
(void) -zfs_prune(sb, nr_to_scan, &objects);
/*
* deactivate_locked_super calls shrinker_free and only then
* sops->kill_sb cb, resulting in UAF on umount when trying to reach
* for the shrinker functions in zpl_prune_sb of in-umount dataset.
* Increment if s_active is not zero, but don't prune if it is -
* umount could be underway.
*/
if (atomic_inc_not_zero(&sb->s_active)) {
(void) -zfs_prune(sb, nr_to_scan, &objects);
atomic_dec(&sb->s_active);
}
}
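
The guard above is the classic take-a-reference-only-if-alive idiom: atomic_inc_not_zero() refuses to resurrect a superblock whose active count has already hit zero, which is exactly the window where deactivate_locked_super() has freed the shrinker. A userspace model of the idiom using C11 atomics (illustrative names, not the kernel API):

        #include <stdatomic.h>
        #include <stdio.h>

        static _Atomic int s_active = 1;

        static int
        tryget_active(void)
        {
                int old = atomic_load(&s_active);

                while (old != 0) {
                        /* On failure, 'old' is reloaded automatically. */
                        if (atomic_compare_exchange_weak(&s_active, &old,
                            old + 1))
                                return (1);     /* got a reference */
                }
                return (0);     /* already dead: do not touch it */
        }

        int
        main(void)
        {
                if (tryget_active()) {
                        printf("pruning is safe\n");
                        atomic_fetch_sub(&s_active, 1); /* put reference */
                }
                return (0);
        }
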
const struct super_operations zpl_super_operations = {


@ -1176,7 +1176,7 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
limits->zql_max_segment_size = UINT_MAX;
}
limits->zql_io_opt = zv->zv_volblocksize;
limits->zql_io_opt = DMU_MAX_ACCESS / 2;
limits->zql_physical_block_size = zv->zv_volblocksize;
limits->zql_max_discard_sectors =


@ -1074,12 +1074,9 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
ARCSTAT_BUMP(arcstat_hash_collisions);
if (i == 1)
ARCSTAT_BUMP(arcstat_hash_chains);
ARCSTAT_MAX(arcstat_hash_chain_max, i);
}
uint64_t he = atomic_inc_64_nv(
&arc_stats.arcstat_hash_elements.value.ui64);
ARCSTAT_MAX(arcstat_hash_elements_max, he);
ARCSTAT_BUMP(arcstat_hash_elements);
return (NULL);
}
@ -1103,8 +1100,7 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
/* collect some hash table performance data */
atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64);
ARCSTAT_BUMPDOWN(arcstat_hash_elements);
if (buf_hash_table.ht_table[idx] &&
buf_hash_table.ht_table[idx]->b_hash_next == NULL)
ARCSTAT_BUMPDOWN(arcstat_hash_chains);
@ -7008,6 +7004,9 @@ arc_kstat_update(kstat_t *ksp, int rw)
wmsum_value(&arc_sums.arcstat_evict_l2_ineligible);
as->arcstat_evict_l2_skip.value.ui64 =
wmsum_value(&arc_sums.arcstat_evict_l2_skip);
as->arcstat_hash_elements.value.ui64 =
as->arcstat_hash_elements_max.value.ui64 =
wmsum_value(&arc_sums.arcstat_hash_elements);
as->arcstat_hash_collisions.value.ui64 =
wmsum_value(&arc_sums.arcstat_hash_collisions);
as->arcstat_hash_chains.value.ui64 =
@ -7432,6 +7431,7 @@ arc_state_init(void)
wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
wmsum_init(&arc_sums.arcstat_hash_elements, 0);
wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
wmsum_init(&arc_sums.arcstat_hash_chains, 0);
aggsum_init(&arc_sums.arcstat_size, 0);
@ -7590,6 +7590,7 @@ arc_state_fini(void)
wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
wmsum_fini(&arc_sums.arcstat_hash_elements);
wmsum_fini(&arc_sums.arcstat_hash_collisions);
wmsum_fini(&arc_sums.arcstat_hash_chains);
aggsum_fini(&arc_sums.arcstat_size);
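
This file and the dbuf change further down make the same swap: the hot-path atomic_inc_64_nv()/atomic_dec_64() on a kstat value becomes a wmsum, OpenZFS's per-CPU write-mostly counter. The trade-off is that a cheap running maximum is no longer available, which is why hash_elements_max now simply mirrors the current count at kstat-read time. The usage shape, as an in-tree fragment (builds only against the OpenZFS headers):

        #include <sys/wmsum.h>  /* OpenZFS in-tree build only */

        static wmsum_t hash_elements;

        static void
        wmsum_counter_shape(void)
        {
                wmsum_init(&hash_elements, 0);      /* arc_state_init() */
                wmsum_add(&hash_elements, 1);       /* insert: ARCSTAT_BUMP */
                wmsum_add(&hash_elements, -1);      /* remove: BUMPDOWN */
                (void) wmsum_value(&hash_elements); /* kstat read (slow) */
                wmsum_fini(&hash_elements);         /* arc_state_fini() */
        }
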
@ -9287,6 +9288,14 @@ skip:
hdr->b_l2hdr.b_hits = 0;
hdr->b_l2hdr.b_arcs_state =
hdr->b_l1hdr.b_state->arcs_state;
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
ARC_FLAG_L2_WRITING);
(void) zfs_refcount_add_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);
l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
mutex_enter(&dev->l2ad_mtx);
if (pio == NULL) {
/*
@ -9298,12 +9307,6 @@ skip:
}
list_insert_head(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
ARC_FLAG_L2_WRITING);
(void) zfs_refcount_add_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);
l2arc_hdr_arcstats_increment(hdr);
boolean_t commit = l2arc_log_blk_insert(dev, hdr);
mutex_exit(hash_lock);
@ -9333,7 +9336,6 @@ skip:
write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
if (commit) {
/* l2ad_hand will be adjusted inside. */
@ -9844,6 +9846,37 @@ l2arc_spa_rebuild_start(spa_t *spa)
}
}
void
l2arc_spa_rebuild_stop(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
l2arc_dev_t *dev =
l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
if (dev == NULL)
continue;
mutex_enter(&l2arc_rebuild_thr_lock);
dev->l2ad_rebuild_cancel = B_TRUE;
mutex_exit(&l2arc_rebuild_thr_lock);
}
for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
l2arc_dev_t *dev =
l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
if (dev == NULL)
continue;
mutex_enter(&l2arc_rebuild_thr_lock);
if (dev->l2ad_rebuild_began == B_TRUE) {
while (dev->l2ad_rebuild == B_TRUE) {
cv_wait(&l2arc_rebuild_thr_cv,
&l2arc_rebuild_thr_lock);
}
}
mutex_exit(&l2arc_rebuild_thr_lock);
}
}
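
l2arc_spa_rebuild_stop() above deliberately makes two passes: the first flags every cache device's rebuild for cancellation, the second waits on each, so all rebuild threads wind down in parallel instead of serially per device. A self-contained userspace model of that two-phase stop, with pthread primitives standing in for the kernel's:

        #include <pthread.h>
        #include <stdio.h>

        #define NDEV 3

        static pthread_mutex_t thr_lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t thr_cv = PTHREAD_COND_INITIALIZER;
        static int rebuild_cancel[NDEV], rebuilding[NDEV] = { 1, 1, 1 };

        static void *
        rebuild_thread(void *arg)
        {
                int i = *(int *)arg;

                pthread_mutex_lock(&thr_lock);
                while (!rebuild_cancel[i])      /* "rebuild work" */
                        pthread_cond_wait(&thr_cv, &thr_lock);
                rebuilding[i] = 0;
                pthread_cond_broadcast(&thr_cv); /* like the cv_signal */
                pthread_mutex_unlock(&thr_lock);
                return (NULL);
        }

        int
        main(void)
        {
                pthread_t tid[NDEV];
                int ids[NDEV] = { 0, 1, 2 };

                for (int i = 0; i < NDEV; i++)
                        pthread_create(&tid[i], NULL, rebuild_thread,
                            &ids[i]);

                pthread_mutex_lock(&thr_lock);
                for (int i = 0; i < NDEV; i++)  /* pass 1: flag all */
                        rebuild_cancel[i] = 1;
                pthread_cond_broadcast(&thr_cv);
                for (int i = 0; i < NDEV; i++)  /* pass 2: wait for all */
                        while (rebuilding[i])
                                pthread_cond_wait(&thr_cv, &thr_lock);
                pthread_mutex_unlock(&thr_lock);

                for (int i = 0; i < NDEV; i++)
                        pthread_join(tid[i], NULL);
                printf("all rebuilds stopped\n");
                return (0);
        }
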
/*
* Main entry point for L2ARC rebuilding.
*/
@ -9852,12 +9885,12 @@ l2arc_dev_rebuild_thread(void *arg)
{
l2arc_dev_t *dev = arg;
VERIFY(!dev->l2ad_rebuild_cancel);
VERIFY(dev->l2ad_rebuild);
(void) l2arc_rebuild(dev);
mutex_enter(&l2arc_rebuild_thr_lock);
dev->l2ad_rebuild_began = B_FALSE;
dev->l2ad_rebuild = B_FALSE;
cv_signal(&l2arc_rebuild_thr_cv);
mutex_exit(&l2arc_rebuild_thr_lock);
thread_exit();
@ -10008,8 +10041,6 @@ l2arc_rebuild(l2arc_dev_t *dev)
for (;;) {
mutex_enter(&l2arc_rebuild_thr_lock);
if (dev->l2ad_rebuild_cancel) {
dev->l2ad_rebuild = B_FALSE;
cv_signal(&l2arc_rebuild_thr_cv);
mutex_exit(&l2arc_rebuild_thr_lock);
err = SET_ERROR(ECANCELED);
goto out;
@ -10585,6 +10616,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
(void) zio_nowait(wzio);
dev->l2ad_hand += asize;
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
/*
* Include the committed log block's pointer in the list of pointers
* to log blocks present in the L2ARC device.
@ -10598,7 +10631,6 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
mutex_exit(&dev->l2ad_mtx);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
/* bump the kstats */
ARCSTAT_INCR(arcstat_l2_write_bytes, asize);

File diff suppressed because it is too large.


@ -89,7 +89,6 @@ typedef struct dbuf_stats {
kstat_named_t hash_misses;
kstat_named_t hash_collisions;
kstat_named_t hash_elements;
kstat_named_t hash_elements_max;
/*
* Number of sublists containing more than one dbuf in the dbuf
* hash table. Keep track of the longest hash chain.
@ -134,7 +133,6 @@ dbuf_stats_t dbuf_stats = {
{ "hash_misses", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 },
{ "hash_insert_race", KSTAT_DATA_UINT64 },
@ -154,6 +152,7 @@ struct {
wmsum_t hash_hits;
wmsum_t hash_misses;
wmsum_t hash_collisions;
wmsum_t hash_elements;
wmsum_t hash_chains;
wmsum_t hash_insert_race;
wmsum_t metadata_cache_count;
@ -432,8 +431,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
db->db_hash_next = h->hash_table[idx];
h->hash_table[idx] = db;
mutex_exit(DBUF_HASH_MUTEX(h, idx));
uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
DBUF_STAT_MAX(hash_elements_max, he);
DBUF_STAT_BUMP(hash_elements);
return (NULL);
}
@ -506,7 +504,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
h->hash_table[idx]->db_hash_next == NULL)
DBUF_STAT_BUMPDOWN(hash_chains);
mutex_exit(DBUF_HASH_MUTEX(h, idx));
atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
DBUF_STAT_BUMPDOWN(hash_elements);
}
typedef enum {
@ -903,6 +901,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
wmsum_value(&dbuf_sums.hash_misses);
ds->hash_collisions.value.ui64 =
wmsum_value(&dbuf_sums.hash_collisions);
ds->hash_elements.value.ui64 =
wmsum_value(&dbuf_sums.hash_elements);
ds->hash_chains.value.ui64 =
wmsum_value(&dbuf_sums.hash_chains);
ds->hash_insert_race.value.ui64 =
@ -1004,6 +1004,7 @@ dbuf_init(void)
wmsum_init(&dbuf_sums.hash_hits, 0);
wmsum_init(&dbuf_sums.hash_misses, 0);
wmsum_init(&dbuf_sums.hash_collisions, 0);
wmsum_init(&dbuf_sums.hash_elements, 0);
wmsum_init(&dbuf_sums.hash_chains, 0);
wmsum_init(&dbuf_sums.hash_insert_race, 0);
wmsum_init(&dbuf_sums.metadata_cache_count, 0);
@ -1077,6 +1078,7 @@ dbuf_fini(void)
wmsum_fini(&dbuf_sums.hash_hits);
wmsum_fini(&dbuf_sums.hash_misses);
wmsum_fini(&dbuf_sums.hash_collisions);
wmsum_fini(&dbuf_sums.hash_elements);
wmsum_fini(&dbuf_sums.hash_chains);
wmsum_fini(&dbuf_sums.hash_insert_race);
wmsum_fini(&dbuf_sums.metadata_cache_count);
@ -2578,8 +2580,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* We are freeing a block that we cloned in the same
* transaction group.
*/
brt_pending_remove(dmu_objset_spa(db->db_objset),
&dr->dt.dl.dr_overridden_by, tx);
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
brt_pending_remove(dmu_objset_spa(db->db_objset),
bp, tx);
}
}
dnode_t *dn = dr->dr_dnode;


@ -68,6 +68,7 @@
#include <sys/zio_compress.h>
#include <zfs_fletcher.h>
#include <sys/zio_checksum.h>
#include <sys/brt.h>
/*
* The SPA supports block sizes up to 16MB. However, very large blocks
@ -289,8 +290,26 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object);
dsl_free(tx->tx_pool, tx->tx_txg, bp);
/*
* Put blocks that would create IO on the pool's deadlist for
* dsl_process_async_destroys() to find. This is to prevent
* zio_free() from creating a ZIO_TYPE_FREE IO for them, which
* are very heavy and can lead to out-of-memory conditions if
* something tries to free millions of blocks on the same txg.
*/
boolean_t defer = spa_version(spa) >= SPA_VERSION_DEADLISTS &&
(BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
brt_maybe_exists(spa, bp));
if (defer) {
dprintf_bp(bp, "putting on free list: %s", "");
bpobj_enqueue(&ds->ds_dir->dd_pool->dp_free_bpobj,
bp, B_FALSE, tx);
} else {
dprintf_bp(bp, "freeing ds=%llu",
(u_longlong_t)ds->ds_object);
dsl_free(tx->tx_pool, tx->tx_txg, bp);
}
mutex_enter(&ds->ds_lock);
ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
@ -298,9 +317,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
delta = parent_delta(ds, -used);
dsl_dataset_phys(ds)->ds_unique_bytes -= used;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_transfer_space(ds->ds_dir,
delta, -compressed, -uncompressed, -used,
DD_USED_REFRSRV, DD_USED_HEAD, tx);
if (defer)
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
DD_USED_HEAD, used, compressed, uncompressed, tx);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
if (async) {


@ -2081,6 +2081,7 @@ spa_unload(spa_t *spa)
vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
vdev_rebuild_stop_all(spa);
l2arc_spa_rebuild_stop(spa);
}
}
@ -7115,6 +7116,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
vdev_rebuild_stop_all(spa);
l2arc_spa_rebuild_stop(spa);
/*
* We want this to be reflected on every label,


@ -1870,13 +1870,7 @@ spa_get_slop_space(spa_t *spa)
if (spa->spa_dedup_dspace == ~0ULL)
spa_update_dspace(spa);
/*
* spa_get_dspace() includes the space only logically "used" by
* deduplicated data, so since it's not useful to reserve more
* space with more deduplicated data, we subtract that out here.
*/
space =
spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
space = spa->spa_rdspace;
slop = MIN(space >> spa_slop_shift, spa_max_slop);
/*
@ -1912,8 +1906,7 @@ spa_get_checkpoint_space(spa_t *spa)
void
spa_update_dspace(spa_t *spa)
{
spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa));
if (spa->spa_nonallocating_dspace > 0) {
/*
* Subtract the space provided by all non-allocating vdevs that
@ -1933,9 +1926,11 @@ spa_update_dspace(spa_t *spa)
* doesn't matter that the data we are moving may be
* allocated twice (on the old device and the new device).
*/
ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace);
spa->spa_dspace -= spa->spa_nonallocating_dspace;
ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace);
spa->spa_rdspace -= spa->spa_nonallocating_dspace;
}
spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) +
brt_get_dspace(spa);
}
/*


@ -248,20 +248,63 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf,
return (chunk_head);
}
static void
zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
/*
* Non-destructively copy array between leaves.
*/
static uint16_t
zap_leaf_array_copy(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
{
uint16_t chunk = *chunkp;
*chunkp = CHAIN_END;
uint16_t new_chunk;
uint16_t *nchunkp = &new_chunk;
while (chunk != CHAIN_END) {
uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
ZAP_CHUNK_ARRAY);
zap_leaf_chunk_free(l, chunk);
chunk = nextchunk;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
uint16_t nchunk = zap_leaf_chunk_alloc(nl);
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
struct zap_leaf_array *nla =
&ZAP_LEAF_CHUNK(nl, nchunk).l_array;
ASSERT3U(la->la_type, ==, ZAP_CHUNK_ARRAY);
*nla = *la; /* structure assignment */
chunk = la->la_next;
*nchunkp = nchunk;
nchunkp = &nla->la_next;
}
*nchunkp = CHAIN_END;
return (new_chunk);
}
/*
* Free array. Unlike trivial loop of zap_leaf_chunk_free() this does
* not reverse order of chunks in the free list, reducing fragmentation.
*/
static void
zap_leaf_array_free(zap_leaf_t *l, uint16_t chunk)
{
struct zap_leaf_header *hdr = &zap_leaf_phys(l)->l_hdr;
uint16_t *tailp = &hdr->lh_freelist;
uint16_t oldfree = *tailp;
while (chunk != CHAIN_END) {
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
zap_leaf_chunk_t *c = &ZAP_LEAF_CHUNK(l, chunk);
ASSERT3U(c->l_array.la_type, ==, ZAP_CHUNK_ARRAY);
*tailp = chunk;
chunk = c->l_array.la_next;
c->l_free.lf_type = ZAP_CHUNK_FREE;
memset(c->l_free.lf_pad, 0, sizeof (c->l_free.lf_pad));
tailp = &c->l_free.lf_next;
ASSERT3U(hdr->lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
hdr->lh_nfree++;
}
*tailp = oldfree;
}
/* array_len and buf_len are in integers, not bytes */
@ -515,7 +558,7 @@ zap_entry_update(zap_entry_handle_t *zeh,
if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
return (SET_ERROR(EAGAIN));
zap_leaf_array_free(l, &le->le_value_chunk);
zap_leaf_array_free(l, le->le_value_chunk);
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
le->le_value_numints = num_integers;
@ -534,10 +577,11 @@ zap_entry_remove(zap_entry_handle_t *zeh)
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
zap_leaf_array_free(l, &le->le_name_chunk);
zap_leaf_array_free(l, &le->le_value_chunk);
*zeh->zeh_chunkp = le->le_next;
/* Free in opposite order to reduce fragmentation. */
zap_leaf_array_free(l, le->le_value_chunk);
zap_leaf_array_free(l, le->le_name_chunk);
zap_leaf_chunk_free(l, entry_chunk);
zap_leaf_phys(l)->l_hdr.lh_nentries--;
@ -701,34 +745,6 @@ zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry)
return (chunkp);
}
static uint16_t
zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
{
uint16_t new_chunk;
uint16_t *nchunkp = &new_chunk;
while (chunk != CHAIN_END) {
uint16_t nchunk = zap_leaf_chunk_alloc(nl);
struct zap_leaf_array *nla =
&ZAP_LEAF_CHUNK(nl, nchunk).l_array;
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
uint_t nextchunk = la->la_next;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
*nla = *la; /* structure assignment */
zap_leaf_chunk_free(l, chunk);
chunk = nextchunk;
*nchunkp = nchunk;
nchunkp = &nla->la_next;
}
*nchunkp = CHAIN_END;
return (new_chunk);
}
static void
zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl)
{
@ -741,10 +757,12 @@ zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl)
(void) zap_leaf_rehash_entry(nl, nle, chunk);
nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
nle->le_value_chunk =
zap_leaf_transfer_array(l, le->le_value_chunk, nl);
nle->le_name_chunk = zap_leaf_array_copy(l, le->le_name_chunk, nl);
nle->le_value_chunk = zap_leaf_array_copy(l, le->le_value_chunk, nl);
/* Free in opposite order to reduce fragmentation. */
zap_leaf_array_free(l, le->le_value_chunk);
zap_leaf_array_free(l, le->le_name_chunk);
zap_leaf_chunk_free(l, entry);
zap_leaf_phys(l)->l_hdr.lh_nentries--;
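
The new zap_leaf_array_free() above splices the whole chunk chain onto the head of the freelist with a tail pointer, so chunks keep their traversal order instead of being reversed by one-at-a-time pushes; that ordering, plus freeing the value array before the name array, is what reduces fragmentation. A userspace model of the splice (illustrative names, not zap_leaf.c's):

        #include <stdio.h>

        struct chunk {
                int id;
                struct chunk *next;
        };

        static struct chunk *freelist;

        static void
        free_chain_in_order(struct chunk *chain)
        {
                struct chunk **tailp = &freelist;
                struct chunk *oldfree = freelist;

                while (chain != NULL) {
                        struct chunk *next = chain->next;

                        *tailp = chain;         /* append, keep order */
                        tailp = &chain->next;
                        chain = next;
                }
                *tailp = oldfree;       /* old freelist follows the chain */
        }

        int
        main(void)
        {
                struct chunk c3 = { 3, NULL }, c2 = { 2, &c3 },
                    c1 = { 1, &c2 };

                free_chain_in_order(&c1);
                for (struct chunk *c = freelist; c != NULL; c = c->next)
                        printf("%d ", c->id);   /* prints: 1 2 3 */
                printf("\n");
                return (0);
        }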


@ -1227,6 +1227,21 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
return (err);
}
static int
zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
{
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
fzap_prefetch(zn);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (0);
}
int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints)
@ -1237,13 +1252,37 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_prefetch_uint64_impl(zap, key, key_numints);
/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
return (err);
}
int
zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
{
zap_t *zap;
int err =
zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_prefetch_uint64_impl(zap, key, key_numints);
/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
return (err);
}
static int
zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
fzap_prefetch(zn);
int err = fzap_lookup(zn, integer_size, num_integers, buf,
NULL, 0, NULL);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
@ -1259,16 +1298,25 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
num_integers, buf);
/* zap_lookup_uint64_impl() calls zap_unlockdir() */
return (err);
}
err = fzap_lookup(zn, integer_size, num_integers, buf,
NULL, 0, NULL);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
int
zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
zap_t *zap;
int err =
zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
num_integers, buf);
/* zap_lookup_uint64_impl() calls zap_unlockdir() */
return (err);
}


@ -2192,31 +2192,20 @@ zio_delay_interrupt(zio_t *zio)
} else {
taskqid_t tid;
hrtime_t diff = zio->io_target_timestamp - now;
clock_t expire_at_tick = ddi_get_lbolt() +
NSEC_TO_TICK(diff);
int ticks = MAX(1, NSEC_TO_TICK(diff));
clock_t expire_at_tick = ddi_get_lbolt() + ticks;
DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
hrtime_t, now, hrtime_t, diff);
if (NSEC_TO_TICK(diff) == 0) {
/* Our delay is less than a jiffy - just spin */
zfs_sleep_until(zio->io_target_timestamp);
zio_interrupt(zio);
} else {
tid = taskq_dispatch_delay(system_taskq, zio_interrupt,
zio, TQ_NOSLEEP, expire_at_tick);
if (tid == TASKQID_INVALID) {
/*
* Use taskq_dispatch_delay() in the place of
* OpenZFS's timeout_generic().
* Couldn't allocate a task. Just finish the
* zio without a delay.
*/
tid = taskq_dispatch_delay(system_taskq,
zio_interrupt, zio, TQ_NOSLEEP,
expire_at_tick);
if (tid == TASKQID_INVALID) {
/*
* Couldn't allocate a task. Just
* finish the zio without a delay.
*/
zio_interrupt(zio);
}
zio_interrupt(zio);
}
}
return;
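
The restructured path above stops spinning when the residual delay is under one scheduler tick: it clamps the tick count to at least 1 and always defers through taskq_dispatch_delay(). A quick arithmetic check; the hz value and the NSEC_TO_TICK definition below are assumptions for illustration:

        #include <stdio.h>

        #define NANOSEC 1000000000LL
        #define hz      1000            /* assumed tick rate */
        #define NSEC_TO_TICK(ns)  ((ns) / (NANOSEC / hz))
        #define MAX(a, b)         ((a) > (b) ? (a) : (b))

        int
        main(void)
        {
                long long diff = 300000; /* 300 us left until target */

                /* Old code spun when this was 0; now it is 1 tick. */
                printf("raw=%lld clamped=%lld\n", NSEC_TO_TICK(diff),
                    (long long)MAX(1, NSEC_TO_TICK(diff)));
                return (0);     /* prints: raw=0 clamped=1 */
        }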


@ -160,6 +160,12 @@ abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
abd_fletcher_4_impl(abd, size, &acd);
}
/*
* Checksum vectors.
*
* Note: you cannot change the name string for these functions, as they are
* embedded in on-disk data in some places (eg dedup table names).
*/
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{NULL, NULL}, NULL, NULL, 0, "inherit"},
{{NULL, NULL}, NULL, NULL, 0, "on"},


@ -44,10 +44,6 @@ static unsigned long zio_decompress_fail_fraction = 0;
/*
* Compression vectors.
*
* NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS.
* THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE
* PART OF THE ON-DISK FORMAT.
*/
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{"inherit", 0, NULL, NULL, NULL},


@ -32,6 +32,7 @@ Requires(post): gcc, make, perl, diffutils
%if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version}%{?openEuler}
Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999
Requires(post): kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999
Conflicts: kernel-devel < @ZFS_META_KVER_MIN@, kernel-devel > @ZFS_META_KVER_MAX@.999
Obsoletes: spl-dkms <= %{version}
%endif
Provides: %{module}-kmod = %{version}


@ -19,9 +19,13 @@
*/
#include <sys/ioctl.h>
#ifdef _KERNEL
#include <sys/fcntl.h>
#else
#include <fcntl.h>
#endif
#include <linux/fs.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>


@ -41,9 +41,11 @@ log_must zfs set compress=zle $TESTDSTFS
for prop in "${sync_prop_vals[@]}"; do
log_must zfs set sync=$prop $TESTSRCFS
# 15*8=120, which is greater than 113, so we are sure the data won't
# be embedded into BP.
# 32767*8=262136, which is larger than a single default recordsize of
# 131072.
FILESIZE=$(random_int_between 1 32767)
FILESIZE=$(random_int_between 15 32767)
FILESIZE=$((FILESIZE * 8))
bclone_test random $FILESIZE false $TESTSRCDIR $TESTSRCDIR
done
@ -52,9 +54,11 @@ for srcprop in "${sync_prop_vals[@]}"; do
log_must zfs set sync=$srcprop $TESTSRCFS
for dstprop in "${sync_prop_vals[@]}"; do
log_must zfs set sync=$dstprop $TESTDSTFS
# 15*8=120, which is greater than 113, so we are sure the data won't
# be embedded into BP.
# 32767*8=262136, which is larger than a single default recordsize of
# 131072.
FILESIZE=$(random_int_between 1 32767)
FILESIZE=$(random_int_between 15 32767)
FILESIZE=$((FILESIZE * 8))
bclone_test random $FILESIZE false $TESTSRCDIR $TESTDSTDIR
done


@ -69,15 +69,16 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do
log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"
# Check no ONLINE slow vdevs are show. Then mark IOs greater than
# 160ms slow, delay IOs 320ms to vdev6, check slow IOs.
# 750ms slow, delay IOs 1000ms to vdev6, check slow IOs.
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"
log_must set_tunable64 ZIO_SLOW_IO_MS 160
log_must zinject -d $TESTDIR/vdev6 -D320:100 $TESTPOOL2
log_must set_tunable64 ZIO_SLOW_IO_MS 750
log_must zinject -d $TESTDIR/vdev6 -D1000:100 $TESTPOOL2
log_must mkfile 1048576 /$TESTPOOL2/testfile
sync_pool $TESTPOOL2
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must zinject -c all
# Check vdev6 slow IOs are only shown when requested with -s.
log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
@ -95,10 +96,9 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE"
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE"
log_must zinject -c all
log_must zpool status -es $TESTPOOL2
zpool destroy $TESTPOOL2
log_must zpool destroy $TESTPOOL2
done
log_pass "Verify zpool status -e shows only unhealthy vdevs"


@ -792,7 +792,7 @@
/* #undef ZFS_DEVICE_MINOR */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.3.99-64-FreeBSD_g1c9a4c8cb"
#define ZFS_META_ALIAS "zfs-2.3.99-92-FreeBSD_gd0a91b9f8"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -801,7 +801,7 @@
/* #undef ZFS_META_DATA */
/* Define the maximum compatible kernel version. */
#define ZFS_META_KVER_MAX "6.11"
#define ZFS_META_KVER_MAX "6.12"
/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "4.18"
@ -822,7 +822,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "64-FreeBSD_g1c9a4c8cb"
#define ZFS_META_RELEASE "92-FreeBSD_gd0a91b9f8"
/* Define the project version. */
#define ZFS_META_VERSION "2.3.99"


@ -1 +1 @@
#define ZFS_META_GITREV "zfs-2.3.99-64-g1c9a4c8cb"
#define ZFS_META_GITREV "zfs-2.3.99-92-gd0a91b9f8"