Skip to content

Commit 9250403

Browse files
pcd1193182Paul Dagnelie
and
Paul Dagnelie
authored
Make ganging redundancy respect redundant_metadata property (#17073)
The redundant_metadata setting in ZFS allows users to trade resilience for performance and space savings. This applies to all data and metadata blocks in ZFS, with one exception: gang blocks. Gang blocks currently just take the copies property of the IO being ganged and, if it's 1, set it to 2. This means that we always make at least two copies of a gang header, which is good for resilience. However, if users care more about performance than resilience, their gang blocks will be even more of a penalty than usual. We add logic to calculate the number of gang header copies directly, and store it as a separate IO property. This is stored in the IO properties and not calculated when we decide to gang because by that point we may not have easy access to the relevant information about what kind of block is being stored. We also check the redundant_metadata property when doing so, and use that to decide whether to store an extra copy of the gang headers, compared to the underlying blocks. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Paul Dagnelie <[email protected]> Co-authored-by: Paul Dagnelie <[email protected]> Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Tony Hutter <[email protected]>
1 parent 94b9cbb commit 9250403

File tree

15 files changed

+327
-20
lines changed

15 files changed

+327
-20
lines changed

cmd/zdb/zdb.c

+7-5
Original file line numberDiff line numberDiff line change
@@ -2545,12 +2545,14 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
25452545

25462546
blkbuf[0] = '\0';
25472547

2548-
for (i = 0; i < ndvas; i++)
2548+
for (i = 0; i < ndvas; i++) {
25492549
(void) snprintf(blkbuf + strlen(blkbuf),
2550-
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
2550+
buflen - strlen(blkbuf), "%llu:%llx:%llx%s ",
25512551
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
25522552
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
2553-
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
2553+
(u_longlong_t)DVA_GET_ASIZE(&dva[i]),
2554+
(DVA_GET_GANG(&dva[i]) ? "G" : ""));
2555+
}
25542556

25552557
if (BP_IS_HOLE(bp)) {
25562558
(void) snprintf(blkbuf + strlen(blkbuf),
@@ -8981,7 +8983,7 @@ zdb_read_block(char *thing, spa_t *spa)
89818983

89828984
DVA_SET_VDEV(&dva[0], vd->vdev_id);
89838985
DVA_SET_OFFSET(&dva[0], offset);
8984-
DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
8986+
DVA_SET_GANG(&dva[0], 0);
89858987
DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
89868988

89878989
BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
@@ -8996,7 +8998,7 @@ zdb_read_block(char *thing, spa_t *spa)
89968998
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
89978999

89989000
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8999-
zio = zio_root(spa, NULL, NULL, 0);
9001+
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
90009002

90019003
if (vd == vd->vdev_top) {
90029004
/*

include/sys/dbuf.h

+1
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ typedef struct dbuf_dirty_record {
174174
arc_buf_t *dr_data;
175175
override_states_t dr_override_state;
176176
uint8_t dr_copies;
177+
uint8_t dr_gang_copies;
177178
boolean_t dr_nopwrite;
178179
boolean_t dr_brtwrite;
179180
boolean_t dr_diowrite;

include/sys/zio.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ typedef struct zio_prop {
350350
uint8_t zp_complevel;
351351
uint8_t zp_level;
352352
uint8_t zp_copies;
353+
uint8_t zp_gang_copies;
353354
dmu_object_type_t zp_type;
354355
boolean_t zp_dedup;
355356
boolean_t zp_dedup_verify;
@@ -575,7 +576,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
575576
zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);
576577

577578
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
578-
boolean_t nopwrite, boolean_t brtwrite);
579+
int gang_copies, boolean_t nopwrite, boolean_t brtwrite);
579580

580581
extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
581582

module/zfs/arc.c

+2
Original file line numberDiff line numberDiff line change
@@ -6887,6 +6887,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
68876887
localprop.zp_nopwrite = B_FALSE;
68886888
localprop.zp_copies =
68896889
MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
6890+
localprop.zp_gang_copies =
6891+
MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
68906892
}
68916893
zio_flags |= ZIO_FLAG_RAW;
68926894
} else if (ARC_BUF_COMPRESSED(buf)) {

module/zfs/dbuf.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -5352,8 +5352,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
53525352
mutex_enter(&db->db_mtx);
53535353
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
53545354
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
5355-
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
5356-
dr->dt.dl.dr_brtwrite);
5355+
dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
5356+
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
53575357
mutex_exit(&db->db_mtx);
53585358
} else if (data == NULL) {
53595359
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||

module/zfs/dmu.c

+20-1
Original file line numberDiff line numberDiff line change
@@ -1916,6 +1916,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
19161916
dr->dt.dl.dr_overridden_by = *zio->io_bp;
19171917
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
19181918
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1919+
dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;
19191920

19201921
/*
19211922
* Old style holes are filled with all zeros, whereas
@@ -2322,6 +2323,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
23222323
boolean_t dedup_verify = os->os_dedup_verify;
23232324
boolean_t encrypt = B_FALSE;
23242325
int copies = os->os_copies;
2326+
int gang_copies = os->os_copies;
23252327

23262328
/*
23272329
* We maintain different write policies for each of the following
@@ -2354,15 +2356,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
23542356
switch (os->os_redundant_metadata) {
23552357
case ZFS_REDUNDANT_METADATA_ALL:
23562358
copies++;
2359+
gang_copies++;
23572360
break;
23582361
case ZFS_REDUNDANT_METADATA_MOST:
23592362
if (level >= zfs_redundant_metadata_most_ditto_level ||
23602363
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
23612364
copies++;
2365+
if (level + 1 >=
2366+
zfs_redundant_metadata_most_ditto_level ||
2367+
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
2368+
gang_copies++;
23622369
break;
23632370
case ZFS_REDUNDANT_METADATA_SOME:
2364-
if (DMU_OT_IS_CRITICAL(type))
2371+
if (DMU_OT_IS_CRITICAL(type)) {
23652372
copies++;
2373+
gang_copies++;
2374+
} else if (DMU_OT_IS_METADATA(type)) {
2375+
gang_copies++;
2376+
}
23662377
break;
23672378
case ZFS_REDUNDANT_METADATA_NONE:
23682379
break;
@@ -2436,6 +2447,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
24362447
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
24372448
ZCHECKSUM_FLAG_NOPWRITE) &&
24382449
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
2450+
2451+
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
2452+
(os->os_redundant_metadata ==
2453+
ZFS_REDUNDANT_METADATA_MOST &&
2454+
zfs_redundant_metadata_most_ditto_level <= 1))
2455+
gang_copies++;
24392456
}
24402457

24412458
/*
@@ -2452,6 +2469,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
24522469

24532470
if (DMU_OT_IS_ENCRYPTED(type)) {
24542471
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
2472+
gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
24552473
nopwrite = B_FALSE;
24562474
} else {
24572475
dedup = B_FALSE;
@@ -2469,6 +2487,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
24692487
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
24702488
zp->zp_level = level;
24712489
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
2490+
zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
24722491
zp->zp_dedup = dedup;
24732492
zp->zp_dedup_verify = dedup && dedup_verify;
24742493
zp->zp_nopwrite = nopwrite;

module/zfs/dmu_recv.c

+3
Original file line numberDiff line numberDiff line change
@@ -2300,6 +2300,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
23002300
zp.zp_nopwrite = B_FALSE;
23012301
zp.zp_copies = MIN(zp.zp_copies,
23022302
SPA_DVAS_PER_BP - 1);
2303+
zp.zp_gang_copies =
2304+
MIN(zp.zp_gang_copies,
2305+
SPA_DVAS_PER_BP - 1);
23032306
}
23042307
zio_flags |= ZIO_FLAG_RAW;
23052308
} else if (DRR_WRITE_COMPRESSED(drrw)) {

module/zfs/zio.c

+12-11
Original file line numberDiff line numberDiff line change
@@ -1415,8 +1415,8 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
14151415
}
14161416

14171417
void
1418-
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
1419-
boolean_t brtwrite)
1418+
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
1419+
boolean_t nopwrite, boolean_t brtwrite)
14201420
{
14211421
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
14221422
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
@@ -1433,6 +1433,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
14331433
zio->io_prop.zp_nopwrite = nopwrite;
14341434
zio->io_prop.zp_brtwrite = brtwrite;
14351435
zio->io_prop.zp_copies = copies;
1436+
zio->io_prop.zp_gang_copies = gang_copies;
14361437
zio->io_bp_override = bp;
14371438
}
14381439

@@ -3140,15 +3141,13 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
31403141
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
31413142

31423143
/*
3143-
* If one copy was requested, store 2 copies of the GBH, so that we
3144-
* can still traverse all the data (e.g. to free or scrub) even if a
3145-
* block is damaged. Note that we can't store 3 copies of the GBH in
3146-
* all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
3144+
* Store multiple copies of the GBH, so that we can still traverse
3145+
* all the data (e.g. to free or scrub) even if a block is damaged.
3146+
* This value respects the redundant_metadata property.
31473147
*/
3148-
int gbh_copies = copies;
3149-
if (gbh_copies == 1) {
3150-
gbh_copies = MIN(2, spa_max_replication(spa));
3151-
}
3148+
int gbh_copies = gio->io_prop.zp_gang_copies;
3149+
ASSERT3S(gbh_copies, >, 0);
3150+
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
31523151

31533152
ASSERT(ZIO_HAS_ALLOCATOR(pio));
31543153
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
@@ -3168,6 +3167,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
31683167
* since metaslab_class_throttle_reserve() always allows
31693168
* additional reservations for gang blocks.
31703169
*/
3170+
ASSERT3U(gbh_copies, >=, copies);
31713171
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
31723172
pio->io_allocator, pio, flags));
31733173
}
@@ -3230,6 +3230,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
32303230
zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
32313231
zp.zp_level = 0;
32323232
zp.zp_copies = gio->io_prop.zp_copies;
3233+
zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
32333234
zp.zp_dedup = B_FALSE;
32343235
zp.zp_dedup_verify = B_FALSE;
32353236
zp.zp_nopwrite = B_FALSE;
@@ -3950,7 +3951,7 @@ zio_ddt_write(zio_t *zio)
39503951
* grow the DDT entry by to satisfy the request.
39513952
*/
39523953
zio_prop_t czp = *zp;
3953-
czp.zp_copies = need_dvas;
3954+
czp.zp_copies = czp.zp_gang_copies = need_dvas;
39543955
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
39553956
zio->io_orig_size, zio->io_orig_size, &czp,
39563957
zio_ddt_child_write_ready, NULL,

tests/runfiles/common.run

+4
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,10 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
724724
'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos']
725725
tags = ['functional', 'features', 'large_dnode']
726726

727+
[tests/functional/gang_blocks]
728+
tests = ['gang_blocks_redundant']
729+
tags = ['functional', 'gang_blocks']
730+
727731
[tests/functional/grow]
728732
pre =
729733
post =

tests/zfs-tests/include/tunables.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting
6464
MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds
6565
METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load
6666
METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging
67+
METASLAB_FORCE_GANGING_PCT metaslab.force_ganging_pct metaslab_force_ganging_pct
6768
MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals
6869
MULTIHOST_HISTORY multihost.history zfs_multihost_history
6970
MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals

tests/zfs-tests/tests/Makefile.am

+4
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
275275
functional/events/events.cfg \
276276
functional/events/events_common.kshlib \
277277
functional/fault/fault.cfg \
278+
functional/gang_blocks/gang_blocks.kshlib \
278279
functional/grow/grow.cfg \
279280
functional/history/history.cfg \
280281
functional/history/history_common.kshlib \
@@ -1558,6 +1559,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
15581559
functional/features/large_dnode/large_dnode_008_pos.ksh \
15591560
functional/features/large_dnode/large_dnode_009_pos.ksh \
15601561
functional/features/large_dnode/setup.ksh \
1562+
functional/gang_blocks/cleanup.ksh \
1563+
functional/gang_blocks/gang_blocks_redundant.ksh \
1564+
functional/gang_blocks/setup.ksh \
15611565
functional/grow/grow_pool_001_pos.ksh \
15621566
functional/grow/grow_replicas_001_pos.ksh \
15631567
functional/history/cleanup.ksh \
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/ksh -p
2+
#
3+
# CDDL HEADER START
4+
#
5+
# The contents of this file are subject to the terms of the
6+
# Common Development and Distribution License (the "License").
7+
# You may not use this file except in compliance with the License.
8+
#
9+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10+
# or https://opensource.org/licenses/CDDL-1.0.
11+
# See the License for the specific language governing permissions
12+
# and limitations under the License.
13+
#
14+
# When distributing Covered Code, include this CDDL HEADER in each
15+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16+
# If applicable, add the following below this CDDL HEADER, with the
17+
# fields enclosed by brackets "[]" replaced with your own identifying
18+
# information: Portions Copyright [yyyy] [name of copyright owner]
19+
#
20+
# CDDL HEADER END
21+
#
22+
23+
#
24+
# Copyright (c) 2025 by Klara Inc.
25+
#
26+
27+
. $STF_SUITE/include/libtest.shlib
28+
29+
restore_tunable METASLAB_FORCE_GANGING
30+
restore_tunable METASLAB_FORCE_GANGING_PCT
31+
default_cleanup

0 commit comments

Comments
 (0)