diff --git a/cmd/ztest.c b/cmd/ztest.c
index d3bbef245831..9191c9d70e8a 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -1993,7 +1993,8 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
 	if (write_state == WR_COPIED &&
 	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
-	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH |
+	    DMU_KEEP_CACHING) != 0) {
 		zil_itx_destroy(itx);
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 		write_state = WR_NEED_COPY;
@@ -2265,19 +2266,19 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 	ASSERT(doi.doi_data_block_size);
 	ASSERT0(offset % doi.doi_data_block_size);
 	if (ztest_random(4) != 0) {
-		int prefetch = ztest_random(2) ?
+		dmu_flags_t flags = ztest_random(2) ?
 		    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
 		/*
 		 * We will randomly set when to do O_DIRECT on a read.
 		 */
 		if (ztest_random(4) == 0)
-			prefetch |= DMU_DIRECTIO;
+			flags |= DMU_DIRECTIO;
 		ztest_block_tag_t rbt;
 		VERIFY(dmu_read(os, lr->lr_foid, offset,
-		    sizeof (rbt), &rbt, prefetch) == 0);
+		    sizeof (rbt), &rbt, flags) == 0);
 		if (rbt.bt_magic == BT_MAGIC) {
 			ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
 			    offset, gen, txg, crtxg);
@@ -2308,7 +2309,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 		dmu_write(os, lr->lr_foid, offset, length, data, tx);
 	} else {
 		memcpy(abuf->b_data, data, length);
-		VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx));
+		VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0));
 	}
 	(void) ztest_log_write(zd, tx, lr);
@@ -2533,7 +2534,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
 		    object, offset, size, ZTRL_READER);
 		error = dmu_read(os, object, offset, size, buf,
-		    DMU_READ_NO_PREFETCH);
+		    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
 		ASSERT0(error);
 	} else {
 		ASSERT3P(zio, !=, NULL);
@@ -2549,7 +2550,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
 		    object, offset, size, ZTRL_READER);
 		error = dmu_buf_hold_noread(os, object, offset, zgd, &db);
-
 		if (error == 0) {
 			blkptr_t *bp = &lr->lr_blkptr;
@@ -2826,7 +2826,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
 	enum ztest_io_type io_type;
 	uint64_t blocksize;
 	void *data;
-	uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH;
+	dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH;
 	/*
 	 * We will randomly set when to do O_DIRECT on a read.
@@ -5065,7 +5065,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 40;
 	int free_percent = 5;
-	uint32_t dmu_read_flags = DMU_READ_PREFETCH;
+	dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH;
 	/*
 	 * We will randomly set when to do O_DIRECT on a read.
@@ -5541,13 +5541,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
 			}
 			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
 				VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
-				    off, bigbuf_arcbufs[j], tx));
+				    off, bigbuf_arcbufs[j], tx, 0));
 			} else {
 				VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
-				    off, bigbuf_arcbufs[2 * j], tx));
+				    off, bigbuf_arcbufs[2 * j], tx, 0));
 				VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
 				    off + chunksize / 2,
-				    bigbuf_arcbufs[2 * j + 1], tx));
+				    bigbuf_arcbufs[2 * j + 1], tx, 0));
 			}
 			if (i == 1) {
 				dmu_buf_rele(dbt, FTAG);
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 285e02484c57..d2e9bf33c0e6 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -45,20 +45,6 @@ extern "C" {
 #define	IN_DMU_SYNC 2
-/*
- * define flags for dbuf_read
- */
-
-#define	DB_RF_MUST_SUCCEED	(1 << 0)
-#define	DB_RF_CANFAIL		(1 << 1)
-#define	DB_RF_HAVESTRUCT	(1 << 2)
-#define	DB_RF_NOPREFETCH	(1 << 3)
-#define	DB_RF_NEVERWAIT		(1 << 4)
-#define	DB_RF_CACHED		(1 << 5)
-#define	DB_RF_NO_DECRYPT	(1 << 6)
-#define	DB_RF_PARTIAL_FIRST	(1 << 7)
-#define	DB_RF_PARTIAL_MORE	(1 << 8)
-
 /*
  * The simplified state transition diagram for dbufs looks like:
  *
@@ -389,12 +375,15 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag,
 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
     uint64_t blkid, uint64_t *hash_out);
-int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, dmu_flags_t flags);
 void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
+void dmu_buf_will_fill_flags(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail,
+    dmu_flags_t flags);
 boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
-void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
+void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
+    dmu_flags_t flags);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
     dmu_tx_t *tx);
@@ -476,10 +465,10 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
 #define	DBUF_GET_BUFC_TYPE(_db)	\
 	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
-#define	DBUF_IS_CACHEABLE(_db)	\
+#define	DBUF_IS_CACHEABLE(_db)	(!(_db)->db_pending_evict &&	\
 	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||	\
 	(dbuf_is_metadata(_db) &&	\
-	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))))
 boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp);
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index da1fdfd23962..7871eacc2e18 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -532,6 +532,26 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
     struct zio_prop *zp);
+/*
+ * DB_RF_* are to be used for dbuf_read() or in limited other cases.
+ */
+typedef enum dmu_flags {
+	DB_RF_MUST_SUCCEED	= 0,		/* Suspend on I/O errors. */
+	DB_RF_CANFAIL		= 1 << 0,	/* Return on I/O errors. */
+	DB_RF_HAVESTRUCT	= 1 << 1,	/* dn_struct_rwlock is locked. */
+	DB_RF_NEVERWAIT		= 1 << 2,
+	DMU_READ_PREFETCH	= 0,		/* Try speculative prefetch. */
+	DMU_READ_NO_PREFETCH	= 1 << 3,	/* Don't prefetch speculatively. */
+	DB_RF_NOPREFETCH	= DMU_READ_NO_PREFETCH,
+	DMU_READ_NO_DECRYPT	= 1 << 4,	/* Don't decrypt. */
+	DB_RF_NO_DECRYPT	= DMU_READ_NO_DECRYPT,
+	DMU_DIRECTIO		= 1 << 5,	/* Bypass ARC. */
+	DMU_UNCACHEDIO		= 1 << 6,	/* Reduce caching. */
+	DMU_PARTIAL_FIRST	= 1 << 7,	/* First partial access. */
+	DMU_PARTIAL_MORE	= 1 << 8,	/* Following partial access. */
+	DMU_KEEP_CACHING	= 1 << 9,	/* Don't affect caching. */
+} dmu_flags_t;
+
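The typedef above is the heart of the change: one namespace now carries both the dbuf_read() control bits and the DMU I/O behavior bits, so callers can OR them freely. A minimal editorial sketch of a caller, not taken from the patch (hypothetical helper name; only the declarations above are assumed):

	/*
	 * Editorial sketch: a one-shot scan read that skips speculative
	 * prefetch and asks the dbuf layer to reduce caching of the
	 * blocks it touches.
	 */
	static int
	scan_read_uncached(objset_t *os, uint64_t object, uint64_t off,
	    uint64_t len, void *buf)
	{
		return (dmu_read(os, object, off, len, buf,
		    DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));
	}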
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -547,7 +567,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
 int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag,
     dmu_buf_t **dbp);
 int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
-    uint32_t flags);
+    dmu_flags_t flags);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
@@ -558,9 +578,9 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 /*
  * Special spill buffer support used by "SA" framework
  */
-int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
-    dmu_buf_t **dbp);
-int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags,
+    const void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags,
     const void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
@@ -579,17 +599,17 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    const void *tag, dmu_buf_t **, int flags);
+    const void *tag, dmu_buf_t **, dmu_flags_t flags);
 int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp);
 int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp);
 int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
-    const void *tag, dmu_buf_t **dbp, int flags);
+    const void *tag, dmu_buf_t **dbp, dmu_flags_t flags);
 int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
     uint64_t length, boolean_t read, const void *tag, int *numbufsp,
-    dmu_buf_t ***dbpp, uint32_t flags);
+    dmu_buf_t ***dbpp, dmu_flags_t flags);
 int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
     dmu_buf_t **dbp);
@@ -781,6 +801,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags);
 boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
@@ -874,40 +895,36 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
-#define	DMU_READ_PREFETCH	0 /* prefetch */
-#define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
-#define	DMU_READ_NO_DECRYPT	2 /* don't decrypt */
-#define	DMU_DIRECTIO		4 /* use Direct I/O */
-
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    void *buf, uint32_t flags);
+    void *buf, dmu_flags_t flags);
 int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
-    uint32_t flags);
+    dmu_flags_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx);
 int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
-    const void *buf, dmu_tx_t *tx);
-int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
-    const void *buf, dmu_tx_t *tx, uint32_t flags);
+    const void *buf, dmu_tx_t *tx, dmu_flags_t flags);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx);
 #ifdef _KERNEL
-int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size);
-int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size);
-int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size);
+int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags);
+int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags);
+int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags);
 int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
-    dmu_tx_t *tx);
+    dmu_tx_t *tx, dmu_flags_t flags);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
-    dmu_tx_t *tx);
+    dmu_tx_t *tx, dmu_flags_t flags);
 int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
-    dmu_tx_t *tx);
+    dmu_tx_t *tx, dmu_flags_t flags);
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
-    struct arc_buf *buf, dmu_tx_t *tx);
+    struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
 int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
-    struct arc_buf *buf, dmu_tx_t *tx);
+    struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
 #define	dmu_assign_arcbuf	dmu_assign_arcbuf_by_dbuf
 extern uint_t zfs_max_recordsize;
diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h
index dc2b66d06e7c..21a8b16a3ee6 100644
--- a/include/sys/dmu_impl.h
+++ b/include/sys/dmu_impl.h
@@ -270,11 +270,13 @@ void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
 void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
 int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *);
-int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags);
-int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *);
+int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t);
+int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t,
+    dmu_tx_t *);
 #if defined(_KERNEL)
-int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t);
-int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *);
+int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t);
+int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t,
+    dmu_tx_t *);
 #endif
 #ifdef __cplusplus
diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h
index 963e841a4882..a5ddd28026ce 100644
--- a/include/sys/dmu_zfetch.h
+++ b/include/sys/dmu_zfetch.h
@@ -81,9 +81,10 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
 void dmu_zfetch_fini(zfetch_t *);
 zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
     boolean_t);
-void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
+void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t, boolean_t);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
+    boolean_t, boolean_t);
 #ifdef __cplusplus
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 880f60471b65..7dbb37406079 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -981,9 +981,9 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
     uint64_t extents_skipped, uint64_t bytes_skipped,
     uint64_t extents_failed, uint64_t bytes_failed);
 extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
-    uint32_t flags);
+    dmu_flags_t flags);
 extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
-    uint32_t flags);
+    dmu_flags_t flags);
 extern void spa_import_progress_add(spa_t *spa);
 extern void spa_import_progress_remove(uint64_t spa_guid);
 extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
diff --git a/include/sys/zfs_racct.h b/include/sys/zfs_racct.h
index 939e8fa666e9..562029d4114d 100644
--- a/include/sys/zfs_racct.h
+++ b/include/sys/zfs_racct.h
@@ -33,7 +33,9 @@
 /*
  * Platform-dependent resource accounting hooks
  */
-void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
-void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
+void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops,
+    dmu_flags_t flags);
+void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops,
+    dmu_flags_t flags);
 #endif /* _SYS_ZFS_RACCT_H */
diff --git a/lib/libzpool/zfs_racct.c b/lib/libzpool/zfs_racct.c
index 0f80907a5fa8..5c9583581d94 100644
--- a/lib/libzpool/zfs_racct.c
+++ b/lib/libzpool/zfs_racct.c
@@ -27,13 +27,13 @@
 #include <sys/zfs_racct.h>
 void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	(void) spa, (void) size, (void) iops, (void) flags;
 }
 void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	(void) spa, (void) size, (void) iops, (void) flags;
 }
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index f5881d7faf59..b1a1bbc6ad85 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -304,7 +304,7 @@ Default dnode block size as a power of 2.
 .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int
 Default dnode indirect block size as a power of 2.
 .
-.It Sy zfs_dio_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int
+.It Sy zfs_dio_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable Direct I/O.
 If this setting is 0, then all I/O requests will be directed through the ARC
 acting as though the dataset property
 .Sy direct
 was set to
 .Sy disabled .
 .
+.It Sy zfs_dio_strict Ns = Ns Sy 0 Ns | Ns 1 Pq int
+Strictly enforce alignment for Direct I/O requests, returning
+.Sy EINVAL
+if not page-aligned instead of silently falling back to uncached I/O.
+.
 .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
 When attempting to log an output nvlist of an ioctl in the on-disk history,
 the output will not be stored if it is larger than this size (in bytes).
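The zfs_dio_strict tunable documented above only changes the failure mode for misaligned O_DIRECT requests; aligned requests behave the same either way. A hedged userspace illustration, not from the patch (hypothetical file path; 4 KiB pages assumed):

	/* Editorial sketch: zfs_dio_strict and misaligned O_DIRECT. */
	#define _GNU_SOURCE
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int
	main(void)
	{
		void *buf;
		/* Hypothetical path on a dataset with direct=standard. */
		int fd = open("/tank/f", O_WRONLY | O_CREAT | O_DIRECT, 0644);

		if (fd == -1 || posix_memalign(&buf, 4096, 4096) != 0)
			return (1);

		/* Page-aligned buffer, offset, and length: true Direct I/O. */
		if (pwrite(fd, buf, 4096, 0) == -1)
			perror("aligned");

		/*
		 * Misaligned length: with zfs_dio_strict=0 this silently
		 * falls back to uncached I/O through the ARC; with
		 * zfs_dio_strict=1 it fails with EINVAL.
		 */
		if (pwrite(fd, buf, 100, 0) == -1 && errno == EINVAL)
			fprintf(stderr, "strict alignment enforced\n");

		free(buf);
		return (close(fd));
	}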
diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c
index d7c9be70ad4a..364bbfc60abd 100644
--- a/module/os/freebsd/zfs/dmu_os.c
+++ b/module/os/freebsd/zfs/dmu_os.c
@@ -41,7 +41,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -71,6 +70,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	struct sf_buf *sf;
 	int numbufs, i;
 	int err;
+	dmu_flags_t flags = 0;
 	if (size == 0)
 		return (0);
@@ -94,10 +94,17 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-		if (tocpy == db->db_size)
+		if (tocpy == db->db_size) {
 			dmu_buf_will_fill(db, tx, B_FALSE);
-		else
-			dmu_buf_will_dirty(db, tx);
+		} else {
+			if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
+				if (bufoff == 0)
+					flags |= DMU_PARTIAL_FIRST;
+				else
+					flags |= DMU_PARTIAL_MORE;
+			}
+			dmu_buf_will_dirty_flags(db, tx, flags);
+		}
 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
 			ASSERT3U(ptoa((*ma)->pindex), ==,
diff --git a/module/os/freebsd/zfs/zfs_racct.c b/module/os/freebsd/zfs/zfs_racct.c
index bdbbdacd984e..50d1cbf53afc 100644
--- a/module/os/freebsd/zfs/zfs_racct.c
+++ b/module/os/freebsd/zfs/zfs_racct.c
@@ -28,7 +28,7 @@
 #include
 void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	curthread->td_ru.ru_inblock += iops;
 #ifdef RACCT
@@ -46,7 +46,7 @@ zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
 }
 void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	curthread->td_ru.ru_oublock += iops;
 #ifdef RACCT
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index b2080a48c4ad..570b32619214 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -518,7 +518,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 			page_unhold(pp);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
-			    uio, bytes);
+			    uio, bytes, DMU_READ_PREFETCH);
 		}
 		len -= bytes;
 		off = 0;
diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index d18ea9d59fa3..ea417bbe6605 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -752,7 +752,7 @@ zvol_geom_bio_strategy(struct bio *bp)
 	while (resid != 0 && off < volsize) {
 		size_t size = MIN(resid, zvol_maxphys);
 		if (doread) {
-			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+			error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
 			    DMU_READ_PREFETCH);
 		} else {
 			dmu_tx_t *tx = dmu_tx_create(os);
@@ -761,7 +761,8 @@ zvol_geom_bio_strategy(struct bio *bp)
 			if (error) {
 				dmu_tx_abort(tx);
 			} else {
-				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+				dmu_write_by_dnode(zv->zv_dn, off, size, addr,
+				    tx, DMU_READ_PREFETCH);
 				zvol_log_write(zv, tx, off, size, commit);
 				dmu_tx_commit(tx);
 			}
@@ -850,7 +851,8 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
 		if (bytes > volsize - zfs_uio_offset(&uio))
 			bytes = volsize - zfs_uio_offset(&uio);
-		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
+		    DMU_READ_PREFETCH);
 		if (error) {
 			/* Convert checksum errors into IO errors. */
 			if (error == ECKSUM)
@@ -909,7 +911,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
 			dmu_tx_abort(tx);
 			break;
 		}
-		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+		    DMU_READ_PREFETCH);
 		if (error == 0)
 			zvol_log_write(zv, tx, off, bytes, commit);
 		dmu_tx_commit(tx);
diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c
index 4dbd6a28b594..18c5d67f9e32 100644
--- a/module/os/linux/zfs/zfs_racct.c
+++ b/module/os/linux/zfs/zfs_racct.c
@@ -30,14 +30,14 @@
 #include
 void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	task_io_account_read(size);
 	spa_iostats_read_add(spa, size, iops, flags);
 }
 void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	task_io_account_write(size);
 	spa_iostats_write_add(spa, size, iops, flags);
@@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
 #else
 void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	(void) spa, (void) size, (void) iops, (void) flags;
 }
 void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
 	(void) spa, (void) size, (void) iops, (void) flags;
 }
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 9ceb6cb8dbdd..0138afb14f95 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -329,7 +329,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 			put_page(pp);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
-			    uio, bytes);
+			    uio, bytes, DMU_READ_PREFETCH);
 		}
 		len -= bytes;
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index c8a04539258f..00dda4934369 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -305,7 +305,8 @@ zvol_write(zv_request_t *zvr)
 			dmu_tx_abort(tx);
 			break;
 		}
-		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+		    DMU_READ_PREFETCH);
 		if (error == 0) {
 			zvol_log_write(zv, tx, off, bytes, sync);
 		}
@@ -475,7 +476,8 @@ zvol_read(zv_request_t *zvr)
 		if (bytes > volsize - uio.uio_loffset)
 			bytes = volsize - uio.uio_loffset;
-		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
+		    DMU_READ_PREFETCH);
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 4b09dd61215f..ddb5b2686568 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -6103,7 +6103,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 		ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
 		    metadata, misses);
-		zfs_racct_read(spa, size, 1, 0);
+		zfs_racct_read(spa, size, 1,
+		    (*arc_flags & ARC_FLAG_UNCACHED) ?
+		    DMU_UNCACHEDIO : 0);
 	}
 	/* Check if the spa even has l2 configured */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 4cf6c45400a1..aa8f12f33ea3 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1522,7 +1522,8 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
  */
 static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
+    dmu_flags_t flags)
 {
 	objset_t *os = db->db_objset;
 	dmu_buf_impl_t *dndb;
@@ -1530,7 +1531,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 	zbookmark_phys_t zb;
 	int err;
-	if ((flags & DB_RF_NO_DECRYPT) != 0 ||
+	if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
 	    !os->os_encrypted || os->os_raw_receive ||
 	    (dndb = dn->dn_dbuf) == NULL)
 		return (0);
@@ -1584,7 +1585,7 @@
  * returning.
  */
 static int
-dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
+dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
     db_lock_type_t dblt, blkptr_t *bp, const void *tag)
 {
 	zbookmark_phys_t zb;
@@ -1650,7 +1651,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	zio_flags = (flags & DB_RF_CANFAIL) ?
 	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
-	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+	if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 		zio_flags |= ZIO_FLAG_RAW;
 	/*
@@ -1751,7 +1752,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 }
 int
-dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
+dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
 {
 	dnode_t *dn;
 	boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
@@ -1771,12 +1772,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		goto done;
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-	    (flags & DB_RF_NOPREFETCH) == 0;
+	    (flags & DMU_READ_NO_PREFETCH) == 0;
 	mutex_enter(&db->db_mtx);
-	if (flags & DB_RF_PARTIAL_FIRST)
+	if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
+		db->db_pending_evict = B_FALSE;
+	if (flags & DMU_PARTIAL_FIRST)
 		db->db_partial_read = B_TRUE;
-	else if (!(flags & DB_RF_PARTIAL_MORE))
+	else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
 		db->db_partial_read = B_FALSE;
 	miss = (db->db_state != DB_CACHED);
@@ -1817,7 +1820,7 @@
 	 * unauthenticated blocks, which will verify their MAC if
 	 * the key is now available.
 	 */
-	if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
+	if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
 	    (arc_is_encrypted(db->db_buf) ||
 	    arc_is_unauthenticated(db->db_buf) ||
 	    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1865,7 +1868,8 @@
 	if (err == 0 && prefetch) {
 		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
-		    flags & DB_RF_HAVESTRUCT);
+		    flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||
+		    db->db_pending_evict);
 	}
 	DB_DNODE_EXIT(db);
@@ -1897,11 +1901,14 @@
 }
 static void
-dbuf_noread(dmu_buf_impl_t *db)
+dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
 {
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
+	if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
+		db->db_pending_evict = B_FALSE;
+	db->db_partial_read = B_FALSE;
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
@@ -2214,8 +2221,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
 		kmem_free(dr, sizeof (*dr));
 		return (NULL);
 	}
-	int err = dbuf_read(parent_db, NULL,
-	    (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+	int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |
+	    DMU_READ_NO_PREFETCH);
 	if (err != 0) {
 		dbuf_rele(parent_db, FTAG);
 		kmem_free(dr, sizeof (*dr));
@@ -2642,8 +2649,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	return (B_FALSE);
 }
-static void
-dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
+void
+dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	boolean_t undirty = B_FALSE;
@@ -2695,7 +2702,7 @@
 	 * not the underlying block that is being replaced. dbuf_undirty() will
 	 * do brt_pending_remove() before removing the dirty record.
 	 */
-	(void) dbuf_read(db, NULL, flags);
+	(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);
 	if (undirty) {
 		mutex_enter(&db->db_mtx);
 		VERIFY(!dbuf_undirty(db, tx));
 		mutex_exit(&db->db_mtx);
 	}
 }
 void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
-	dmu_buf_will_dirty_impl(db_fake,
-	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
+	dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
 }
 boolean_t
@@ -2872,7 +2878,7 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
-	dbuf_noread(db);
+	dbuf_noread(db, DMU_KEEP_CACHING);
 	(void) dbuf_dirty(db, tx);
 }
@@ -2886,12 +2892,13 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	DTRACE_SET_STATE(db, "allocating NOFILL buffer");
 	mutex_exit(&db->db_mtx);
-	dbuf_noread(db);
+	dbuf_noread(db, DMU_KEEP_CACHING);
 	(void) dbuf_dirty(db, tx);
 }
 void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
+dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
+    dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -2913,7 +2920,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 	 */
 	if (canfail && dr) {
 		mutex_exit(&db->db_mtx);
-		dmu_buf_will_dirty(db_fake, tx);
+		dmu_buf_will_dirty_flags(db_fake, tx, flags);
 		return;
 	}
 	/*
@@ -2929,10 +2936,16 @@
 	}
 	mutex_exit(&db->db_mtx);
-	dbuf_noread(db);
+	dbuf_noread(db, flags);
 	(void) dbuf_dirty(db, tx);
 }
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
+{
+	dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);
+}
+
 /*
  * This function is effectively the same as dmu_buf_will_dirty(), but
  * indicates the caller expects raw encrypted data in the db, and provides
@@ -2955,8 +2968,8 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
 	ASSERT0(db->db_level);
 	ASSERT(db->db_objset->os_raw_receive);
-	dmu_buf_will_dirty_impl(db_fake,
-	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
+	dmu_buf_will_dirty_flags(db_fake, tx,
+	    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
@@ -3098,7 +3111,8 @@ dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
 */
 void
-dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
+    dmu_flags_t flags)
 {
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@@ -3112,6 +3126,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 	ASSERT(arc_released(buf));
 	mutex_enter(&db->db_mtx);
+	if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
+		db->db_pending_evict = B_FALSE;
+	db->db_partial_read = B_FALSE;
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
@@ -3366,8 +3383,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 		if (err)
 			return (err);
-		err = dbuf_read(*parentp, NULL,
-		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+		err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |
+		    DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);
 		if (err) {
 			dbuf_rele(*parentp, NULL);
 			*parentp = NULL;
@@ -3426,7 +3443,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 	db->db_user = NULL;
 	db->db_user_immediate_evict = FALSE;
 	db->db_freed_in_flight = FALSE;
-	db->db_pending_evict = FALSE;
+	db->db_pending_evict = TRUE;
+	db->db_partial_read = FALSE;
 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -3637,8 +3655,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
 		dbuf_prefetch_fini(dpa, B_TRUE);
 		return;
 	}
-	(void) dbuf_read(db, NULL,
-	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+	(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
+	    DMU_READ_NO_PREFETCH);
 	dbuf_rele(db, FTAG);
 }
@@ -4023,6 +4041,7 @@ dbuf_create_bonus(dnode_t *dn)
 	ASSERT(dn->dn_bonus == NULL);
 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
 	    dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
+	dn->dn_bonus->db_pending_evict = FALSE;
 }
 int
@@ -4188,8 +4207,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
 		 * This dbuf has anonymous data associated with it.
 		 */
 		dbuf_destroy(db);
-	} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
-	    db->db_pending_evict) {
+	} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {
+		/*
+		 * We don't expect more accesses to the dbuf, and it
+		 * is either not cacheable or was marked for eviction.
+		 */
 		dbuf_destroy(db);
 	} else if (!multilist_link_active(&db->db_cache_link)) {
 		ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
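Reading the dbuf.c hunks above together: dbuf_create() now starts every dbuf with db_pending_evict set, bonus dbufs are exempted in dbuf_create_bonus(), and any access made without DMU_UNCACHEDIO or DMU_KEEP_CACHING clears the mark. On last release, the decision then reduces to roughly the following editorial condensation (not verbatim from the patch):

	/* Sketch of dbuf_rele_and_unlock() after this patch. */
	boolean_t keep = db->db_partial_read || DBUF_IS_CACHEABLE(db);
	if (!keep) {
		/* No access ever argued for caching; drop it immediately. */
		dbuf_destroy(db);
	}
	/* Otherwise the dbuf is retained in the dbuf cache as before. */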
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 2b52ae139bac..b3cea3fbf299 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -222,20 +222,14 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
 int
 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
-    const void *tag, dmu_buf_t **dbp, int flags)
+    const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
 {
 	int err;
-	int db_flags = DB_RF_CANFAIL;
-
-	if (flags & DMU_READ_NO_PREFETCH)
-		db_flags |= DB_RF_NOPREFETCH;
-	if (flags & DMU_READ_NO_DECRYPT)
-		db_flags |= DB_RF_NO_DECRYPT;
 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
-		err = dbuf_read(db, NULL, db_flags);
+		err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
@@ -247,20 +241,14 @@
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    const void *tag, dmu_buf_t **dbp, int flags)
+    const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
 {
 	int err;
-	int db_flags = DB_RF_CANFAIL;
-
-	if (flags & DMU_READ_NO_PREFETCH)
-		db_flags |= DB_RF_NOPREFETCH;
-	if (flags & DMU_READ_NO_DECRYPT)
-		db_flags |= DB_RF_NO_DECRYPT;
 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
-		err = dbuf_read(db, NULL, db_flags);
+		err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
@@ -358,16 +346,10 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
  * Returns ENOENT, EIO, or 0.
  */
 int
 dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
-    uint32_t flags)
+    dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db;
 	int error;
-	uint32_t db_flags = DB_RF_MUST_SUCCEED;
-
-	if (flags & DMU_READ_NO_PREFETCH)
-		db_flags |= DB_RF_NOPREFETCH;
-	if (flags & DMU_READ_NO_DECRYPT)
-		db_flags |= DB_RF_NO_DECRYPT;
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
@@ -393,7 +375,7 @@ dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
-	error = dbuf_read(db, NULL, db_flags);
+	error = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
 	if (error) {
 		dnode_evict_bonus(dn);
 		dbuf_rele(db, tag);
@@ -431,7 +413,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
 * dmu_spill_hold_existing() should be used.
 */
 int
-dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag,
+dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = NULL;
@@ -489,18 +471,14 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
 }
 int
-dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	int err;
-	uint32_t db_flags = DB_RF_CANFAIL;
-
-	if (flags & DMU_READ_NO_DECRYPT)
-		db_flags |= DB_RF_NO_DECRYPT;
 	DB_DNODE_ENTER(db);
-	err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp);
+	err = dmu_spill_hold_by_dnode(DB_DNODE(db), flags, tag, dbp);
 	DB_DNODE_EXIT(db);
 	return (err);
@@ -515,12 +493,12 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
 int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
-    uint32_t flags)
+    dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
-	uint32_t dbuf_flags;
+	dmu_flags_t dbuf_flags;
 	int err;
 	zio_t *zio = NULL;
 	boolean_t missed = B_FALSE;
@@ -532,11 +510,8 @@
 	 * we can tell it about the multi-block read. dbuf_read() only knows
 	 * about the one block it is accessing.
 	 */
-	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
-	    DB_RF_NOPREFETCH;
-
-	if ((flags & DMU_READ_NO_DECRYPT) != 0)
-		dbuf_flags |= DB_RF_NO_DECRYPT;
+	dbuf_flags = (flags & ~DMU_READ_PREFETCH) | DMU_READ_NO_PREFETCH |
+	    DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
@@ -569,15 +544,15 @@
 		 * that if multiple threads block on same indirect block, we
 		 * base predictions on the original less racy request order.
 		 */
-		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
-		    B_TRUE);
+		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
+		    read && !(flags & DMU_DIRECTIO), B_TRUE);
 	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			if (zs) {
 				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
-				    B_TRUE);
+				    B_TRUE, (flags & DMU_UNCACHEDIO));
 			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
@@ -599,9 +574,9 @@
 		    offset + length < db->db.db_offset +
 		    db->db.db_size) {
 			if (offset <= db->db.db_offset)
-				dbuf_flags |= DB_RF_PARTIAL_FIRST;
+				dbuf_flags |= DMU_PARTIAL_FIRST;
 			else
-				dbuf_flags |= DB_RF_PARTIAL_MORE;
+				dbuf_flags |= DMU_PARTIAL_MORE;
 		}
 		(void) dbuf_read(db, zio, dbuf_flags);
 		if (db->db_state != DB_CACHED)
@@ -621,8 +596,10 @@
 	if (!read && ((flags & DMU_DIRECTIO) == 0))
 		zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
-	if (zs)
-		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
+	if (zs) {
+		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE,
+		    (flags & DMU_UNCACHEDIO));
+	}
 	rw_exit(&dn->dn_struct_rwlock);
 	if (read) {
@@ -1170,7 +1147,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 static int
 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
-    void *buf, uint32_t flags)
+    void *buf, dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs, err = 0;
@@ -1198,6 +1175,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
 		abd_free(data);
 		return (err);
 	}
+	flags &= ~DMU_DIRECTIO;
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
@@ -1236,7 +1214,7 @@
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    void *buf, uint32_t flags)
+    void *buf, dmu_flags_t flags)
 {
 	dnode_t *dn;
 	int err;
@@ -1252,14 +1230,14 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 int
 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
-    uint32_t flags)
+    dmu_flags_t flags)
 {
 	return (dmu_read_impl(dn, offset, size, buf, flags));
 }
 static void
 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
-    const void *buf, dmu_tx_t *tx)
+    const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
 {
 	int i;
@@ -1275,10 +1253,17 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-		if (tocpy == db->db_size)
-			dmu_buf_will_fill(db, tx, B_FALSE);
-		else
-			dmu_buf_will_dirty(db, tx);
+		if (tocpy == db->db_size) {
+			dmu_buf_will_fill_flags(db, tx, B_FALSE, flags);
+		} else {
+			if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
+				if (bufoff == 0)
+					flags |= DMU_PARTIAL_FIRST;
+				else
+					flags |= DMU_PARTIAL_MORE;
+			}
+			dmu_buf_will_dirty_flags(db, tx, flags);
+		}
 		ASSERT(db->db_data != NULL);
 		(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
@@ -1304,17 +1289,13 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
-	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+	dmu_write_impl(dbp, numbufs, offset, size, buf, tx, DMU_READ_PREFETCH);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
-/*
- * This interface is not used internally by ZFS but is provided for
- * use by Lustre which is built on the DMU interfaces.
- */
 int
-dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
-    const void *buf, dmu_tx_t *tx, uint32_t flags)
+dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+    const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
@@ -1327,25 +1308,19 @@ dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
 	if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
 	    zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
 		abd_t *data = abd_get_from_buf((void *)buf, size);
-		error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
+		error = dmu_write_abd(dn, offset, size, data, flags, tx);
 		abd_free(data);
 		return (error);
 	}
+	flags &= ~DMU_DIRECTIO;
 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
-	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
-	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+	    FALSE, FTAG, &numbufs, &dbp, flags));
+	dmu_write_impl(dbp, numbufs, offset, size, buf, tx, flags);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (0);
 }
-int
-dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
-    const void *buf, dmu_tx_t *tx)
-{
-	return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
-}
-
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
@@ -1402,20 +1377,22 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 #ifdef _KERNEL
 int
-dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
+dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 	if (uio->uio_extflg & UIO_DIRECT)
-		return (dmu_read_uio_direct(dn, uio, size));
+		return (dmu_read_uio_direct(dn, uio, size, flags));
+	flags &= ~DMU_DIRECTIO;
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
-	    TRUE, FTAG, &numbufs, &dbp, 0);
+	    TRUE, FTAG, &numbufs, &dbp, flags);
 	if (err)
 		return (err);
@@ -1453,7 +1430,8 @@
 * because we don't have to find the dnode_t for the object.
 */
 int
-dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
+dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	int err;
@@ -1462,7 +1440,7 @@
 		return (0);
 	DB_DNODE_ENTER(db);
-	err = dmu_read_uio_dnode(DB_DNODE(db), uio, size);
+	err = dmu_read_uio_dnode(DB_DNODE(db), uio, size, flags);
 	DB_DNODE_EXIT(db);
 	return (err);
@@ -1474,7 +1452,8 @@
 * Starting at offset zfs_uio_offset(uio).
 */
 int
-dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
+dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags)
 {
 	dnode_t *dn;
 	int err;
@@ -1486,7 +1465,7 @@
 	if (err)
 		return (err);
-	err = dmu_read_uio_dnode(dn, uio, size);
+	err = dmu_read_uio_dnode(dn, uio, size, flags);
 	dnode_rele(dn, FTAG);
@@ -1494,12 +1473,14 @@
 }
 int
-dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
+dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx,
+    dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	uint64_t write_size;
+	dmu_flags_t oflags = flags;
 top:
 	write_size = size;
@@ -1512,13 +1493,14 @@
 	    (write_size >= dn->dn_datablksz)) {
 		if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
 		    dn->dn_datablksz)) {
-			return (dmu_write_uio_direct(dn, uio, size, tx));
+			return (dmu_write_uio_direct(dn, uio, size, flags, tx));
 		} else if (write_size > dn->dn_datablksz &&
 		    zfs_dio_offset_aligned(zfs_uio_offset(uio),
 		    dn->dn_datablksz)) {
 			write_size =
 			    dn->dn_datablksz * (write_size / dn->dn_datablksz);
-			err = dmu_write_uio_direct(dn, uio, write_size, tx);
+			err = dmu_write_uio_direct(dn, uio, write_size, flags,
+			    tx);
 			if (err == 0) {
 				size -= write_size;
 				goto top;
@@ -1530,9 +1512,10 @@
 			    P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
 		}
 	}
+	flags &= ~DMU_DIRECTIO;
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
-	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+	    FALSE, FTAG, &numbufs, &dbp, flags);
 	if (err)
 		return (err);
@@ -1549,10 +1532,17 @@
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-		if (tocpy == db->db_size)
-			dmu_buf_will_fill(db, tx, B_TRUE);
-		else
-			dmu_buf_will_dirty(db, tx);
+		if (tocpy == db->db_size) {
+			dmu_buf_will_fill_flags(db, tx, B_TRUE, flags);
+		} else {
+			if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
+				if (bufoff == 0)
+					flags |= DMU_PARTIAL_FIRST;
+				else
+					flags |= DMU_PARTIAL_MORE;
+			}
+			dmu_buf_will_dirty_flags(db, tx, flags);
+		}
 		ASSERT(db->db_data != NULL);
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff,
@@ -1575,6 +1565,7 @@
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
+		flags = oflags;
 		goto top;
 	}
@@ -1592,7 +1583,7 @@
 */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
-    dmu_tx_t *tx)
+    dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	int err;
@@ -1601,7 +1592,7 @@
 		return (0);
 	DB_DNODE_ENTER(db);
-	err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx);
+	err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx, flags);
 	DB_DNODE_EXIT(db);
 	return (err);
@@ -1614,7 +1605,7 @@
 */
 int
 dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
-    dmu_tx_t *tx)
+    dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dnode_t *dn;
 	int err;
@@ -1626,7 +1617,7 @@ dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
 	if (err)
 		return (err);
-	err = dmu_write_uio_dnode(dn, uio, size, tx);
+	err = dmu_write_uio_dnode(dn, uio, size, tx, flags);
 	dnode_rele(dn, FTAG);
@@ -1796,11 +1787,10 @@ dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
 */
 int
 dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
-    dmu_tx_t *tx)
+    dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db;
 	objset_t *os = dn->dn_objset;
-	uint64_t object = dn->dn_object;
 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
 	uint64_t blkid;
@@ -1816,8 +1806,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
 	 * same size as the dbuf.
 	 */
 	if (offset == db->db.db_offset && blksz == db->db.db_size) {
-		zfs_racct_write(os->os_spa, blksz, 1, 0);
-		dbuf_assign_arcbuf(db, buf, tx);
+		zfs_racct_write(os->os_spa, blksz, 1, flags);
+		dbuf_assign_arcbuf(db, buf, tx, flags);
 		dbuf_rele(db, FTAG);
 	} else {
 		/* compressed bufs must always be assignable to their dbuf */
@@ -1825,7 +1815,7 @@
 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 		dbuf_rele(db, FTAG);
-		dmu_write(os, object, offset, blksz, buf->b_data, tx);
+		dmu_write_by_dnode(dn, offset, blksz, buf->b_data, tx, flags);
 		dmu_return_arcbuf(buf);
 	}
@@ -1834,13 +1824,13 @@
 int
 dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
-    dmu_tx_t *tx)
+    dmu_tx_t *tx, dmu_flags_t flags)
 {
 	int err;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 	DB_DNODE_ENTER(db);
-	err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx);
+	err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx, flags);
 	DB_DNODE_EXIT(db);
 	return (err);
@@ -1985,7 +1975,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 	int error;
 	error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
-	    DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+	    DB_RF_CANFAIL | DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
 	if (error != 0)
 		return (error);
@@ -2928,7 +2918,6 @@
 EXPORT_SYMBOL(dmu_read_uio_dbuf);
 EXPORT_SYMBOL(dmu_read_uio_dnode);
 EXPORT_SYMBOL(dmu_write);
 EXPORT_SYMBOL(dmu_write_by_dnode);
-EXPORT_SYMBOL(dmu_write_by_dnode_flags);
 EXPORT_SYMBOL(dmu_write_uio);
 EXPORT_SYMBOL(dmu_write_uio_dbuf);
 EXPORT_SYMBOL(dmu_write_uio_dnode);
diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c
index 2d5253a5d060..12b0ffa2c99b 100644
--- a/module/zfs/dmu_direct.c
+++ b/module/zfs/dmu_direct.c
@@ -208,7 +208,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
 int
 dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
-    abd_t *data, uint32_t flags, dmu_tx_t *tx)
+    abd_t *data, dmu_flags_t flags, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	spa_t *spa = dn->dn_objset->os_spa;
@@ -247,7 +247,7 @@ dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 int
 dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
-    abd_t *data, uint32_t flags)
+    abd_t *data, dmu_flags_t flags)
 {
 	objset_t *os = dn->dn_objset;
 	spa_t *spa = os->os_spa;
@@ -351,7 +351,8 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 #ifdef _KERNEL
 int
-dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
+dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags)
 {
 	offset_t offset = zfs_uio_offset(uio);
 	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
@@ -362,7 +363,7 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
 	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
 	    offset & (PAGESIZE - 1), size);
-	err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
+	err = dmu_read_abd(dn, offset, size, data, flags);
 	abd_free(data);
 	if (err == 0)
@@ -372,7 +373,8 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
 }
 int
-dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
+dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
+    dmu_flags_t flags, dmu_tx_t *tx)
 {
 	offset_t offset = zfs_uio_offset(uio);
 	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
@@ -383,7 +385,7 @@ dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
 	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
 	    offset & (PAGESIZE - 1), size);
-	err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
+	err = dmu_write_abd(dn, offset, size, data, flags, tx);
 	abd_free(data);
 	if (err == 0)
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 6ab4304fae89..cc804f4ca972 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -2330,12 +2330,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 			data = DN_BONUS(dn->dn_phys);
 		}
 	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
-		int rf = 0;
+		dmu_flags_t rf = DB_RF_MUST_SUCCEED;
 		if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
 			rf |= DB_RF_HAVESTRUCT;
-		error = dmu_spill_hold_by_dnode(dn,
-		    rf | DB_RF_MUST_SUCCEED,
+		error = dmu_spill_hold_by_dnode(dn, rf,
 		    FTAG, (dmu_buf_t **)&db);
 		ASSERT(error == 0);
 		mutex_enter(&db->db_mtx);
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 6d27dabc2e56..3a4bd7a1cea9 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -2135,7 +2135,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
 	if (data != NULL) {
 		dmu_buf_t *db;
 		dnode_t *dn;
-		uint32_t flags = DMU_READ_NO_PREFETCH;
+		dmu_flags_t flags = DMU_READ_NO_PREFETCH;
 		if (rwa->raw)
 			flags |= DMU_READ_NO_DECRYPT;
@@ -2277,14 +2277,18 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
 				dmu_write_by_dnode(dn,
 				    drrw->drr_offset,
 				    drrw->drr_logical_size,
-				    abd_to_buf(decomp_abd), tx);
+				    abd_to_buf(decomp_abd), tx,
+				    DMU_READ_NO_PREFETCH |
+				    DMU_UNCACHEDIO);
 			}
 			abd_free(decomp_abd);
 		} else {
 			dmu_write_by_dnode(dn,
 			    drrw->drr_offset,
 			    drrw->drr_logical_size,
-			    abd_to_buf(abd), tx);
+			    abd_to_buf(abd), tx,
+			    DMU_READ_NO_PREFETCH |
+			    DMU_UNCACHEDIO);
 		}
 		if (err == 0)
 			abd_free(abd);
@@ -2407,10 +2411,10 @@ receive_process_write_record(struct receive_writer_arg *rwa,
 	if (rwa->heal) {
 		blkptr_t *bp;
 		dmu_buf_t *dbp;
-		int flags = DB_RF_CANFAIL;
+		dmu_flags_t flags = DB_RF_CANFAIL;
 		if (rwa->raw)
-			flags |= DB_RF_NO_DECRYPT;
+			flags |= DMU_READ_NO_DECRYPT;
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
@@ -2567,8 +2571,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
 		rwa->max_object = drrs->drr_object;
 	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
-	if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
-	    &db_spill)) != 0) {
+	if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT |
+	    DB_RF_CANFAIL, FTAG, &db_spill)) != 0) {
 		dmu_buf_rele(db, FTAG);
 		return (err);
 	}
@@ -2621,7 +2625,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
 	memcpy(abuf->b_data, abd_to_buf(abd),
 	    DRR_SPILL_PAYLOAD_SIZE(drrs));
 	abd_free(abd);
-	dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
+	dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx,
+	    DMU_UNCACHEDIO);
 	dmu_buf_rele(db, FTAG);
 	dmu_buf_rele(db_spill, FTAG);
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index c9363e6aec3d..7b9250c873cc 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -297,7 +297,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 	}
 	if (BP_GET_LEVEL(bp) > 0) {
-		uint32_t flags = ARC_FLAG_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		int32_t i, ptidx, pidx;
 		uint32_t prefetchlimit;
 		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@@ -364,8 +364,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 		kmem_free(czb, sizeof (zbookmark_phys_t));
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-		uint32_t flags = ARC_FLAG_WAIT;
-		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 		int32_t i;
 		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		dnode_phys_t *child_dnp;
@@ -397,7 +397,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 			break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
+		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
@@ -669,7 +669,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
 	/* See comment on ZIL traversal in dsl_scan_visitds. */
 	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
 		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
-		uint32_t flags = ARC_FLAG_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 		ASSERT(!BP_IS_REDACTED(rootbp));
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 5457ca2a8f42..f2bd6a5e3c3c 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -222,8 +222,8 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 	 * PARTIAL_FIRST allows caching for uncacheable blocks.  It will
 	 * be cleared when dmu_buf_will_dirty() calls dbuf_read() again.
 	 */
-	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH |
-	    (level == 0 ? DB_RF_PARTIAL_FIRST : 0));
+	err = dbuf_read(db, zio, DB_RF_CANFAIL | DMU_READ_NO_PREFETCH |
+	    (level == 0 ? (DMU_UNCACHEDIO | DMU_PARTIAL_FIRST) : 0));
 	dbuf_rele(db, FTAG);
 	return (err);
 }
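The same tagging pattern recurs in dmu_write_pages(), dmu_write_impl(), and dmu_write_uio_dnode() above, and dmu_tx_check_ioerr() relies on the matching dbuf_read() side. A condensed editorial restatement of it (variable names as in the patched dmu.c loops; not a fourth copy from the patch):

	/*
	 * Only a trailing buffer that covers part of its block is tagged:
	 * DMU_PARTIAL_FIRST when the access starts at the block head
	 * (bufoff == 0) and DMU_PARTIAL_MORE for continuations, so
	 * dbuf_read() keeps db_partial_read set across a streaming
	 * partial overwrite.
	 */
	if (i == numbufs - 1 && bufoff + tocpy < db->db_size)
		flags |= (bufoff == 0) ? DMU_PARTIAL_FIRST : DMU_PARTIAL_MORE;
	dmu_buf_will_dirty_flags(db, tx, flags);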
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 6f68e76561bc..88ea0fe6a9f4 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1510,7 +1510,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 	 * if we get the encrypted or decrypted version.
 	 */
 	err = dbuf_read(db, NULL, DB_RF_CANFAIL |
-	    DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+	    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
 	if (err) {
 		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
 		dbuf_rele(db, FTAG);
@@ -2578,7 +2578,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 		}
 
 		error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
-		    DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+		    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
 		if (error) {
 			dbuf_rele(db, FTAG);
 			return (error);
 		}
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index e2e576dfbbd1..4067f221f1bf 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -513,6 +513,7 @@ dnode_evict_dbufs(dnode_t *dn)
 			avl_remove(&dn->dn_dbufs, db_marker);
 		} else {
 			db->db_pending_evict = TRUE;
+			db->db_partial_read = FALSE;
 			mutex_exit(&db->db_mtx);
 			db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		}
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 48ad335905b4..5db470ce6242 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -703,8 +703,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
 	boolean_t dummy;
 
 	if (hdl->sa_spill == NULL) {
-		VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL,
-		    &hdl->sa_spill) == 0);
+		VERIFY0(dmu_spill_hold_by_bonus(hdl->sa_bonus,
+		    DB_RF_MUST_SUCCEED, NULL, &hdl->sa_spill));
 	}
 	dmu_buf_will_dirty(hdl->sa_spill, tx);
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index 3a479ddac87e..6d7cabcf766d 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -948,7 +948,8 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
 }
 
 void
-spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
+    dmu_flags_t flags)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
 	kstat_t *ksp = shk->kstat;
@@ -967,7 +968,8 @@ spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
 }
 
 void
-spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
+    dmu_flags_t flags)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
 	kstat_t *ksp = shk->kstat;
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 4374666e76b6..f90d88f1e781 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -669,7 +669,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 			int err;
 			DB_DNODE_ENTER(db);
 			err = dmu_read_by_dnode(DB_DNODE(db), off, len,
-			    &lr->lr_data[0], DMU_READ_NO_PREFETCH);
+			    &lr->lr_data[0], DMU_READ_NO_PREFETCH |
+			    DMU_KEEP_CACHING);
 			DB_DNODE_EXIT(db);
 			if (err != 0) {
 				zil_itx_destroy(itx);
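Both ZIL copy paths (zfs_log_write() above, zvol_log_write() below) now pass
DMU_KEEP_CACHING, so copying a just-written block into an itx does not reset
the dbuf's caching policy. The shared pattern, sketched with a generic dnode
dn (condensed from the two call sites, not a new function in the patch):

	if (wr_state == WR_COPIED &&
	    dmu_read_by_dnode(dn, off, len, lr + 1,
	    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
		/* Copy failed; log the write by reference instead. */
		zil_itx_destroy(itx);
		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
		wr_state = WR_NEED_COPY;
	}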
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index afd9e61313a9..6283c185cfe3 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -89,6 +89,12 @@ static int zfs_dio_enabled = 0;
 static int zfs_dio_enabled = 1;
 #endif
 
+/*
+ * Strictly enforce alignment for Direct I/O requests, returning EINVAL
+ * if not page-aligned instead of silently falling back to uncached I/O.
+ */
+static int zfs_dio_strict = 0;
+
 /*
  * Maximum bytes to read per chunk in zfs_read().
@@ -243,46 +249,54 @@ zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
 	int ioflag = *ioflagp;
 	int error = 0;
 
-	if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
-	    zn_has_cached_data(zp, zfs_uio_offset(uio),
-	    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
-		/*
-		 * Direct I/O is disabled or the region is mmap'ed. In either
-		 * case the I/O request will just directed through the ARC.
-		 */
-		ioflag &= ~O_DIRECT;
+	if (os->os_direct == ZFS_DIRECT_ALWAYS) {
+		/* Force either direct or uncached I/O. */
+		ioflag |= O_DIRECT;
+	}
+
+	if ((ioflag & O_DIRECT) == 0)
 		goto out;
-	} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
-	    zfs_uio_page_aligned(uio) &&
-	    zfs_uio_aligned(uio, PAGE_SIZE)) {
-		if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
-		    (rw == UIO_READ)) {
-			ioflag |= O_DIRECT;
-		}
-	} else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
+
+	if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) {
 		/*
-		 * Direct I/O was requested through the direct=always, but it
-		 * is not properly PAGE_SIZE aligned. The request will be
-		 * directed through the ARC.
+		 * Direct I/O is disabled.  The I/O request will be directed
+		 * through the ARC as uncached I/O.
 		 */
-		ioflag &= ~O_DIRECT;
+		goto out;
 	}
 
-	if (ioflag & O_DIRECT) {
-		if (!zfs_uio_page_aligned(uio) ||
-		    !zfs_uio_aligned(uio, PAGE_SIZE)) {
+	if (!zfs_uio_page_aligned(uio) ||
+	    !zfs_uio_aligned(uio, PAGE_SIZE)) {
+		/*
+		 * Misaligned requests can be executed through the ARC as
+		 * uncached I/O.  But if the user explicitly requested
+		 * O_DIRECT and strict mode is enabled, it is a failure.
+		 */
+		if ((*ioflagp & O_DIRECT) && zfs_dio_strict)
 			error = SET_ERROR(EINVAL);
-			goto out;
-		}
+		goto out;
+	}
 
-		error = zfs_uio_get_dio_pages_alloc(uio, rw);
-		if (error) {
-			goto out;
-		}
+	if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+	    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
+		/*
+		 * The region is mmap'ed.  The I/O request will be directed
+		 * through the ARC as uncached I/O.
+		 */
+		goto out;
 	}
 
-	IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
-	ASSERT0(error);
+	/*
+	 * For short writes the page mapping of Direct I/O makes no sense.
+	 * Direct them through the ARC as uncached I/O.
+	 */
+	if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz)
+		goto out;
+
+	error = zfs_uio_get_dio_pages_alloc(uio, rw);
+	if (error)
+		goto out;
+	ASSERT(uio->uio_extflg & UIO_DIRECT);
 
 out:
 	*ioflagp = ioflag;
@@ -392,6 +406,9 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 	ssize_t start_resid = n;
 	ssize_t dio_remaining_resid = 0;
 
+	dmu_flags_t dflags = DMU_READ_PREFETCH;
+	if (ioflag & O_DIRECT)
+		dflags |= DMU_UNCACHEDIO;
 	if (uio->uio_extflg & UIO_DIRECT) {
 		/*
 		 * All pages for an O_DIRECT request have already been mapped
@@ -414,6 +431,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 		dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
 		if (dio_remaining_resid != 0)
 			n -= dio_remaining_resid;
+		dflags |= DMU_DIRECTIO;
 	}
 
 	while (n > 0) {
@@ -429,7 +447,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			error = mappedread(zp, nbytes, uio);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
-			    uio, nbytes);
+			    uio, nbytes, dflags);
 		}
 
 		if (error) {
@@ -479,15 +497,17 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 		 * remainder of the file can be read using the ARC.
 		 */
 		uio->uio_extflg &= ~UIO_DIRECT;
+		dflags &= ~DMU_DIRECTIO;
 
 		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
 		    zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
 			error = mappedread(zp, dio_remaining_resid, uio);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
-			    dio_remaining_resid);
+			    dio_remaining_resid, dflags);
 		}
 		uio->uio_extflg |= UIO_DIRECT;
+		dflags |= DMU_DIRECTIO;
 
 		if (error != 0)
 			n += dio_remaining_resid;
@@ -859,12 +879,18 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			zfs_rangelock_reduce(lr, woff, n);
 		}
 
+		dmu_flags_t dflags = DMU_READ_PREFETCH;
+		if (ioflag & O_DIRECT)
+			dflags |= DMU_UNCACHEDIO;
+		if (uio->uio_extflg & UIO_DIRECT)
+			dflags |= DMU_DIRECTIO;
+
 		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = zfs_uio_resid(uio);
 			zfs_uio_fault_disable(uio, B_TRUE);
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
-			    uio, nbytes, tx);
+			    uio, nbytes, tx, dflags);
 			zfs_uio_fault_disable(uio, B_FALSE);
 #ifdef __linux__
 			if (error == EFAULT) {
@@ -903,7 +929,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			 * arc buffer to a dbuf.
 			 */
 			error = dmu_assign_arcbuf_by_dbuf(
-			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
+			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags);
 			if (error != 0) {
 				/*
 				 * XXX This might not be necessary if
@@ -1192,7 +1218,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
 			error = SET_ERROR(ENOENT);
 		} else {
 			error = dmu_read(os, object, offset, size, buf,
-			    DMU_READ_NO_PREFETCH);
+			    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
 		}
 		ASSERT(error == 0 || error == ENOENT);
 	} else { /* indirect write */
@@ -1882,3 +1908,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
 	"Enable Direct I/O");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW,
+	"Return errors on misaligned Direct I/O");
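zfs_read() and zfs_write() derive a per-call dmu_flags_t from the request: an
O_DIRECT request that fell back to the ARC becomes uncached I/O, and one with
its pages mapped (UIO_DIRECT) adds DMU_DIRECTIO on top. A condensed sketch of
that derivation (it mirrors the hunks above; no new logic is introduced):

	dmu_flags_t dflags = DMU_READ_PREFETCH;
	if (ioflag & O_DIRECT)			/* uncached fallback */
		dflags |= DMU_UNCACHEDIO;
	if (uio->uio_extflg & UIO_DIRECT)	/* true Direct I/O */
		dflags |= DMU_DIRECTIO;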
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index d985ec0d68a0..aed5c13ed462 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -880,8 +880,9 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
 		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
-		    offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
+		if (wr_state == WR_COPIED &&
+		    dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1,
+		    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
@@ -974,7 +975,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
 		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
 		    size, RL_READER);
 		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
-		    DMU_READ_NO_PREFETCH);
+		    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
 	} else { /* indirect write */
 		ASSERT3P(zio, !=, NULL);
 		/*
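Taken together, the kernel changes distinguish two cache policies:
DMU_UNCACHEDIO marks buffers for eviction once a one-shot consumer is done,
while DMU_KEEP_CACHING leaves whatever policy the dbuf already carries
untouched. A hypothetical helper (pick_cache_policy() is illustrative only,
not part of the patch) showing where each applies:

	static dmu_flags_t
	pick_cache_policy(boolean_t one_shot, boolean_t internal_copy)
	{
		if (internal_copy)	/* ZIL WR_COPIED payload reads */
			return (DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
		if (one_shot)		/* receive streams, O_DIRECT fallback */
			return (DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO);
		return (DMU_READ_PREFETCH);	/* normal cached read */
	}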
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 79dc64ad9350..92ec49925d14 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -106,6 +106,7 @@ VOL_USE_BLK_MQ	UNSUPPORTED	zvol_use_blk_mq
 BCLONE_ENABLED		bclone_enabled		zfs_bclone_enabled
 BCLONE_WAIT_DIRTY	bclone_wait_dirty	zfs_bclone_wait_dirty
 DIO_ENABLED		dio_enabled		zfs_dio_enabled
+DIO_STRICT		dio_strict		zfs_dio_strict
 XATTR_COMPAT		xattr_compat		zfs_xattr_compat
 ZEVENT_LEN_MAX		zevent.len_max		zfs_zevent_len_max
 ZEVENT_RETAIN_MAX	zevent.retain_max	zfs_zevent_retain_max
diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh
index fe3fd7a2a022..3a72fdaae38f 100755
--- a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh
+++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh
@@ -40,8 +40,10 @@
 
 verify_runnable "global"
 
+log_must save_tunable DIO_STRICT
 function cleanup
 {
+	restore_tunable DIO_STRICT
 	zfs set recordsize=$rs $TESTPOOL/$TESTFS
 	zfs set direct=standard $TESTPOOL/$TESTFS
 	log_must rm -f $tmp_file
@@ -61,6 +63,13 @@
 file_size=$((rs * 8))
 
 log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1
 
+log_must set_tunable32 DIO_STRICT 0
+log_must zfs set direct=standard $TESTPOOL/$TESTFS
+# Sub-pagesize Direct I/O writes/reads always pass when strict mode is off.
+log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D
+log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d
+
+log_must set_tunable32 DIO_STRICT 1
 log_must zfs set direct=standard $TESTPOOL/$TESTFS
 # sub-pagesize direct writes/read will always fail if direct=standard.
 log_mustnot stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D
diff --git a/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh b/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
index fb75e7a8b7aa..ab749b5f793a 100755
--- a/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
+++ b/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
@@ -48,6 +48,7 @@
 TESTDS=${TESTPOOL}/${TESTFS}
 TESTFILE=${TESTDIR}/${TESTFILE0}
 
 log_must save_tunable DIO_ENABLED
+log_must save_tunable DIO_STRICT
 typeset recordsize_saved=$(get_prop recordsize $TESTDS)
 typeset direct_saved=$(get_prop direct $TESTDS)
 
@@ -57,6 +58,7 @@ function cleanup
 	zfs set recordsize=$recordsize_saved $TESTDS
 	zfs set direct=$direct_saved $TESTDS
 	restore_tunable DIO_ENABLED
+	restore_tunable DIO_STRICT
 }
 
 log_onexit cleanup
@@ -154,6 +156,7 @@ for krs in 4 8 16 32 64 128 256 512 ; do
 done
 
 # reset for write tests
+log_must set_tunable32 DIO_STRICT 1
 log_must zfs set recordsize=16K $TESTDS
 log_must zfs set direct=standard $TESTDS
 
@@ -173,4 +176,12 @@ log_must zpool sync
 assert_dioalign $TESTFILE $PAGE_SIZE 16384
 log_mustnot dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
 
+# Same again, but with strict mode off; the misaligned write should succeed.
+log_must set_tunable32 DIO_STRICT 0
+log_must rm -f $TESTFILE
+log_must touch $TESTFILE
+log_must zpool sync
+assert_dioalign $TESTFILE $PAGE_SIZE 16384
+log_must dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
+
 log_pass $CLAIM