Skip to content

Wire O_DIRECT also to Uncached I/O #17218

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1993,7 +1993,8 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)

if (write_state == WR_COPIED &&
dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH |
DMU_KEEP_CACHING) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
write_state = WR_NEED_COPY;
Expand Down Expand Up @@ -2265,19 +2266,19 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
ASSERT(doi.doi_data_block_size);
ASSERT0(offset % doi.doi_data_block_size);
if (ztest_random(4) != 0) {
int prefetch = ztest_random(2) ?
dmu_flags_t flags = ztest_random(2) ?
DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;

/*
* Randomly decide whether to issue this read with O_DIRECT.
*/
if (ztest_random(4) == 0)
prefetch |= DMU_DIRECTIO;
flags |= DMU_DIRECTIO;

ztest_block_tag_t rbt;

VERIFY(dmu_read(os, lr->lr_foid, offset,
sizeof (rbt), &rbt, prefetch) == 0);
sizeof (rbt), &rbt, flags) == 0);
if (rbt.bt_magic == BT_MAGIC) {
ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
offset, gen, txg, crtxg);
Expand Down Expand Up @@ -2308,7 +2309,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_write(os, lr->lr_foid, offset, length, data, tx);
} else {
memcpy(abuf->b_data, data, length);
VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx));
VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0));
}

(void) ztest_log_write(zd, tx, lr);
Expand Down Expand Up @@ -2533,7 +2534,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER);

error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
ASSERT0(error);
} else {
ASSERT3P(zio, !=, NULL);
Expand All @@ -2549,7 +2550,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER);

error = dmu_buf_hold_noread(os, object, offset, zgd, &db);

if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;

Expand Down Expand Up @@ -2826,7 +2826,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
enum ztest_io_type io_type;
uint64_t blocksize;
void *data;
uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH;
dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH;

/*
* Randomly decide whether to issue this read with O_DIRECT.
Expand Down Expand Up @@ -5065,7 +5065,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
uint64_t stride = 123456789ULL;
uint64_t width = 40;
int free_percent = 5;
uint32_t dmu_read_flags = DMU_READ_PREFETCH;
dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH;

/*
* Randomly decide whether to issue this read with O_DIRECT.
Expand Down Expand Up @@ -5541,13 +5541,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[j], tx));
off, bigbuf_arcbufs[j], tx, 0));
} else {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[2 * j], tx));
off, bigbuf_arcbufs[2 * j], tx, 0));
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off + chunksize / 2,
bigbuf_arcbufs[2 * j + 1], tx));
bigbuf_arcbufs[2 * j + 1], tx, 0));
}
if (i == 1) {
dmu_buf_rele(dbt, FTAG);
Expand Down
25 changes: 7 additions & 18 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,6 @@ extern "C" {

#define IN_DMU_SYNC 2

/*
* define flags for dbuf_read
*/

#define DB_RF_MUST_SUCCEED (1 << 0)
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
#define DB_RF_NEVERWAIT (1 << 4)
#define DB_RF_CACHED (1 << 5)
#define DB_RF_NO_DECRYPT (1 << 6)
#define DB_RF_PARTIAL_FIRST (1 << 7)
#define DB_RF_PARTIAL_MORE (1 << 8)

/*
* The simplified state transition diagram for dbufs looks like:
*
Expand Down Expand Up @@ -389,12 +375,15 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag,
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid, uint64_t *hash_out);

int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, dmu_flags_t flags);
void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
void dmu_buf_will_fill_flags(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail,
dmu_flags_t flags);
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
dmu_flags_t flags);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
dmu_tx_t *tx);
Expand Down Expand Up @@ -476,10 +465,10 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
#define DBUF_GET_BUFC_TYPE(_db) \
(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)

#define DBUF_IS_CACHEABLE(_db) \
#define DBUF_IS_CACHEABLE(_db) (!(_db)->db_pending_evict && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))))

boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp);

Expand Down
67 changes: 42 additions & 25 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,26 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp);

/*
 * Flag bits accepted by the DMU and dbuf read/write entry points.
 *
 * DB_RF_* are to be used for dbuf_read() or in limited other cases.
 *
 * DB_RF_MUST_SUCCEED and DMU_READ_PREFETCH are both zero: they name the
 * "no bit set" default and therefore cannot be detected with a bitwise
 * AND.  DB_RF_NOPREFETCH and DB_RF_NO_DECRYPT are aliases for the
 * corresponding DMU_READ_* bits.
 */
typedef enum dmu_flags {
	/* Zero-valued defaults (absence of the opposing bit). */
	DB_RF_MUST_SUCCEED	= 0,		/* Suspend on I/O errors. */
	DMU_READ_PREFETCH	= 0,		/* Try speculative prefetch. */

	/* Individual flag bits, lowest bit first. */
	DB_RF_CANFAIL		= 0x001,	/* Return on I/O errors. */
	DB_RF_HAVESTRUCT	= 0x002,	/* dn_struct_rwlock is locked. */
	/* NOTE(review): presumably "never block waiting for I/O" - confirm. */
	DB_RF_NEVERWAIT		= 0x004,
	DMU_READ_NO_PREFETCH	= 0x008,	/* Don't prefetch speculatively. */
	DMU_READ_NO_DECRYPT	= 0x010,	/* Don't decrypt. */
	DMU_DIRECTIO		= 0x020,	/* Bypass ARC. */
	DMU_UNCACHEDIO		= 0x040,	/* Reduce caching. */
	DMU_PARTIAL_FIRST	= 0x080,	/* First partial access. */
	DMU_PARTIAL_MORE	= 0x100,	/* Following partial access. */
	DMU_KEEP_CACHING	= 0x200,	/* Don't affect caching. */

	/* DB_RF_* spellings of the DMU_READ_* bits above. */
	DB_RF_NOPREFETCH	= DMU_READ_NO_PREFETCH,
	DB_RF_NO_DECRYPT	= DMU_READ_NO_DECRYPT
} dmu_flags_t;

/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
Expand All @@ -547,7 +567,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag,
dmu_buf_t **dbp);
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
uint32_t flags);
dmu_flags_t flags);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
Expand All @@ -558,9 +578,9 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
* Special spill buffer support used by "SA" framework
*/

int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags,
const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags,
const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);

Expand All @@ -579,17 +599,17 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
* The object number must be a valid, allocated object number.
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **, int flags);
const void *tag, dmu_buf_t **, dmu_flags_t flags);
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags);
const void *tag, dmu_buf_t **dbp, dmu_flags_t flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);
dmu_buf_t ***dbpp, dmu_flags_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp);

Expand Down Expand Up @@ -781,6 +801,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
*/
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags);
boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
Expand Down Expand Up @@ -874,40 +895,36 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
* Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error.
*/
#define DMU_READ_PREFETCH 0 /* prefetch */
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
#define DMU_DIRECTIO 4 /* use Direct I/O */

int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags);
void *buf, dmu_flags_t flags);
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags);
dmu_flags_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx, uint32_t flags);
const void *buf, dmu_tx_t *tx, dmu_flags_t flags);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx);
#ifdef _KERNEL
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx);
struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx);
struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
#define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf
extern uint_t zfs_max_recordsize;

Expand Down
10 changes: 6 additions & 4 deletions include/sys/dmu_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,13 @@ void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);

int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *);
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags);
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *);
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t);
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t,
dmu_tx_t *);
#if defined(_KERNEL)
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t);
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *);
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t);
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t,
dmu_tx_t *);
#endif

#ifdef __cplusplus
Expand Down
5 changes: 3 additions & 2 deletions include/sys/dmu_zfetch.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
boolean_t);
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t,
boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t, boolean_t);


#ifdef __cplusplus
Expand Down
4 changes: 2 additions & 2 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -981,9 +981,9 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
uint64_t extents_skipped, uint64_t bytes_skipped,
uint64_t extents_failed, uint64_t bytes_failed);
extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags);
dmu_flags_t flags);
extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags);
dmu_flags_t flags);
extern void spa_import_progress_add(spa_t *spa);
extern void spa_import_progress_remove(uint64_t spa_guid);
extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
Expand Down
6 changes: 4 additions & 2 deletions include/sys/zfs_racct.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
/*
* Platform-dependent resource accounting hooks
*/
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags);
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags);

#endif /* _SYS_ZFS_RACCT_H */
4 changes: 2 additions & 2 deletions lib/libzpool/zfs_racct.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@
#include <sys/zfs_racct.h>

/*
 * Read-side resource accounting hook.  This implementation performs no
 * accounting: every argument is explicitly discarded with a (void) cast
 * and the function returns without doing anything else.
 */
void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}

/*
 * Write-side resource accounting hook.  This implementation performs no
 * accounting: every argument is explicitly discarded with a (void) cast
 * and the function returns without doing anything else.
 */
void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}
7 changes: 6 additions & 1 deletion man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -304,14 +304,19 @@ Default dnode block size as a power of 2.
.It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int
Default dnode indirect block size as a power of 2.
.
.It Sy zfs_dio_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int
.It Sy zfs_dio_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Enable Direct I/O.
If this setting is 0, then all I/O requests will be directed through the ARC
acting as though the dataset property
.Sy direct
was set to
.Sy disabled .
.
.It Sy zfs_dio_strict Ns = Ns Sy 0 Ns | Ns 1 Pq int
Strictly enforce alignment for Direct I/O requests, returning
.Sy EINVAL
if not page-aligned instead of silently falling back to uncached I/O.
.
.It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
When attempting to log an output nvlist of an ioctl in the on-disk history,
the output will not be stored if it is larger than this size (in bytes).
Expand Down
Loading
Loading