diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index e3b3c0d7d0e..6481ccd8efb 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -411,6 +411,7 @@ struct dfuse_event { union { size_t de_req_len; size_t de_readahead_len; + struct dfuse_info *de_di; }; void (*de_complete_cb)(struct dfuse_event *ev); struct stat de_attr; @@ -457,6 +458,7 @@ struct dfuse_pool { ACTION(CREATE) \ ACTION(MKNOD) \ ACTION(FGETATTR) \ + ACTION(PRE_GETATTR) \ ACTION(GETATTR) \ ACTION(FSETATTR) \ ACTION(SETATTR) \ @@ -535,6 +537,9 @@ struct dfuse_cont { double dfc_dentry_dir_timeout; double dfc_ndentry_timeout; double dfc_data_timeout; + + double dfc_dentry_inval_time; + bool dfc_data_otoc; bool dfc_direct_io_disable; bool dfc_wb_cache; @@ -551,9 +556,6 @@ struct dfuse_cont { #define DFUSE_IE_STAT_ADD(_ie, _stat) \ atomic_fetch_add_relaxed(&(_ie)->ie_dfs->dfs_stat_value[(_stat)], 1) -void -dfuse_set_default_cont_cache_values(struct dfuse_cont *dfc); - /* Connect to a container via a label * Called either for labels on the command line or via dfuse_cont_get_handle() if opening via uuid * @@ -749,7 +751,7 @@ dfuse_loop(struct dfuse_info *dfuse_info); _Static_assert(IS_OH(_oh), "Param is not open handle"); \ (_oh)->doh_ie = NULL; \ __rc = fuse_reply_err(req, 0); \ - if (__rc != 0) \ + if ((__rc != 0) && (__rc != -ENOENT)) \ DS_ERROR(-__rc, "fuse_reply_err() error"); \ } while (0) @@ -999,6 +1001,36 @@ struct dfuse_inode_entry { /** File has been unlinked from daos */ bool ie_unlinked; + /* Data cache metadata, list known size/mtime for file, if these have been updated then + * the data cache should be dropped. + * + * When the last fd on a file is closed and all writes are completed then dfuse will launch + * a stat to get the update size/mtime for the inode. Future opens should block on this + * stat in order to know if the file has been updated. 
+ * + * Access is controlled via atomics and semaphore, when a decision to make the stat is taken + * then active is increased, and the sem is posted. + * + * Future accesses of the inode should check active, if the value is 0 then there is nothing + * to do. + * If active is positive then it should increase active, wait on the semaphore, decrease + * active and then post the semaphore if active != 0; + * + * After active is 0, (or the semaphore has been waited on) then the new stat structure is + * valid. + * + * The release() code to initialize stat is atomic as it's only triggered by the last + * release on an inode. It could race with open() where the inode is known in advance + * or create() where it is not. Open will flush the stat before setting keep_cache. + */ + struct { + struct stat stat; + bool valid; + ATOMIC uint32_t active; + struct timespec last_update; + sem_t sem; + } ie_dc; + /* Lock for writes, shared locks are held during write-back reads, exclusive lock is * acquired and released to flush outstanding writes for getattr, close and forget. */ @@ -1016,6 +1048,8 @@ struct dfuse_inode_entry { struct read_chunk_core *ie_chunk; }; +void +dfuse_ie_cs_flush(struct dfuse_inode_entry *ie); /* Flush write-back cache writes to a inode. It does this by waiting for and then releasing an * exclusive lock on the inode. Writes take a shared lock so this will block until all pending * writes are complete. 
@@ -1136,6 +1170,11 @@ dfuse_mcache_get_valid(struct dfuse_inode_entry *ie, double max_age, double *tim bool dfuse_dentry_get_valid(struct dfuse_inode_entry *ie, double max_age, double *timeout); +void +dfuse_dc_cache_set_time(struct dfuse_inode_entry *ie); +bool +dfuse_dc_cache_get_valid(struct dfuse_inode_entry *ie, double max_age, double *timeout); + /* inval.c */ int diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index 4f654fa3209..fe098d5a339 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -752,7 +752,7 @@ dfuse_cont_get_cache(struct dfuse_info *dfuse_info, struct dfuse_cont *dfc) * dentries which represent directories and are therefore referenced much * more often during path-walk activities are set to five seconds. */ -void +static void dfuse_set_default_cont_cache_values(struct dfuse_cont *dfc) { dfc->dfc_attr_timeout = 1; @@ -876,6 +876,8 @@ dfuse_cont_open(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, const cha uuid_copy(dfc->dfc_uuid, c_info.ci_uuid); if (dfuse_info->di_caching) { + double timeout; + rc = dfuse_cont_get_cache(dfuse_info, dfc); if (rc == ENODATA) { DFUSE_TRA_INFO(dfc, "Using default caching values"); @@ -883,6 +885,20 @@ dfuse_cont_open(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, const cha } else if (rc != 0) { D_GOTO(err_umount, rc); } + + /* Set the timeout for invalidate. This is the duration after which inodes + * will be evicted from the kernel. Data timeout is considered here as + * well as attrs are used to decide if keeping the data in cache is + * correct. Data timeout can be set to True/-1 so cap this duration at + * ten minutes or nothing would ever get evicted. 
+ */ + timeout = max(dfc->dfc_attr_timeout, dfc->dfc_data_timeout); + + timeout = min(timeout, 10 * 60); + + timeout = max(timeout, dfc->dfc_dentry_timeout); + + dfc->dfc_dentry_inval_time = timeout + 3; } rc = ival_add_cont_buckets(dfc); @@ -1048,6 +1064,51 @@ dfuse_mcache_get_valid(struct dfuse_inode_entry *ie, double max_age, double *tim return use; } +/* Set a timer to mark cache entry as valid */ +void +dfuse_dc_cache_set_time(struct dfuse_inode_entry *ie) +{ + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + ie->ie_dc.last_update = now; +} + +bool +dfuse_dc_cache_get_valid(struct dfuse_inode_entry *ie, double max_age, double *timeout) +{ + bool use = false; + struct timespec now; + struct timespec left; + double time_left; + + D_ASSERT(max_age != -1); + D_ASSERT(max_age >= 0); + + if (ie->ie_dc.last_update.tv_sec == 0) + return false; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + + left.tv_sec = now.tv_sec - ie->ie_dc.last_update.tv_sec; + left.tv_nsec = now.tv_nsec - ie->ie_dc.last_update.tv_nsec; + if (left.tv_nsec < 0) { + left.tv_sec--; + left.tv_nsec += 1000000000; + } + time_left = max_age - (left.tv_sec + ((double)left.tv_nsec / 1000000000)); + if (time_left > 0) { + use = true; + + DFUSE_TRA_DEBUG(ie, "Allowing cache use, time remaining: %.1lf", time_left); + + if (timeout) + *timeout = time_left; + } + + return use; +} + bool dfuse_dentry_get_valid(struct dfuse_inode_entry *ie, double max_age, double *timeout) { @@ -1251,6 +1312,7 @@ dfuse_ie_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) atomic_init(&ie->ie_open_count, 0); atomic_init(&ie->ie_open_write_count, 0); atomic_init(&ie->ie_il_count, 0); + sem_init(&ie->ie_dc.sem, 0, 0); atomic_init(&ie->ie_linear_read, true); atomic_fetch_add_relaxed(&dfuse_info->di_inode_count, 1); D_INIT_LIST_HEAD(&ie->ie_evict_entry); diff --git a/src/client/dfuse/dfuse_fuseops.c b/src/client/dfuse/dfuse_fuseops.c index 980e71ac9af..7dcac44bab8 100644 --- 
a/src/client/dfuse/dfuse_fuseops.c +++ b/src/client/dfuse/dfuse_fuseops.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -54,6 +54,12 @@ dfuse_show_flags(void *handle, unsigned int cap, unsigned int want) #ifdef FUSE_CAP_EXPLICIT_INVAL_DATA SHOW_FLAG(handle, cap, want, FUSE_CAP_EXPLICIT_INVAL_DATA); #endif +#ifdef FUSE_CAP_EXPIRE_ONLY + SHOW_FLAG(handle, cap, want, FUSE_CAP_EXPIRE_ONLY); +#endif +#ifdef FUSE_CAP_SETXATTR_EXT + SHOW_FLAG(handle, cap, want, FUSE_CAP_SETXATTR_EXT); +#endif if (cap) DFUSE_TRA_WARNING(handle, "Unknown capability flags %#x", cap); @@ -90,6 +96,12 @@ dfuse_fuse_init(void *arg, struct fuse_conn_info *conn) conn->want |= FUSE_CAP_READDIRPLUS; conn->want |= FUSE_CAP_READDIRPLUS_AUTO; +#ifdef FUSE_CAP_EXPLICIT_INVAL_DATA + /* DAOS-15338 Do not let the kernel evict data on mtime changes */ + conn->want |= FUSE_CAP_EXPLICIT_INVAL_DATA; + conn->want &= ~FUSE_CAP_AUTO_INVAL_DATA; +#endif + #ifdef FUSE_CAP_CACHE_SYMLINKS conn->want |= FUSE_CAP_CACHE_SYMLINKS; #endif @@ -164,6 +176,22 @@ df_ll_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) DFUSE_IE_STAT_ADD(inode, DS_GETATTR); } + /* Check for stat-after-write/close. On close a stat is performed so the first getattr + * call can use the result of that. 
+ */ + if (inode->ie_dfs->dfc_attr_timeout) { + double timeout; + dfuse_ie_cs_flush(inode); + + if (dfuse_dc_cache_get_valid(inode, inode->ie_dfs->dfc_attr_timeout, &timeout)) { + if (inode->ie_dc.valid) { + DFUSE_IE_STAT_ADD(inode, DS_PRE_GETATTR); + inode->ie_dc.valid = false; + DFUSE_REPLY_ATTR_FORCE(inode, req, timeout); + return; + } + } + } DFUSE_IE_WFLUSH(inode); if (inode->ie_dfs->dfc_attr_timeout && diff --git a/src/client/dfuse/inval.c b/src/client/dfuse/inval.c index 740d1eb52e6..7bc6185193d 100644 --- a/src/client/dfuse/inval.c +++ b/src/client/dfuse/inval.c @@ -344,7 +344,7 @@ ival_update_inode(struct dfuse_inode_entry *inode, double timeout) if (S_ISDIR(inode->ie_stat.st_mode)) timeout += INVAL_DIRECTORY_GRACE; else - timeout += INVAL_FILE_GRACE; + timeout = inode->ie_dfs->dfc_dentry_inval_time; clock_gettime(CLOCK_MONOTONIC_COARSE, &now); @@ -444,7 +444,7 @@ ival_bucket_dec_value(double timeout) } /* Ensure the correct buckets exist for a attached container. Pools have a zero dentry timeout - * so skip zero values + * so skip zero values. 
*/ int ival_add_cont_buckets(struct dfuse_cont *dfc) @@ -457,7 +457,7 @@ ival_add_cont_buckets(struct dfuse_cont *dfc) if (rc != 0) goto out; if (dfc->dfc_dentry_timeout != 0) { - rc = ival_bucket_add_value(dfc->dfc_dentry_timeout + INVAL_FILE_GRACE); + rc = ival_bucket_add_value(dfc->dfc_dentry_inval_time); if (rc != 0) ival_bucket_dec_value(dfc->dfc_dentry_dir_timeout + INVAL_DIRECTORY_GRACE); } @@ -473,7 +473,7 @@ ival_dec_cont_buckets(struct dfuse_cont *dfc) { D_MUTEX_LOCK(&ival_lock); if (dfc->dfc_dentry_timeout != 0) - ival_bucket_dec_value(dfc->dfc_dentry_timeout + INVAL_FILE_GRACE); + ival_bucket_dec_value(dfc->dfc_dentry_inval_time); ival_bucket_dec_value(dfc->dfc_dentry_dir_timeout + INVAL_DIRECTORY_GRACE); D_MUTEX_UNLOCK(&ival_lock); } diff --git a/src/client/dfuse/ops/fgetattr.c b/src/client/dfuse/ops/fgetattr.c index 6fdee73515c..62645290f0e 100644 --- a/src/client/dfuse/ops/fgetattr.c +++ b/src/client/dfuse/ops/fgetattr.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -10,14 +10,28 @@ static void dfuse_cb_getattr_cb(struct dfuse_event *ev) { + struct dfuse_inode_entry *ie = ev->de_ie; + if (ev->de_ev.ev_error != 0) { - DFUSE_REPLY_ERR_RAW(ev->de_ie, ev->de_req, ev->de_ev.ev_error); + DFUSE_REPLY_ERR_RAW(ie, ev->de_req, ev->de_ev.ev_error); D_GOTO(release, 0); } - ev->de_attr.st_ino = ev->de_ie->ie_stat.st_ino; + ev->de_attr.st_ino = ie->ie_stat.st_ino; + + ie->ie_stat = ev->de_attr; - ev->de_ie->ie_stat = ev->de_attr; + if ((ie->ie_dfs->dfc_data_timeout != 0) && + (dfuse_dcache_get_valid(ie, ie->ie_dfs->dfc_data_timeout))) { + /* This tries to match the code in fuse_change_attributes in fs/fuse/inode.c, + * if the mtime or the size has changed then drop the data cache. 
+ */ + if ((ie->ie_stat.st_size != ie->ie_dc.stat.st_size) || + (d_timediff_ns(&ie->ie_stat.st_mtim, &ie->ie_dc.stat.st_mtim) != 0)) { + DFUSE_TRA_DEBUG(ie, "Invalidating data cache"); + dfuse_dcache_evict(ie); + } + } DFUSE_REPLY_ATTR(ev->de_ie, ev->de_req, &ev->de_attr); release: diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 772e92e1c13..bfc8fbb58d0 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -54,25 +54,39 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if (rc) D_GOTO(err, rc); + /* + * dfs_dup() just locally duplicates the file handle. If we have + * O_TRUNC flag, we need to truncate the file manually. + */ + if (fi->flags & O_TRUNC) { + rc = dfs_punch(ie->ie_dfs->dfs_ns, ie->ie_obj, 0, DFS_MAX_FSIZE); + if (rc) + D_GOTO(err, rc); + dfuse_dcache_evict(oh->doh_ie); + } + + /* There are no failure points between here and the kernel reply */ + if ((fi->flags & O_ACCMODE) != O_RDONLY) oh->doh_writeable = true; if (ie->ie_dfs->dfc_data_timeout != 0) { - if (fi->flags & O_DIRECT) + if (fi->flags & O_DIRECT) { fi_out.direct_io = 1; - - /* If the file is already open or (potentially) in cache then allow any existing - * kernel cache to be used. If not then use pre-read. - * This should mean that pre-read is only used on the first read, and on files - * which pre-existed in the container. - */ - - if (atomic_load_relaxed(&ie->ie_open_count) > 0 || - ((ie->ie_dcache_last_update.tv_sec != 0) && - dfuse_dcache_get_valid(ie, ie->ie_dfs->dfc_data_timeout))) { - fi_out.keep_cache = 1; } else { - prefetch = true; + dfuse_ie_cs_flush(ie); + + /* If the file is already open or (potentially) in cache then allow any + * existing kernel cache to be used. If not then use pre-read. This should + * mean that pre-read is only used on the first read, and on files which + * pre-existed in the container. 
+ */ + if (atomic_load_relaxed(&ie->ie_open_count) > 0 || + dfuse_dcache_get_valid(ie, ie->ie_dfs->dfc_data_timeout)) { + fi_out.keep_cache = 1; + } else { + prefetch = true; + } } } else if (ie->ie_dfs->dfc_data_otoc) { /* Open to close caching, this allows the use of shared mmap */ @@ -85,6 +99,8 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) fi_out.direct_io = 1; } + atomic_fetch_add_relaxed(&ie->ie_open_count, 1); + if (ie->ie_dfs->dfc_direct_io_disable) fi_out.direct_io = 0; @@ -93,19 +109,6 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) fi_out.fh = (uint64_t)oh; - /* - * dfs_dup() just locally duplicates the file handle. If we have - * O_TRUNC flag, we need to truncate the file manually. - */ - if (fi->flags & O_TRUNC) { - rc = dfs_punch(ie->ie_dfs->dfs_ns, ie->ie_obj, 0, DFS_MAX_FSIZE); - if (rc) - D_GOTO(err, rc); - dfuse_dcache_evict(oh->doh_ie); - } - - atomic_fetch_add_relaxed(&ie->ie_open_count, 1); - /* Enable this for files up to the max read size. 
*/ if (prefetch && oh->doh_parent_dir && atomic_load_relaxed(&oh->doh_parent_dir->ie_linear_read) && ie->ie_stat.st_size > 0 && @@ -132,6 +135,101 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) DFUSE_REPLY_ERR_RAW(ie, req, rc); } +void +dfuse_ie_cs_flush(struct dfuse_inode_entry *ie) +{ + uint32_t new = 2; + uint32_t old = 1; + +restart: + if (atomic_compare_exchange(&ie->ie_dc.active, old, new)) { + /* ie_dc.active was 1, now it's 2 */ + sem_wait(&ie->ie_dc.sem); + + } else if (old == 0) { + /* ie_dc.active was and is 0 */ + return; + } else { + new = old + 1; + + /* ie_dc.active was > 1 and is unchanged */ + if (!atomic_compare_exchange(&ie->ie_dc.active, old, new)) + goto restart; + + /* ie_dc.active was > 1 and is now incremented */ + + sem_wait(&ie->ie_dc.sem); + } + + old = atomic_fetch_sub(&ie->ie_dc.active, 1); + + if (old > 1) + sem_post(&ie->ie_dc.sem); +} + +static void +getattr_cb(struct dfuse_event *ev) +{ + if (ev->de_ev.ev_error != 0) { + ev->de_ie->ie_dc.valid = false; + goto free; + } + + ev->de_ie->ie_dc.stat.st_ino = ev->de_ie->ie_stat.st_ino; + ev->de_ie->ie_dc.valid = true; + dfuse_dc_cache_set_time(ev->de_ie); + + /* Data will have been read directly into ie->ie_dc.stat */ + +free: + atomic_fetch_sub(&ev->de_ie->ie_dc.active, 1); + sem_post(&ev->de_ie->ie_dc.sem); + dfuse_inode_decref(ev->de_di, ev->de_ie); + D_FREE(ev); +} + +void +size_resample(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) +{ + struct dfuse_event *ev; + uint64_t eqt_idx; + struct dfuse_eq *eqt; + int rc; + + if (ie->ie_unlinked) + return; + + eqt_idx = atomic_fetch_add_relaxed(&dfuse_info->di_eqt_idx, 1); + eqt = &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count]; + D_ALLOC_PTR(ev); + if (ev == NULL) + return; + + ev->de_complete_cb = getattr_cb; + ev->de_ie = ie; + ev->de_di = dfuse_info; + + atomic_fetch_add_relaxed(&ie->ie_ref, 1); + + rc = daos_event_init(&ev->de_ev, eqt->de_eq, NULL); + if (rc != -DER_SUCCESS) + goto err; + + rc = 
dfs_ostatx(ie->ie_dfs->dfs_ns, ie->ie_obj, &ie->ie_dc.stat, &ev->de_ev); + if (rc != 0) + goto err; + + atomic_fetch_add(&ie->ie_dc.active, 1); + + sem_post(&eqt->de_sem); + + return; +err: + dfuse_inode_decref(ev->de_di, ev->de_ie); + ie->ie_dc.valid = false; + D_FREE(ev); +} + void dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { @@ -141,6 +239,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) int rc; uint32_t oc; uint32_t il_calls; + bool update_data_cache = false; /* Perform the opposite of what the ioctl call does, always change the open handle count * but the inode only tracks number of open handles with non-zero ioctl counts @@ -185,7 +284,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if (il_calls == 0) { DFUSE_TRA_DEBUG(oh, "Evicting metadata cache, setting data cache"); dfuse_mcache_evict(oh->doh_ie); - dfuse_dcache_set_time(oh->doh_ie); + update_data_cache = true; } else { DFUSE_TRA_DEBUG(oh, "Evicting cache"); dfuse_cache_evict(oh->doh_ie); @@ -196,6 +295,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if (oh->doh_caching) { if (il_calls == 0) { DFUSE_TRA_DEBUG(oh, "Saving data cache"); + /* Set the time here but do not re-sample as there are no writes. 
*/ dfuse_dcache_set_time(oh->doh_ie); } else { DFUSE_TRA_DEBUG(oh, "Evicting cache"); @@ -204,9 +304,9 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) } } DFUSE_TRA_DEBUG(oh, "il_calls %d, caching %d,", il_calls, oh->doh_caching); - if (il_calls != 0) { + if (il_calls != 0) atomic_fetch_sub_relaxed(&oh->doh_ie->ie_il_count, 1); - } + oc = atomic_fetch_sub_relaxed(&oh->doh_ie->ie_open_count, 1); if (oc == 1) { if (read_chunk_close(oh->doh_ie)) @@ -218,6 +318,13 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) atomic_fetch_add_relaxed(&ie->ie_ref, 1); } + if (update_data_cache) { + dfuse_dcache_set_time(oh->doh_ie); + if (oc == 1) + /* Fire off a ostatx call to fetch the new mtime */ + size_resample(dfuse_info, oh->doh_ie); + } + rc = dfs_release(oh->doh_obj); if (rc == 0) { DFUSE_REPLY_ZERO_OH(oh, req); diff --git a/src/client/dfuse/ops/setattr.c b/src/client/dfuse/ops/setattr.c index 7ba3e0e1de3..12217891cf3 100644 --- a/src/client/dfuse/ops/setattr.c +++ b/src/client/dfuse/ops/setattr.c @@ -15,6 +15,8 @@ dfuse_cb_setattr(fuse_req_t req, struct dfuse_inode_entry *ie, struct stat *attr DFUSE_TRA_DEBUG(ie, "flags %#x", to_set); + dfuse_ie_cs_flush(ie); + if (ie->ie_unlinked) { DFUSE_TRA_DEBUG(ie, "File is unlinked, returning most recent data"); diff --git a/src/tests/ftest/dfuse/caching_check.py b/src/tests/ftest/dfuse/caching_check.py index 852a24f0dfd..2d9332a6dc1 100644 --- a/src/tests/ftest/dfuse/caching_check.py +++ b/src/tests/ftest/dfuse/caching_check.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2019-2023 Intel Corporation. + (C) Copyright 2019-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -45,13 +45,16 @@ def test_dfuse_caching_check(self): self.log_step('Write to the dfuse mount point') self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) + self.dfuse.get_stats() self.log_step('Get baseline read performance from dfuse with caching disabled') self.ior_cmd.update_params(flags=flags[1]) base_read_arr = [] out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) + self.dfuse.get_stats() base_read_arr.append(IorCommand.get_ior_metrics(out)) out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) + self.dfuse.get_stats() base_read_arr.append(IorCommand.get_ior_metrics(out)) # the index of max_mib @@ -64,10 +67,12 @@ def test_dfuse_caching_check(self): self.log_step('Get first read performance with caching enabled') out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) + self.dfuse.get_stats() base_read_arr.append(IorCommand.get_ior_metrics(out)) self.log_step('Get cached read performance') - out = self.run_ior_with_pool(fail_on_warning=False) + out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) + self.dfuse.get_stats() with_caching = IorCommand.get_ior_metrics(out) self.log_step('Verify cached read performance is greater than first read') diff --git a/src/tests/ftest/dfuse/read.py b/src/tests/ftest/dfuse/read.py index 607b8f99f9f..b6a390a1daf 100644 --- a/src/tests/ftest/dfuse/read.py +++ b/src/tests/ftest/dfuse/read.py @@ -126,3 +126,87 @@ def test_dfuse_pre_read(self): ) self.assertEqual(data["inodes"], 4, "expected 4 inodes in cache") + + +class DFuseReadTest(TestWithServers): + """Base ReadTest test class. + + :avocado: recursive + """ + + def test_dfuse_read(self): + """ + Test Description: + Run a simple Write/Read test to check for read caching. + + Write a file, then read from it and verify that there were no reads at the dfuse level. + + Evict the file, read from it twice and verify the second read comes from cache. 
+ + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=dfuse,dfs + :avocado: tags=DFuseReadTest,test_dfuse_read + """ + + pool = self.get_pool(connect=False) + container = self.get_container(pool) + + dfuse = get_dfuse(self, self.hostlist_clients) + + cont_attrs = {} + + cont_attrs["dfuse-data-cache"] = "1h" + cont_attrs["dfuse-attr-time"] = "1h" + cont_attrs["dfuse-dentry-time"] = "1h" + cont_attrs["dfuse-ndentry-time"] = "1h" + + container.set_attr(attrs=cont_attrs) + + start_dfuse(self, dfuse, pool, container) + + fuse_root_dir = dfuse.mount_dir.value + + cmd = f"dd if=/dev/zero of={fuse_root_dir}/test_file count=16 bs=1M" + result = run_remote(self.log, self.hostlist_clients, cmd) + if not result.passed: + self.fail(f'"{cmd}" failed on {result.failed_hosts}') + + cmd = f"dd if={fuse_root_dir}/test_file of=/dev/zero count=16 bs=1M" + result = run_remote(self.log, self.hostlist_clients, cmd) + if not result.passed: + self.fail(f'"{cmd}" failed on {result.failed_hosts}') + + data = dfuse.get_stats() + + self.assertEqual( + data["statistics"].get("read", 0), 0, "Did not expect any read calls" + ) + + cmd = f"daos filesystem evict {fuse_root_dir}/test_file" + result = run_remote(self.log, self.hostlist_clients, cmd) + if not result.passed: + self.fail(f'"{cmd}" failed on {result.failed_hosts}') + + cmd = f"dd if={fuse_root_dir}/test_file of=/dev/zero count=16 bs=1M" + result = run_remote(self.log, self.hostlist_clients, cmd) + if not result.passed: + self.fail(f'"{cmd}" failed on {result.failed_hosts}') + + data = dfuse.get_stats() + + self.assertGreater( + data["statistics"].get("read", 0), 0, "expected non-zero pre read" + ) + + result = run_remote(self.log, self.hostlist_clients, cmd) + if not result.passed: + self.fail(f'"{cmd}" failed on {result.failed_hosts}') + + data2 = dfuse.get_stats() + + self.assertEqual( + data["statistics"].get("read", 0), + data2["statistics"].get("read", 0), + "Did not expect more read calls", + ) diff --git 
a/src/tests/ftest/util/dfuse_utils.py +++ b/src/tests/ftest/util/dfuse_utils.py @@ -384,6 +384,12 @@ def get_stats(self): """Return the I/O stats for the filesystem Only works if there is one entry in the client list. + + Raises: + CommandFailure: if the command fails + + Returns: + dict: the json response """ if len(self.hosts) != 1: @@ -392,11 +398,11 @@ def get_stats(self): cmd = f"daos filesystem query --json {self.mount_dir.value}" result = run_remote(self.log, self.hosts, cmd) if not result.passed: - raise CommandFailure(f'"fs query failed on {result.failed_hosts}') + raise CommandFailure(f"fs query failed on {result.failed_hosts}") data = json.loads("\n".join(result.output[0].stdout)) if data["status"] != 0 or data["error"] is not None: - raise CommandFailure("fs query returned bad data.") + raise CommandFailure("fs query returned bad data") return data["response"] diff --git a/utils/cq/words.dict b/utils/cq/words.dict index dbd7d6e5826..44917e11c50 100644 --- a/utils/cq/words.dict +++ b/utils/cq/words.dict @@ -138,6 +138,7 @@ debian debuginfo defusedxml del +dentry deps dereference dereferencing diff --git a/utils/node_local_test.py b/utils/node_local_test.py index c7c7dd6a6d0..5db5cb2fed1 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2295,6 +2295,83 @@ def test_pre_read(self): assert len(data5) == 0 assert raw_data1 == data6 + def test_read_from_cache(self): + """Test a basic read. + + Write to a file, then read from it. With write-through caching on then the read should come + from the page cache. Due to the way this is implemented the cache will be truncated down + to a page size so this test only works for whole pages. 
+ + The I/O that dfuse should see are: + create + write + release + stat + open + release + open + release + + After that update the file time from a different path and re-read the file, this should + trigger an actual read due to the file time changing triggering a cache invalidation. + """ + + # The value of attr-time. Too short and the test won't run fast enough and will fail, + # too long and the wall-clock time will be affected. + attr_time = 30 + + self.container.set_attrs({'dfuse-attr-time': str(attr_time), + 'dfuse-data-cache': '5m', + 'dfuse-dentry-time': '5m'}) + + dfuse0 = DFuse(self.server, + self.conf, + caching=True, + wbcache=False, + container=self.container) + dfuse0.start(v_hint='rfc') + + file_name = join(dfuse0.dir, 'file') + + # Create the file. + subprocess.run(["dd", "if=/dev/zero", f"of={file_name}", 'count=1', 'bs=4k'], check=True) + + start = time.time() + + # Read it after write, this should come from cache. + subprocess.run(["dd", "of=/dev/zero", f"if={file_name}", 'count=4k', 'bs=1'], check=True) + sd = dfuse0.check_usage() + + assert sd["statistics"].get("read", 0) == 0, sd + + # Read it after read, this should also come from cache. + subprocess.run(["dd", "of=/dev/zero", f"if={file_name}", 'count=1', 'bs=4k'], check=True) + sd = dfuse0.check_usage() + assert sd["statistics"].get("read", 0) == 0, sd + + elapsed = time.time() - start + + to_sleep = attr_time + 5 - elapsed + + if to_sleep < 5: + raise NLTestFail("attr_time not high enough") + + print(f"Sleeping for {to_sleep} seconds") + time.sleep(to_sleep) + + # Now the attr time has expired but the dentry and data caches are still valid. At this + # point change the file time using touch and then re-read the file which should perform + # an actual read this time. + self.server.run_daos_client_cmd_pil4dfs(['touch', 'file'], container=self.container) + + # Read it after attr timeout, this should not come from cache due to the times changing. 
+ subprocess.run(["dd", "of=/dev/zero", f"if={file_name}", 'count=1', 'bs=4k'], check=True) + sd = dfuse0.check_usage() + assert sd["statistics"].get("read", 0) > 0, sd + + if dfuse0.stop(): + self.fatal_errors = True + def test_two_mounts(self): """Create two mounts, and check that a file created in one can be read from the other""" dfuse0 = DFuse(self.server, @@ -3463,11 +3540,13 @@ def test_cont_chown(self): if dfuse.stop(): self.fatal_errors = True - @needs_dfuse + @needs_dfuse_with_opt(caching=False) def test_complex_rename(self): """Test for rename semantics - Check that that rename is correctly updating the dfuse data for the moved file. + Check that rename is correctly updating the dfuse data for the moved file. For this + test to work correctly with caching then the attr-timeout needs to be shorter than the + time it takes to spin up the second DFuse instance, so do not run this with caching on. # Create a file, read/write to it. # Check fstat works.