diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index 80ff5926442..8a2fc1f0fa2 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2022-2023 Intel Corporation. + * (C) Copyright 2022-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -61,7 +61,7 @@ #define MAX_DAOS_MT (8) #define READ_DIR_BATCH_SIZE (96) -#define MAX_FD_DUP2ED (8) +#define MAX_FD_DUP2ED (16) #define MAX_MMAP_BLOCK (64) @@ -78,6 +78,14 @@ #define MAX_EQ 64 +/* the default min fd that will be used by DAOS */ +#define DAOS_MIN_FD 10 + +/* the number of low fd reserved */ +static uint16_t low_fd_count; +/* the list of low fd reserved */ +static int low_fd_list[DAOS_MIN_FD]; + /* In case of fork(), only the parent process could destroy daos env. */ static bool context_reset; static __thread daos_handle_t td_eqh; @@ -126,7 +134,7 @@ static _Atomic uint32_t daos_init_cnt; static bool report; static long int page_size; -static bool daos_inited; +static _Atomic bool daos_inited; static bool daos_debug_inited; static int num_dfs; static struct dfs_mt dfs_list[MAX_DAOS_MT]; @@ -185,7 +193,6 @@ struct mmap_obj { struct fd_dup2 { int fd_src, fd_dest; - bool dest_closed; }; /* Add the data structure for statx_timestamp and statx @@ -233,7 +240,7 @@ struct statx { #endif /* working dir of current process */ -static char cur_dir[DFS_MAX_PATH] = ""; +static char cur_dir[DFS_MAX_PATH + 1] = ""; static bool segv_handler_inited; /* Old segv handler */ struct sigaction old_segv; @@ -241,6 +248,7 @@ struct sigaction old_segv; /* the flag to indicate whether initlization is finished or not */ static bool hook_enabled; static bool hook_enabled_bak; +static pthread_mutex_t lock_reserve_fd; static pthread_mutex_t lock_dfs; static pthread_mutex_t lock_fd; static pthread_mutex_t lock_dirfd; @@ -403,6 +411,8 @@ static ssize_t (*next_pwrite)(int fd, const void *buf, size_t size, off_t offset static off_t (*libc_lseek)(int fd, off_t offset, int whence); static off_t (*pthread_lseek)(int fd, off_t offset, int whence); +static int new_fxstat(int vers, int fd, struct stat *buf); + static int (*next_fxstat)(int vers, int fd, struct stat *buf); static int (*next_fstat)(int fd, struct stat *buf); @@ -532,6 +542,8 @@ remove_dot_dot(char path[], int *len); static int remove_dot_and_cleanup(char szPath[], int len); +/* reference count of fake fd duplicated by real fd with dup2() */ +static int dup_ref_count[MAX_OPENED_FILE]; static struct file_obj *file_list[MAX_OPENED_FILE]; static struct dir_obj *dir_list[MAX_OPENED_DIR]; static struct mmap_obj mmap_list[MAX_MMAP_BLOCK]; @@ -548,7 +560,7 @@ find_next_available_dirfd(struct dir_obj *obj, int *new_fd); static int find_next_available_map(int *idx); static void -free_fd(int idx); +free_fd(int idx, bool closing_dup_fd); static void free_dirfd(int idx); static void @@ -899,7 +911,7 @@ child_hdlr(void) int rc; /* daos is not initialized yet */ - if (!daos_inited) + if (atomic_load_relaxed(&daos_inited) == false) return; daos_eq_lib_reset_after_fork(); @@ -913,6 +925,58 @@ child_hdlr(void) context_reset = true; } +/* only free the reserved low fds when application exits or encounters error */ +static void +free_reserved_low_fd(void) +{ + int i; + + for (i = 0; i < low_fd_count; i++) + libc_close(low_fd_list[i]); + low_fd_count = 0; +} + +/* some applications especially bash scripts use specific low fds directly. + * It would be safer to avoid using such low fds (fd < DAOS_MIN_FD) in daos. + * We consume such low fds before any daos calls and close them only when + * application exits or encounters error. + */ + +static int +consume_low_fd(void) +{ + int rc = 0; + + if (atomic_load_relaxed(&daos_inited) == true) + return 0; + + D_MUTEX_LOCK(&lock_reserve_fd); + low_fd_count = 0; + low_fd_list[low_fd_count] = libc_open("/", O_PATH | O_DIRECTORY); + while (1) { + if (low_fd_list[low_fd_count] < 0) { + DS_ERROR(errno, "failed to reserve a low fd"); + goto err; + } else if (low_fd_list[low_fd_count] >= DAOS_MIN_FD) { + libc_close(low_fd_list[low_fd_count]); + break; + } else { + low_fd_count++; + } + low_fd_list[low_fd_count] = libc_open("/", O_RDONLY); + } + + D_MUTEX_UNLOCK(&lock_reserve_fd); + return rc; + +err: + rc = errno; + free_reserved_low_fd(); + D_MUTEX_UNLOCK(&lock_reserve_fd); + + return rc; +} + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. @@ -962,7 +1026,7 @@ query_path(const char *szInput, int *is_target_path, dfs_obj_t **parent, char *i if (strncmp(szInput, ".", 2) == 0) { /* special case for current work directory */ - pt_end = stpncpy(full_path_parse, cur_dir, DFS_MAX_PATH); + pt_end = stpncpy(full_path_parse, cur_dir, DFS_MAX_PATH + 1); len = (int)(pt_end - full_path_parse); if (len >= DFS_MAX_PATH) { D_DEBUG(DB_ANY, "full_path_parse[] is not large enough: %d (%s)\n", @@ -1005,24 +1069,37 @@ query_path(const char *szInput, int *is_target_path, dfs_obj_t **parent, char *i if (idx_dfs >= 0) { /* trying to avoid lock as much as possible */ - if (!daos_inited) { + if (atomic_load_relaxed(&daos_inited) == false) { /* daos_init() is expensive to call. We call it only when necessary. */ + + rc = consume_low_fd(); + if (rc) { + DS_ERROR(rc, "consume_low_fd() failed"); + *is_target_path = 0; + goto out_normal; + } + rc = daos_init(); if (rc) { DL_ERROR(rc, "daos_init() failed"); *is_target_path = 0; goto out_normal; } + if (eq_count_max) { - rc = daos_eq_create(&td_eqh); - if (rc) - DL_WARN(rc, "daos_eq_create() failed"); - main_eqh = td_eqh; - rc = pthread_atfork(NULL, NULL, &child_hdlr); - D_ASSERT(rc == 0); + D_MUTEX_LOCK(&lock_eqh); + if (daos_handle_is_inval(main_eqh)) { + rc = daos_eq_create(&td_eqh); + if (rc) + DL_WARN(rc, "daos_eq_create() failed"); + main_eqh = td_eqh; + rc = pthread_atfork(NULL, NULL, &child_hdlr); + D_ASSERT(rc == 0); + } + D_MUTEX_UNLOCK(&lock_eqh); } - daos_inited = true; + atomic_store_relaxed(&daos_inited, true); atomic_fetch_add_relaxed(&daos_init_cnt, 1); } @@ -1100,6 +1177,7 @@ query_path(const char *szInput, int *is_target_path, dfs_obj_t **parent, char *i D_GOTO(out_err, rc); } } else { + strncpy(*full_path, full_path_parse, len + 1); *is_target_path = 0; item_name[0] = '\0'; } @@ -1296,6 +1374,7 @@ find_next_available_fd(struct file_obj *obj, int *new_fd) new_obj->ref_count++; file_list[idx] = new_obj; } + dup_ref_count[idx] = 0; if (next_free_fd > last_fd) last_fd = next_free_fd; next_free_fd = -1; @@ -1315,6 +1394,24 @@ find_next_available_fd(struct file_obj *obj, int *new_fd) return 0; } +static void +inc_dup_ref_count(int fd) +{ + D_MUTEX_LOCK(&lock_fd); + dup_ref_count[fd - FD_FILE_BASE]++; + file_list[fd - FD_FILE_BASE]->ref_count++; + D_MUTEX_UNLOCK(&lock_fd); +} + +static void +dec_dup_ref_count(int fd) +{ + D_MUTEX_LOCK(&lock_fd); + dup_ref_count[fd - FD_FILE_BASE]--; + file_list[fd - FD_FILE_BASE]->ref_count--; + D_MUTEX_UNLOCK(&lock_fd); +} + static int find_next_available_dirfd(struct dir_obj *obj, int *new_dir_fd) { @@ -1399,7 +1496,7 @@ find_next_available_map(int *idx) /* May need to support duplicated fd as duplicated dirfd too. */ static void -free_fd(int idx) +free_fd(int idx, bool closing_dup_fd) { int i, rc; struct file_obj *saved_obj = NULL; @@ -1412,9 +1509,15 @@ free_fd(int idx) return; } + if (closing_dup_fd) + dup_ref_count[idx]--; file_list[idx]->ref_count--; if (file_list[idx]->ref_count == 0) saved_obj = file_list[idx]; + if (dup_ref_count[idx] > 0) { + D_MUTEX_UNLOCK(&lock_fd); + return; + } file_list[idx] = NULL; if (idx < next_free_fd) @@ -1506,7 +1609,7 @@ free_map(int idx) mmap_list[idx].addr = NULL; /* Need to call free_fd(). */ if (file_list[mmap_list[idx].fd - FD_FILE_BASE]->idx_mmap >= MAX_MMAP_BLOCK) - free_fd(mmap_list[idx].fd - FD_FILE_BASE); + free_fd(mmap_list[idx].fd - FD_FILE_BASE, false); mmap_list[idx].fd = -1; if (idx < next_free_map) @@ -1547,46 +1650,23 @@ get_fd_redirected(int fd) return fd_ret; } -/* This fd is a fake fd. There exists a associated kernel fd with dup2. - * Need to check whether fd is in fd_dup2_list[], set dest_closed true - * if yes. Otherwise, close the fake fd. - */ -static void -close_dup_fd_dest_fakefd(int fd) -{ - int i; - - if (fd < FD_FILE_BASE) - return; - - D_MUTEX_LOCK(&lock_fd_dup2ed); - if (num_fd_dup2ed > 0) { - for (i = 0; i < MAX_FD_DUP2ED; i++) { - if (fd_dup2_list[i].fd_dest == fd) { - fd_dup2_list[i].dest_closed = true; - D_MUTEX_UNLOCK(&lock_fd_dup2ed); - return; - } - } - } - D_MUTEX_UNLOCK(&lock_fd_dup2ed); - - free_fd(fd - FD_FILE_BASE); -} - /* This fd is a fd from kernel and it is associated with a fake fd. - * Need to 1) close(fd) 2) remove the entry in fd_dup2_list[] 3) close - * the fake fd if dest_closed is true. + * Need to 1) close(fd) 2) remove the entry in fd_dup2_list[] 3) decrease + * the dup reference count of the fake fd. */ + static int -close_dup_fd_src(int (*next_close)(int fd), int fd) +close_dup_fd(int (*next_close)(int fd), int fd, bool close_fd) { int i, rc, idx_dup = -1, fd_dest = -1; - /* close the fd from kernel */ - rc = next_close(fd); - if (rc != 0) - return (-1); + if (close_fd) { + /* close the fd from kernel */ + assert(fd < FD_FILE_BASE); + rc = next_close(fd); + if (rc != 0) + return (-1); + } /* remove the fd_dup entry */ D_MUTEX_LOCK(&lock_fd_dup2ed); @@ -1594,12 +1674,10 @@ close_dup_fd_src(int (*next_close)(int fd), int fd) for (i = 0; i < MAX_FD_DUP2ED; i++) { if (fd_dup2_list[i].fd_src == fd) { idx_dup = i; - if (fd_dup2_list[i].dest_closed) - fd_dest = fd_dup2_list[i].fd_dest; + fd_dest = fd_dup2_list[i].fd_dest; /* clear the value to free */ fd_dup2_list[i].fd_src = -1; fd_dup2_list[i].fd_dest = -1; - fd_dup2_list[i].dest_closed = false; num_fd_dup2ed--; break; } @@ -1613,8 +1691,7 @@ close_dup_fd_src(int (*next_close)(int fd), int fd) errno = EINVAL; return (-1); } - if (fd_dest > 0) - free_fd(fd_dest - FD_FILE_BASE); + free_fd(fd_dest - FD_FILE_BASE, true); return 0; } @@ -1628,7 +1705,6 @@ init_fd_dup2_list(void) for (i = 0; i < MAX_FD_DUP2ED; i++) { fd_dup2_list[i].fd_src = -1; fd_dup2_list[i].fd_dest = -1; - fd_dup2_list[i].dest_closed = false; } D_MUTEX_UNLOCK(&lock_fd_dup2ed); } @@ -1638,6 +1714,9 @@ allocate_dup2ed_fd(const int fd_src, const int fd_dest) { int i; + /* increase reference count of the fake fd */ + inc_dup_ref_count(fd_dest); + /* Not many applications use dup2(). Normally the number of fd duped is small. */ D_MUTEX_LOCK(&lock_fd_dup2ed); if (num_fd_dup2ed < MAX_FD_DUP2ED) { @@ -1645,7 +1724,6 @@ allocate_dup2ed_fd(const int fd_src, const int fd_dest) if (fd_dup2_list[i].fd_src == -1) { fd_dup2_list[i].fd_src = fd_src; fd_dup2_list[i].fd_dest = fd_dest; - fd_dup2_list[i].dest_closed = false; num_fd_dup2ed++; D_MUTEX_UNLOCK(&lock_fd_dup2ed); return i; @@ -1654,6 +1732,8 @@ allocate_dup2ed_fd(const int fd_src, const int fd_dest) } D_MUTEX_UNLOCK(&lock_fd_dup2ed); + /* decrease dup reference count in error */ + dec_dup_ref_count(fd_dest); DS_ERROR(EMFILE, "fd_dup2_list[] is out of space"); errno = EMFILE; return (-1); @@ -1695,7 +1775,7 @@ close_all_duped_fd(void) /* Only the main thread will call this function in the destruction phase */ for (i = 0; i < MAX_FD_DUP2ED; i++) { if (fd_dup2_list[i].fd_src >= 0) - close_dup_fd_src(libc_close, fd_dup2_list[i].fd_src); + close_dup_fd(libc_close, fd_dup2_list[i].fd_src, true); } num_fd_dup2ed = 0; } @@ -1765,7 +1845,7 @@ check_path_with_dirfd(int dirfd, char **full_path_out, const char *rel_path, int free(*full_path_out); *full_path_out = NULL; } - DS_ERROR(errno, "readlink() failed"); + D_DEBUG(DB_ANY, "readlink() failed: %d (%s)\n", errno, strerror(errno)); return (-1); } @@ -1780,8 +1860,8 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char mode_t mode_query = 0, mode_parent = 0; struct dfs_mt *dfs_mt; char item_name[DFS_MAX_NAME]; - char *parent_dir = NULL; - char *full_path = NULL; + char *parent_dir = NULL; + char *full_path = NULL; if (pathname == NULL) { errno = EFAULT; @@ -1839,14 +1919,15 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* file/dir should be handled by DFS */ if (oflags & O_CREAT) { - rc = dfs_open(dfs_mt->dfs, parent, item_name, mode | S_IFREG, oflags, 0, 0, NULL, - &dfs_obj); + rc = dfs_open(dfs_mt->dfs, parent, item_name, mode | S_IFREG, oflags & (~O_APPEND), + 0, 0, NULL, &dfs_obj); mode_query = S_IFREG; } else if (!parent && (strncmp(item_name, "/", 2) == 0)) { - rc = dfs_lookup(dfs_mt->dfs, "/", oflags, &dfs_obj, &mode_query, NULL); + rc = + dfs_lookup(dfs_mt->dfs, "/", oflags & (~O_APPEND), &dfs_obj, &mode_query, NULL); } else { - rc = dfs_lookup_rel(dfs_mt->dfs, parent, item_name, oflags, &dfs_obj, &mode_query, - NULL); + rc = dfs_lookup_rel(dfs_mt->dfs, parent, item_name, oflags & (~O_APPEND), &dfs_obj, + &mode_query, NULL); } if (rc) @@ -1908,6 +1989,15 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char FREE(parent_dir); + if (oflags & O_APPEND) { + struct stat fstat; + + rc = new_fxstat(1, idx_fd + FD_FILE_BASE, &fstat); + if (rc != 0) + return (-1); + file_list[idx_fd]->offset = fstat.st_size; + } + return (idx_fd + FD_FILE_BASE); org_func: @@ -2011,10 +2101,10 @@ new_close_common(int (*next_close)(int fd), int fd) } else if (fd_directed >= FD_FILE_BASE) { /* This fd is a kernel fd. There was a duplicate fd created. */ if (fd < FD_FILE_BASE) - return close_dup_fd_src(next_close, fd); + return close_dup_fd(next_close, fd, true); /* This fd is a fake fd. There exists a associated kernel fd with dup2. */ - close_dup_fd_dest_fakefd(fd); + free_fd(fd - FD_FILE_BASE, false); return 0; } @@ -2498,6 +2588,17 @@ new_fxstatat(int ver, int dirfd, const char *path, struct stat *stat_buf, int fl return new_xstat(1, path, stat_buf); } + if (dirfd >= FD_FILE_BASE && dirfd < FD_DIR_BASE) { + if (path[0] == 0 && flags & AT_EMPTY_PATH) + /* same as fstat for a file. May need further work to handle flags */ + return new_fxstat(ver, dirfd, stat_buf); + else if (path[0] == 0) + error = ENOENT; + else + error = ENOTDIR; + goto out_err; + } + idx_dfs = check_path_with_dirfd(dirfd, &full_path, path, &error); if (error) goto out_err; @@ -2540,6 +2641,17 @@ new_fstatat(int dirfd, const char *__restrict path, struct stat *__restrict stat return new_xstat(1, path, stat_buf); } + if (dirfd >= FD_FILE_BASE && dirfd < FD_DIR_BASE) { + if (path[0] == 0 && flags & AT_EMPTY_PATH) + /* same as fstat for a file. May need further work to handle flags */ + return fstat(dirfd, stat_buf); + else if (path[0] == 0) + error = ENOENT; + else + error = ENOTDIR; + goto out_err; + } + idx_dfs = check_path_with_dirfd(dirfd, &full_path, path, &error); if (error) goto out_err; @@ -3955,6 +4067,8 @@ getcwd(char *buf, size_t size) if (buf == NULL) { size_t len; + if (size == 0) + size = PATH_MAX; len = strnlen(cur_dir, size); if (len >= size) { errno = ERANGE; @@ -4075,14 +4189,12 @@ faccessat(int dirfd, const char *path, int mode, int flags) int chdir(const char *path) { - int is_target_path, rc, len_str, errno_save; + int is_target_path, rc, len_str; dfs_obj_t *parent; - struct stat stat_buf; struct dfs_mt *dfs_mt; char item_name[DFS_MAX_NAME]; - char *parent_dir = NULL; - char *full_path = NULL; - bool is_root; + char *parent_dir = NULL; + char *full_path = NULL; if (next_chdir == NULL) { next_chdir = dlsym(RTLD_NEXT, "chdir"); @@ -4095,36 +4207,22 @@ chdir(const char *path) &full_path, &dfs_mt); if (rc) D_GOTO(out_err, rc); - if (!is_target_path) { - FREE(parent_dir); - rc = next_chdir(path); - errno_save = errno; - if (rc == 0) - update_cwd(); - errno = errno_save; - return rc; - } - if (!parent && (strncmp(item_name, "/", 2) == 0)) { - is_root = true; - rc = dfs_stat(dfs_mt->dfs, NULL, NULL, &stat_buf); - } else { - is_root = false; - rc = dfs_stat(dfs_mt->dfs, parent, item_name, &stat_buf); - } + rc = next_chdir(path); if (rc) - D_GOTO(out_err, rc); - if (!S_ISDIR(stat_buf.st_mode)) { - D_DEBUG(DB_ANY, "%s is not a directory: %d (%s)\n", path, ENOTDIR, - strerror(ENOTDIR)); - D_GOTO(out_err, rc = ENOTDIR); + D_GOTO(out_err, rc = errno); + + if (!is_target_path) { + strncpy(cur_dir, full_path, DFS_MAX_PATH); + if (cur_dir[DFS_MAX_PATH - 1] != 0) { + D_DEBUG(DB_ANY, "path is too long: %d (%s)\n", ENAMETOOLONG, + strerror(ENAMETOOLONG)); + D_GOTO(out_err, rc = ENAMETOOLONG); + } + D_GOTO(out, rc); } - if (is_root) - rc = dfs_access(dfs_mt->dfs, NULL, NULL, X_OK); - else - rc = dfs_access(dfs_mt->dfs, parent, item_name, X_OK); - if (rc) - D_GOTO(out_err, rc); + + /* assuming the path exists and it is backed by dfuse */ len_str = snprintf(cur_dir, DFS_MAX_PATH, "%s%s", dfs_mt->fs_root, full_path); if (len_str >= DFS_MAX_PATH) { D_DEBUG(DB_ANY, "path is too long: %d (%s)\n", ENAMETOOLONG, @@ -4132,6 +4230,7 @@ chdir(const char *path) D_GOTO(out_err, rc = ENAMETOOLONG); } +out: FREE(parent_dir); return 0; @@ -4144,6 +4243,7 @@ chdir(const char *path) int fchdir(int dirfd) { + int rc; char *pt_end = NULL; if (next_fchdir == NULL) { @@ -4156,6 +4256,15 @@ fchdir(int dirfd) if (dirfd < FD_DIR_BASE) return next_fchdir(dirfd); + /* assume dfuse is running. call chdir() to update cwd. */ + if (next_chdir == NULL) { + next_chdir = dlsym(RTLD_NEXT, "chdir"); + D_ASSERT(next_chdir != NULL); + } + rc = next_chdir(dir_list[dirfd - FD_DIR_BASE]->path); + if (rc) + return rc; + pt_end = stpncpy(cur_dir, dir_list[dirfd - FD_DIR_BASE]->path, DFS_MAX_PATH - 1); if ((long int)(pt_end - cur_dir) >= DFS_MAX_PATH - 1) { D_DEBUG(DB_ANY, "path is too long: %d (%s)\n", ENAMETOOLONG, @@ -4979,7 +5088,7 @@ dup(int oldfd) int dup2(int oldfd, int newfd) { - int fd, fd_directed, idx, rc, errno_save; + int fd, oldfd_directed, newfd_directed, fd_directed, idx, rc, errno_save; /* Need more work later. */ if (next_dup2 == NULL) { @@ -4995,16 +5104,27 @@ dup2(int oldfd, int newfd) else return newfd; } - if ((oldfd < FD_FILE_BASE) && (newfd < FD_FILE_BASE)) + oldfd_directed = query_fd_forward_dest(oldfd); + newfd_directed = query_fd_forward_dest(newfd); + if ((oldfd_directed < FD_FILE_BASE) && (oldfd < FD_FILE_BASE) && + (newfd_directed < FD_FILE_BASE) && (newfd < FD_FILE_BASE)) return next_dup2(oldfd, newfd); + if (oldfd_directed >= FD_FILE_BASE && oldfd < FD_FILE_BASE) + oldfd = oldfd_directed; + if (newfd >= FD_FILE_BASE) { DS_ERROR(ENOTSUP, "unimplemented yet for newfd >= FD_FILE_BASE"); errno = ENOTSUP; return -1; } fd_directed = query_fd_forward_dest(newfd); - if (fd_directed >= FD_FILE_BASE) { + if (fd_directed >= FD_FILE_BASE && newfd < FD_FILE_BASE && oldfd_directed < FD_FILE_BASE && + oldfd < FD_FILE_BASE) { + /* need to remove newfd from forward list and decrease refcount in file_list[] */ + close_dup_fd(libc_close, newfd, false); + return next_dup2(oldfd, newfd); + } else if (fd_directed >= FD_FILE_BASE) { DS_ERROR(ENOTSUP, "unimplemented yet for fd_directed >= FD_FILE_BASE"); errno = ENOTSUP; return -1; @@ -5015,13 +5135,22 @@ dup2(int oldfd, int newfd) else fd_directed = query_fd_forward_dest(oldfd); if (fd_directed >= FD_FILE_BASE) { - rc = close(newfd); - if (rc != 0 && errno != EBADF) - return -1; - fd = allocate_a_fd_from_kernel(); + int fd_tmp; + + fd_tmp = allocate_a_fd_from_kernel(); + if (fd_tmp < 0) { + /* failed to allocate an fd from kernel */ + errno_save = errno; + DS_ERROR(errno_save, "failed to get a fd from kernel"); + errno = errno_save; + return (-1); + } + /* rely on dup2() to get the desired fd */ + fd = next_dup2(fd_tmp, newfd); if (fd < 0) { /* failed to allocate an fd from kernel */ errno_save = errno; + close(fd_tmp); DS_ERROR(errno_save, "failed to get a fd from kernel"); errno = errno_save; return (-1); @@ -5031,6 +5160,9 @@ dup2(int oldfd, int newfd) errno = EBUSY; return (-1); } + rc = libc_close(fd_tmp); + if (rc != 0) + return -1; idx = allocate_dup2ed_fd(fd, fd_directed); if (idx >= 0) return fd; @@ -5542,6 +5674,10 @@ init_myhook(void) } update_cwd(); + rc = D_MUTEX_INIT(&lock_reserve_fd, NULL); + if (rc) + return; + rc = D_MUTEX_INIT(&lock_dfs, NULL); if (rc) return; @@ -5673,7 +5809,7 @@ close_all_fd(void) for (i = 0; i <= last_fd; i++) { if (file_list[i]) - free_fd(i); + free_fd(i, false); } } @@ -5723,6 +5859,7 @@ finalize_myhook(void) finalize_dfs(); D_MUTEX_DESTROY(&lock_eqh); + D_MUTEX_DESTROY(&lock_reserve_fd); D_MUTEX_DESTROY(&lock_dfs); D_MUTEX_DESTROY(&lock_dirfd); D_MUTEX_DESTROY(&lock_fd); @@ -5837,9 +5974,10 @@ finalize_dfs(void) D_FREE(dfs_list[i].fs_root); } - if (daos_inited) { + if (atomic_load_relaxed(&daos_inited)) { uint32_t init_cnt, j; + free_reserved_low_fd(); init_cnt = atomic_load_relaxed(&daos_init_cnt); for (j = 0; j < init_cnt; j++) { rc = daos_fini(); diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py index 2af81e67427..39d7e4359f5 100644 --- a/src/tests/ftest/daos_test/dfuse.py +++ b/src/tests/ftest/daos_test/dfuse.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2021-2023 Intel Corporation. + (C) Copyright 2021-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -93,6 +93,8 @@ def run_test(self, il_lib=None): daos_test_env['D_LOG_MASK'] = 'INFO,IL=DEBUG' command = [self.daos_test, '--test-dir', mount_dir, '--io', '--stream'] + if use_dfuse: + command.append('--lowfd') if cache_mode != 'writeback': command.append('--metadata') diff --git a/src/tests/suite/dfuse_test.c b/src/tests/suite/dfuse_test.c index bcff68d5ecf..28cc137f45e 100644 --- a/src/tests/suite/dfuse_test.c +++ b/src/tests/suite/dfuse_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2021-2023 Intel Corporation. + * (C) Copyright 2021-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -34,7 +34,7 @@ /* Tests can be run by specifying the appropriate argument for a test or all will be run if no test * is specified. */ -static const char *all_tests = "ismd"; +static const char *all_tests = "ismdl"; static void print_usage() @@ -47,6 +47,7 @@ print_usage() print_message("dfuse_test -s|--stream\n"); print_message("dfuse_test -m|--metadata\n"); print_message("dfuse_test -d|--directory\n"); + print_message("dfuse_test -l|--lowfd\n"); print_message("Default runs all tests\n=============\n"); print_message("\n=============================\n"); } @@ -146,6 +147,21 @@ do_openat(void **state) assert_return_code(rc, errno); assert_int_equal(stbuf.st_size, stbuf0.st_size); + /* cornercase: fd for a regular file is passed into fstatat(). Path is empty. */ + rc = fstatat(fd, "", &stbuf0, AT_EMPTY_PATH); + assert_return_code(rc, errno); + assert_int_equal(stbuf.st_size, stbuf0.st_size); + + /* expected to fail */ + rc = fstatat(fd, "", &stbuf0, 0); + assert_int_equal(rc, -1); + assert_int_equal(errno, ENOENT); + + /* expected to fail */ + rc = fstatat(fd, "entry", &stbuf0, 0); + assert_int_equal(rc, -1); + assert_int_equal(errno, ENOTDIR); + rc = close(fd); assert_return_code(rc, errno); @@ -494,6 +510,67 @@ do_directory(void **state) assert_return_code(rc, errno); } +#define MIN_DAOS_FD 10 +/* + * Check whether daos network context uses low fds 0~9. + */ +void +do_lowfd(void **state) +{ + int fd; + int rc; + int i; + bool pil4dfs_loaded = false; + char *env_ldpreload; + char fd_path[64]; + char *path; + + env_ldpreload = getenv("LD_PRELOAD"); + if (env_ldpreload == NULL) + return; + + if (strstr(env_ldpreload, "libpil4dfs.so")) + pil4dfs_loaded = true; + else + /* libioil cannot pass this test since low fds are only temporarily blocked */ + return; + + /* first time access a dir on DFS mount to trigger daos_init() */ + fd = open(test_dir, O_PATH | O_DIRECTORY); + assert_return_code(fd, errno); + + rc = close(fd); + assert_return_code(rc, errno); + + /* open the root dir and print fd */ + fd = open("/", O_PATH | O_DIRECTORY); + assert_return_code(fd, errno); + printf("fd = %d\n", fd); + rc = close(fd); + assert_return_code(rc, errno); + if (pil4dfs_loaded) + assert_true(fd >= MIN_DAOS_FD); + + /* now check whether daos uses low fds */ + path = malloc(PATH_MAX); + assert_non_null(path); + for (i = 0; i < MIN_DAOS_FD; i++) { + snprintf(fd_path, sizeof(fd_path) - 1, "/proc/self/fd/%d", i); + rc = readlink(fd_path, path, PATH_MAX - 1); + /* libioil only temporarily block low fds during daos_init(). + * libpil4dfs blocks low fds before daos_init() and does not free + * them until applications end. + */ + if (!pil4dfs_loaded && rc == -1 && errno == ENOENT) + continue; + assert_true(rc > 0); + path[rc] = 0; + assert_true(strstr(path, "socket:") == NULL); + assert_true(strstr(path, "anon_inode:") == NULL); + } + free(path); +} + static int run_specified_tests(const char *tests, int *sub_tests, int sub_tests_size) { @@ -543,6 +620,15 @@ run_specified_tests(const char *tests, int *sub_tests, int sub_tests_size) }; nr_failed += cmocka_run_group_tests(readdir_tests, NULL, NULL); break; + case 'l': + printf("\n\n================="); + printf("dfuse low fd tests"); + printf("=====================\n"); + const struct CMUnitTest lowfd_tests[] = { + cmocka_unit_test(do_lowfd), + }; + nr_failed += cmocka_run_group_tests(lowfd_tests, NULL, NULL); + break; default: assert_true(0); @@ -568,9 +654,10 @@ main(int argc, char **argv) {"stream", no_argument, NULL, 's'}, {"metadata", no_argument, NULL, 'm'}, {"directory", no_argument, NULL, 'd'}, + {"lowfd", no_argument, NULL, 'l'}, {NULL, 0, NULL, 0}}; - while ((opt = getopt_long(argc, argv, "aM:imsd", long_options, &index)) != -1) { + while ((opt = getopt_long(argc, argv, "aM:imsdl", long_options, &index)) != -1) { if (strchr(all_tests, opt) != NULL) { tests[ntests] = opt; ntests++;