From 20eeba60dbc2f0929dca9e4572268bc3ce0b93e2 Mon Sep 17 00:00:00 2001 From: Wei Ouyang Date: Mon, 28 Oct 2024 12:45:36 -0700 Subject: [PATCH] Improve file listing --- docs/artifact-manager.md | 22 +++++++++++++++++++++ hypha/VERSION | 2 +- hypha/artifact.py | 41 ++++++++++++++++++++++++++++++---------- tests/test_artifact.py | 2 +- 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/docs/artifact-manager.md b/docs/artifact-manager.md index ead53cb1..ffbe3de4 100644 --- a/docs/artifact-manager.md +++ b/docs/artifact-manager.md @@ -426,6 +426,25 @@ get_url = await artifact_manager.get_file(prefix="collections/dataset-gallery/ex --- +### `list_files(prefix: str, dir_path: str=None) -> list` + +Lists all files in the artifact. + +**Parameters:** + +- `prefix`: The path of the artifact, it can be a prefix relative to the current workspace (e.g., `"collections/dataset-gallery/example-dataset"`) or an absolute prefix with the workspace id (e.g., `"/my_workspace_id/collections/dataset-gallery/example-dataset"`). +- `dir_path`: Optional. The directory path within the artifact to list files. Default is `None`. + +**Returns:** A list of files in the artifact. + +**Example:** + +```python +files = await artifact_manager.list_files(prefix="collections/dataset-gallery/example-dataset") +``` + +--- + ### `read(prefix: str, stage: bool = False, silent: bool = False) -> dict` Reads and returns the manifest of an artifact or collection. If in staging mode, reads from `_manifest.yaml`. @@ -547,6 +566,7 @@ The `Artifact Manager` provides an HTTP endpoint for retrieving artifact manifes - `/{workspace}/artifacts/{prefix:path}` for fetching the artifact manifest. - `/{workspace}/artifacts/{prefix:path}/__children__` for listing all artifacts in a collection. + - `/{workspace}/artifacts/{prefix:path}/__files__` for listing all files in the artifact. - `/{workspace}/artifacts/{prefix:path}/__files__/{file_path:path}` for downloading a file from the artifact (will be redirected to a pre-signed URL). ### Path Parameters: @@ -572,6 +592,8 @@ For `/{workspace}/artifacts/{prefix:path}`, the response will be a JSON object r For `/{workspace}/artifacts/{prefix:path}/__children__`, the response will be a list of artifacts in the collection. +For `/{workspace}/artifacts/{prefix:path}/__files__`, the response will be a list of files in the artifact, each file is a dictionary with the `name` and `type` fields. + For `/{workspace}/artifacts/{prefix:path}/__files__/{file_path:path}`, the response will be a pre-signed URL to download the file. ### Example: Fetching a public artifact with download statistics diff --git a/hypha/VERSION b/hypha/VERSION index fe03e29d..0206ac54 100644 --- a/hypha/VERSION +++ b/hypha/VERSION @@ -1,3 +1,3 @@ { - "version": "0.20.38.post13" + "version": "0.20.38.post14" } diff --git a/hypha/artifact.py b/hypha/artifact.py index 13dd549e..92345799 100644 --- a/hypha/artifact.py +++ b/hypha/artifact.py @@ -118,14 +118,21 @@ async def get_artifact( ) if "/__files__/" in prefix: - prefix, file_path = prefix.split("/__files__/") - url = await self.get_file( - prefix, - file_path, - context={"ws": workspace, "user": user_info.model_dump()}, - ) - # Redirect to the pre-signed URL - return RedirectResponse(url=url) + prefix, path = prefix.split("/__files__/") + try: + url = await self.get_file( + prefix, + path, + context={"ws": workspace, "user": user_info.model_dump()}, + ) + # Redirect to the pre-signed URL + return RedirectResponse(url=url) + except FileNotFoundError as e: + return await self.list_files( + prefix, + path, + context={"ws": workspace, "user": user_info.model_dump()}, + ) if prefix.endswith("/__children__"): assert not stage, "Cannot list children of a staged artifact." @@ -746,7 +753,11 @@ async def _delete_s3_files(self, ws, prefix): await remove_objects_async(s3_client, self.workspace_bucket, artifact_path) async def list_files( - self, prefix, max_length=1000, stage=False, context: dict = None + self, + prefix: str, + dir_path: str = None, + max_length: int = 1000, + context: dict = None, ): """List files in the specified S3 prefix.""" if context is None or "ws" not in context: @@ -759,7 +770,10 @@ async def list_files( user_info = UserInfo.model_validate(context["user"]) await self._get_artifact_with_permission(ws, user_info, prefix, "list_files") async with self.s3_controller.create_client_async() as s3_client: - full_path = safe_join(ws, prefix) + "/" + if dir_path: + full_path = safe_join(ws, prefix, dir_path) + "/" + else: + full_path = safe_join(ws, prefix) + "/" items = await list_objects_async( s3_client, self.workspace_bucket, full_path, max_length=max_length ) @@ -960,6 +974,13 @@ async def get_file(self, prefix, path, options: dict = None, context: dict = Non ) async with self.s3_controller.create_client_async() as s3_client: file_key = safe_join(ws, f"{prefix}/{path}") + # check if the file exists + try: + await s3_client.head_object(Bucket=self.workspace_bucket, Key=file_key) + except ClientError: + raise FileNotFoundError( + f"File '{path}' does not exist in the artifact." + ) presigned_url = await s3_client.generate_presigned_url( "get_object", Params={"Bucket": self.workspace_bucket, "Key": file_key}, diff --git a/tests/test_artifact.py b/tests/test_artifact.py index af3f8192..23a2c7d0 100644 --- a/tests/test_artifact.py +++ b/tests/test_artifact.py @@ -572,7 +572,7 @@ async def test_artifact_manager_with_collection(minio_server, fastapi_server): assert manifest_data["id"] == "test-dataset" files = await artifact_manager.list_files( - prefix="collections/test-collection/test-dataset", stage=True + prefix="collections/test-collection/test-dataset" ) assert find_item(files, "name", "test.txt")