From 9c5dd3157bbaa3c791d53b6e9444a3a1895bb2a4 Mon Sep 17 00:00:00 2001 From: Wei Ouyang Date: Mon, 28 Oct 2024 17:56:00 -0700 Subject: [PATCH] add orphan flag and support order_by --- docs/artifact-manager.md | 36 +++++++++---- hypha/VERSION | 2 +- hypha/artifact.py | 114 +++++++++++++++++++++++++++++---------- tests/test_artifact.py | 40 +++++++++++++- 4 files changed, 151 insertions(+), 41 deletions(-) diff --git a/docs/artifact-manager.md b/docs/artifact-manager.md index ffbe3de4..bc1eb568 100644 --- a/docs/artifact-manager.md +++ b/docs/artifact-manager.md @@ -39,7 +39,8 @@ gallery_manifest = { } # Create the collection with read permission for everyone and create permission for all authenticated users -await artifact_manager.create(prefix="collections/dataset-gallery", manifest=gallery_manifest, permissions={"*": "r", "@": "r+"}) +# We set orphan=True to create a collection without a parent +await artifact_manager.create(prefix="collections/dataset-gallery", manifest=gallery_manifest, permissions={"*": "r", "@": "r+"}, orphan=True) print("Dataset Gallery created.") ``` @@ -126,7 +127,8 @@ async def main(): "collection": [], } # Create the collection with read permission for everyone and create permission for all authenticated users - await artifact_manager.create(prefix="collections/dataset-gallery", manifest=gallery_manifest, permissions={"*": "r+", "@": "r+"}) + # We set orphan=True to create a collection without a parent + await artifact_manager.create(prefix="collections/dataset-gallery", manifest=gallery_manifest, permissions={"*": "r+", "@": "r+"}, orphan=True) print("Dataset Gallery created.") # Create a new dataset inside the Dataset Gallery @@ -197,7 +199,7 @@ gallery_manifest = { "collection": [], } # Create the collection with read permission for everyone and create permission for all authenticated users -await artifact_manager.create(prefix="collections/schema-dataset-gallery", manifest=gallery_manifest, permissions={"*": "r+", "@": "r+"}) +await artifact_manager.create(prefix="collections/schema-dataset-gallery", manifest=gallery_manifest, permissions={"*": "r+", "@": "r+"}, orphan=True) print("Schema-based Dataset Gallery created.") ``` @@ -226,7 +228,7 @@ print("Valid dataset committed.") ## API References -### `create(prefix: str, manifest: dict, permissions: dict=None, stage: bool = False) -> None` +### `create(prefix: str, manifest: dict, permissions: dict=None, stage: bool = False, orphan: bool = False) -> None` Creates a new artifact or collection with the specified manifest. The artifact is staged until committed. For collections, the `collection` field should be an empty list. @@ -236,11 +238,17 @@ Creates a new artifact or collection with the specified manifest. The artifact i - `manifest`: The manifest of the new artifact. Ensure the manifest follows the required schema if applicable (e.g., for collections). - `permissions`: Optional. A dictionary containing user permissions. For example `{"*": "r+"}` gives read and create access to everyone, `{"@": "rw+"}` allows all authenticated users to read/write/create, and `{"user_id_1": "r+"}` grants read and create permissions to a specific user. You can also set permissions for specific operations, such as `{"user_id_1": ["read", "create"]}`. See detailed explanation about permissions below. - `stage`: Optional. A boolean flag to stage the artifact. Default is `False`. +- `orphan`: Optional. A boolean flag to create the artifact without a parent collection. Default is `False`. If `True`, the artifact will not be associated with any collection. This is mainly used for creating top-level collections, and making sure the artifact is not associated with any parent collection (with inheritance of permissions). + +**Note 1: If you set `stage=True`, you must call `commit()` to finalize the artifact.** + +**Note 2: If you set `orphan=True`, the artifact will not be associated with any collection. An non-orphan artifact must have a parent collection.** **Example:** ```python -await artifact_manager.create(prefix="collections/dataset-gallery/example-dataset", manifest=dataset_manifest, stage=True) +# Assuming we have already created a dataset-gallery collection, otherwise create it first or set orphan=True +await artifact_manager.create(prefix="collections/dataset-gallery/example-dataset", manifest=dataset_manifest, stage=True, orphan=False) ``` ### Permissions @@ -351,18 +359,22 @@ await artifact_manager.commit(prefix="collections/dataset-gallery/example-datase --- -### `delete(prefix: str) -> None` +### `delete(prefix: str, delete_files: bool = False, recursive: bool = False) -> None` Deletes an artifact, its manifest, and all associated files from both the database and S3 storage. **Parameters:** - `prefix`: The path of the artifact, it can be a prefix relative to the current workspace (e.g., `"collections/dataset-gallery/example-dataset"`) or an absolute prefix with the workspace id (e.g., `"/my_workspace_id/collections/dataset-gallery/example-dataset"`). +- `delete_files`: Optional. A boolean flag to delete all files associated with the artifact. Default is `False`. +- `recursive`: Optional. A boolean flag to delete all child artifacts recursively. Default is `False`. + +**Warning: If `delete_files` is set to `True`, `recursive` must be set to `True`, all child artifacts will be deleted, and all files associated with the child artifacts will be permanently deleted from the S3 storage. This operation is irreversible.** **Example:** ```python -await artifact_manager.delete(prefix="collections/dataset-gallery/example-dataset") +await artifact_manager.delete(prefix="collections/dataset-gallery/example-dataset", delete_files=True) ``` --- @@ -465,9 +477,9 @@ manifest = await artifact_manager.read(prefix="collections/dataset-gallery/examp --- -### `list(prefix: str, keywords: list = None, filters: dict = None, mode: str = "AND", page: int = 0, page_size: int = 100) -> list` +### `list(prefix: str, keywords: list = None, filters: dict = None, mode: str = "AND", page: int = 0, page_size: int = 100, order_by: str = None, stage: bool = False, silent: bool = False) -> list` -List or search for artifacts within a collection based on keywords or filters, supporting both `AND` and `OR` modes. +List or search for child artifacts within a collection based on keywords or filters, supporting both `AND` and `OR` modes. **Parameters:** @@ -477,6 +489,9 @@ List or search for artifacts within a collection based on keywords or filters, s - `mode`: Either `"AND"` or `"OR"` to combine conditions. Default is `"AND"`. - `page`: Optional. The page number for paginated results. Default is `0`. - `page_size`: Optional. The number of items per page. Default is `100`. +- `order_by`: Optional. The field to order results by. Default is `None` (ascending order by prefix). The available fields are `view_count`, `download_count`, `last_modified`, `created_at`, and `prefix`. You can also append `<` or `>` to the field name for ascending or descending order. For example, `view_count<` will order by view count in ascending order. +- `stage`: Optional. If `True`, lists staged artifacts. Default is `False`. +- `silent`: Optional. If `True`, suppresses the view count increment. Default is `False`. **Returns:** A list of matching artifacts with summary fields. @@ -522,7 +537,8 @@ gallery_manifest = { "collection": [], } # Create the collection with read permission for everyone and create permission for all authenticated users -await artifact_manager.create(prefix="collections/dataset-gallery", manifest=gallery_manifest, permissions={"*": "r", "@": "r+"}) +# We set orphan=True to create a collection without a parent +await artifact_manager.create(prefix="collections/dataset-gallery", manifest=gallery_manifest, permissions={"*": "r", "@": "r+"}, orphan=True) # Step 3: Add a dataset to the gallery dataset_manifest = { diff --git a/hypha/VERSION b/hypha/VERSION index 0206ac54..d093226d 100644 --- a/hypha/VERSION +++ b/hypha/VERSION @@ -1,3 +1,3 @@ { - "version": "0.20.38.post14" + "version": "0.20.38.post15" } diff --git a/hypha/artifact.py b/hypha/artifact.py index 92345799..bbe7e37a 100644 --- a/hypha/artifact.py +++ b/hypha/artifact.py @@ -3,6 +3,7 @@ import sys import json import copy +import traceback from sqlalchemy import ( event, Column, @@ -163,7 +164,7 @@ async def get_artifact( except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) + raise HTTPException(status_code=500, detail=str(traceback.format_exc())) self.store.set_artifact_manager(self) self.store.register_public_service(self.get_artifact_service()) @@ -459,6 +460,7 @@ async def create( manifest: dict, overwrite=False, stage=False, + orphan=False, permissions: dict = None, context: dict = None, ): @@ -504,11 +506,25 @@ async def create( parent_artifact = await self._get_artifact( session, ws, parent_prefix ) + if parent_artifact and not parent_artifact.manifest: + raise ValueError( + f"Parent artifact under prefix '{parent_prefix}' must be committed before creating a child artifact." + ) if parent_artifact and permissions is None: permissions = parent_artifact.permissions else: parent_artifact = None + if not orphan and not parent_artifact: + raise ValueError( + f"Parent artifact not found (prefix: {parent_prefix}) for non-orphan artifact, please create the parent artifact first or set orphan=True." + ) + + if parent_artifact and orphan: + raise ValueError( + f"Parent artifact found (prefix: {parent_prefix}) for orphan artifact, please set orphan=False." + ) + existing_artifact = await self._get_artifact(session, ws, prefix) if existing_artifact: @@ -718,7 +734,9 @@ async def commit(self, prefix, context: dict): finally: await session.close() - async def delete(self, prefix, context: dict): + async def delete( + self, prefix, delete_files=False, recursive=False, context: dict = None + ): """Delete an artifact from the database and S3.""" if context is None or "ws" not in context: raise ValueError("Context must include 'ws' (workspace).") @@ -728,10 +746,30 @@ async def delete(self, prefix, context: dict): else: ws = context["ws"] + if delete_files and not recursive: + raise ValueError("Delete files requires recursive=True.") + user_info = UserInfo.model_validate(context["user"]) artifact = await self._get_artifact_with_permission( ws, user_info, prefix, "delete" ) + + if recursive: + # Remove all child artifacts + children = await self.list_children(prefix, context=context) + for child in children: + await self.delete( + child["_prefix"], delete_files=delete_files, context=context + ) + + if delete_files: + # Remove all files in the artifact's S3 prefix + artifact_path = safe_join(ws, f"{prefix}") + "/" + async with self.s3_controller.create_client_async() as s3_client: + await remove_objects_async( + s3_client, self.workspace_bucket, artifact_path + ) + session = await self._get_session() try: async with session.begin(): @@ -743,15 +781,6 @@ async def delete(self, prefix, context: dict): finally: await session.close() - # Remove files from S3 - await self._delete_s3_files(ws, prefix) - - async def _delete_s3_files(self, ws, prefix): - """Helper method to delete files associated with an artifact in S3.""" - artifact_path = safe_join(ws, f"{prefix}") + "/" - async with self.s3_controller.create_client_async() as s3_client: - await remove_objects_async(s3_client, self.workspace_bucket, artifact_path) - async def list_files( self, prefix: str, @@ -788,17 +817,13 @@ async def list_children( mode="AND", page: int = 0, page_size: int = 100, + order_by=None, + stage=False, silent=False, context: dict = None, ): """ List artifacts within a collection under a specific prefix. - Supports: - - `keywords`: list of fuzzy search terms across all manifest fields. - - `filters`: dictionary of exact or fuzzy match for specific fields. - - `mode`: either 'AND' or 'OR' to combine conditions. - - `page`: the page number (0-indexed) for pagination. - - `page_size`: the number of results per page. """ if context is None or "ws" not in context: raise ValueError("Context must include 'ws' (workspace).") @@ -824,6 +849,12 @@ async def list_children( ArtifactModel.workspace == ws, ArtifactModel.prefix.like(f"{prefix}/%"), ) + + if stage: + base_query = base_query.filter(ArtifactModel.stage_manifest != None) + else: + base_query = base_query.filter(ArtifactModel.manifest != None) + # Handle keyword-based search (fuzzy search across all manifest fields) conditions = [] if keywords: @@ -867,12 +898,37 @@ async def list_children( query = base_query offset = page * page_size - query = ( - query.order_by(ArtifactModel.last_modified.desc()) - .limit(page_size) - .offset(offset) - ) + if order_by is None: + query = query.order_by(ArtifactModel.prefix.asc()) + elif order_by.startswith("view_count"): + if order_by.endswith("<"): + query = query.order_by(ArtifactModel.view_count.asc()) + else: + query = query.order_by(ArtifactModel.view_count.desc()) + elif order_by.startswith("download_count"): + if order_by.endswith("<"): + query = query.order_by(ArtifactModel.download_count.asc()) + else: + query = query.order_by(ArtifactModel.download_count.desc()) + elif order_by.startswith("last_modified"): + if order_by.endswith("<"): + query = query.order_by(ArtifactModel.last_modified.asc()) + else: + query = query.order_by(ArtifactModel.last_modified.desc()) + elif order_by.startswith("created_at"): + if order_by.endswith("<"): + query = query.order_by(ArtifactModel.created_at.asc()) + else: + query = query.order_by(ArtifactModel.created_at.desc()) + elif order_by.startswith("prefix"): + if order_by.endswith("<"): + query = query.order_by(ArtifactModel.prefix.asc()) + else: + query = query.order_by(ArtifactModel.prefix.desc()) + else: + raise ValueError(f"Invalid order_by field: {order_by}") + query = query.limit(page_size).offset(offset) # Execute the query result = await session.execute(query) artifacts = result.scalars().all() @@ -886,20 +942,22 @@ async def list_children( summary_fields = DEFAULT_SUMMARY_FIELDS results = [] for artifact in artifacts: - if not artifact.manifest: + manifest = artifact.stage_manifest if stage else artifact.manifest + if not manifest: continue summary = {"_prefix": f"/{ws}/{artifact.prefix}"} for field in summary_fields: - summary[field] = artifact.manifest.get(field) + summary[field] = manifest.get(field) if "_metadata" in summary_fields: summary["_metadata"] = self._generate_metadata(artifact) results.append(summary) - # Increment the view count for the parent artifact - await self._read_manifest( - parent_artifact, stage=False, increment_view_count=not silent - ) + if not stage: + # Increment the view count for the parent artifact + await self._read_manifest( + parent_artifact, stage=False, increment_view_count=not silent + ) return results except Exception as e: diff --git a/tests/test_artifact.py b/tests/test_artifact.py index 23a2c7d0..60111c93 100644 --- a/tests/test_artifact.py +++ b/tests/test_artifact.py @@ -48,6 +48,7 @@ async def test_serve_artifact_endpoint(minio_server, fastapi_server): manifest=collection_manifest, # allow public read access and create access for authenticated users permissions={"*": "r", "@": "r+"}, + orphan=True, ) # Create an artifact inside the public collection @@ -88,6 +89,7 @@ async def test_serve_artifact_endpoint(minio_server, fastapi_server): prefix="collections/private-dataset-gallery", manifest=private_collection_manifest, permissions={}, + orphan=True, ) # Create an artifact inside the private collection @@ -148,6 +150,7 @@ async def test_artifact_permissions( manifest=collection_manifest, # allow public read access and create access for authenticated users permissions={"*": "r", "@": "r+"}, + orphan=True, ) await artifact_manager.reset_stats( prefix=f"/{api.config.workspace}/public/collections/dataset-gallery" @@ -228,6 +231,7 @@ async def test_http_artifact_endpoint(minio_server, fastapi_server): manifest=collection_manifest, stage=False, permissions={"*": "r", "@": "r+"}, + orphan=True, ) dataset_manifest = { @@ -307,6 +311,7 @@ async def test_edit_existing_artifact(minio_server, fastapi_server): prefix="collections/edit-test-collection", manifest=collection_manifest, stage=False, + orphan=True, ) # Create an artifact (a dataset in this case) within the collection @@ -329,7 +334,9 @@ async def test_edit_existing_artifact(minio_server, fastapi_server): ) # Ensure that the dataset appears in the collection's index - items = await artifact_manager.list(prefix="collections/edit-test-collection") + items = await artifact_manager.list( + prefix="collections/edit-test-collection", order_by="last_modified" + ) assert find_item( items, "_prefix", @@ -430,6 +437,7 @@ async def test_artifact_schema_validation(minio_server, fastapi_server): prefix="collections/schema-test-collection", manifest=collection_manifest, stage=False, + orphan=True, ) # Create a valid dataset artifact that conforms to the schema @@ -525,6 +533,7 @@ async def test_artifact_manager_with_collection(minio_server, fastapi_server): manifest=collection_manifest, stage=False, permissions={"*": "r", "@": "r+"}, + orphan=True, ) # get the collection via http @@ -576,6 +585,15 @@ async def test_artifact_manager_with_collection(minio_server, fastapi_server): ) assert find_item(files, "name", "test.txt") + items = await artifact_manager.list( + prefix="collections/test-collection", stage=True + ) + assert find_item( + items, + "_prefix", + f"/{api.config.workspace}/collections/test-collection/test-dataset", + ) + # Commit the artifact (finalize it) await artifact_manager.commit(prefix="collections/test-collection/test-dataset") @@ -672,10 +690,23 @@ async def test_artifact_edge_cases_with_collection(minio_server, fastapi_server) "type": "collection", "collection": [], } + + with pytest.raises( + Exception, + match=r".*Parent artifact not found.*", + ): + await artifact_manager.create( + prefix="collections/edge-case-collection", + manifest=collection_manifest, + stage=False, + orphan=False, + ) + await artifact_manager.create( prefix="collections/edge-case-collection", manifest=collection_manifest, stage=False, + orphan=True, ) # Try to create an artifact that already exists within the collection without overwriting @@ -782,6 +813,7 @@ async def test_artifact_search_in_manifest(minio_server, fastapi_server): prefix="collections/search-test-collection", manifest=collection_manifest, stage=False, + orphan=True, ) # Create multiple artifacts inside the collection @@ -856,6 +888,7 @@ async def test_artifact_search_with_filters(minio_server, fastapi_server): prefix="collections/search-test-collection", manifest=collection_manifest, stage=False, + orphan=True, ) # Create multiple artifacts inside the collection @@ -942,6 +975,7 @@ async def test_download_count(minio_server, fastapi_server): prefix="collections/download-test-collection", manifest=collection_manifest, stage=False, + orphan=True, ) # Create an artifact inside the collection @@ -1068,6 +1102,8 @@ async def test_download_count(minio_server, fastapi_server): # Clean up by deleting the dataset and the collection await artifact_manager.delete( - prefix="collections/download-test-collection/download-test-dataset" + prefix="collections/download-test-collection/download-test-dataset", + delete_files=True, + recursive=True, ) await artifact_manager.delete(prefix="collections/download-test-collection")