From ca46dcafe7122c8950efffafef4fa2f2e34bd922 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:45:05 -0500 Subject: [PATCH 01/43] First version backup --- .../clp_package_utils/scripts/start_clp.py | 25 +++++- .../clp-py-utils/clp_py_utils/clp_config.py | 53 +++++++++++-- .../clp-py-utils/clp_py_utils/result.py | 10 +++ .../executor/compress/fs_compression_task.py | 76 ++++++++++++++++--- .../job_orchestration/executor/s3_utils.py | 35 +++++++++ components/job-orchestration/pyproject.toml | 1 + 6 files changed, 181 insertions(+), 19 deletions(-) create mode 100644 components/clp-py-utils/clp_py_utils/result.py create mode 100644 components/job-orchestration/job_orchestration/executor/s3_utils.py diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 8097929f1..77f6a006a 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -624,6 +624,8 @@ def start_compression_worker( num_cpus: int, mounts: CLPDockerMounts, ): + print(clp_config) + print(container_clp_config) celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" generic_start_worker( @@ -713,6 +715,7 @@ def generic_start_worker( "-w", str(CONTAINER_CLP_HOME), "--name", container_name, "--log-driver", "local", + "-u", f"{os.getuid()}:{os.getgid()}", "-e", f"PYTHONPATH={clp_site_packages_dir}", "-e", ( f"BROKER_URL=amqp://" @@ -729,13 +732,26 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - "-u", f"{os.getuid()}:{os.getgid()}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): container_start_cmd.append("-e") container_start_cmd.append(f"{env_name}={env_value}") + s3_config = clp_config.archive_output.s3_config + if s3_config is not None: + container_start_cmd += [ + "-e", f"ENABLE_S3_ARCHIVE=1", + "-e", f"REGION_NAME={s3_config.region_name}", + "-e", f"BUCKET={s3_config.bucket}", + "-e", f"KEY_PREFIX={s3_config.key_prefix}" + ] + if s3_config.secret_access_key is not None and s3_config.secret_access_key is not None: + container_start_cmd += [ + "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", + "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" + ] + # fmt: on necessary_mounts = [ mounts.clp_home, @@ -1126,6 +1142,13 @@ def main(argv): ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) + # Might be a good place to verification for s3 config. 
+ # if target in ( + # ALL_TARGET_NAME, + # COMPRESSION_WORKER_COMPONENT_NAME + # ): + # validate_s3_config(clp_config.archive_output, clp_home) + clp_config.validate_data_dir() clp_config.validate_logs_dir() except: diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 79a94505d..07983d9f2 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -1,6 +1,6 @@ import pathlib -import typing from enum import auto +from typing import Optional from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator @@ -69,12 +69,12 @@ class Database(BaseModel): host: str = "localhost" port: int = 3306 name: str = "clp-db" - ssl_cert: typing.Optional[str] = None + ssl_cert: Optional[str] = None auto_commit: bool = False compress: bool = True - username: typing.Optional[str] = None - password: typing.Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None @validator("type") def validate_database_type(cls, field): @@ -227,7 +227,7 @@ class Redis(BaseModel): query_backend_database: int = 0 compression_backend_database: int = 1 # redis can perform authentication without a username - password: typing.Optional[str] + password: Optional[str] @validator("host") def validate_host(cls, field): @@ -300,8 +300,44 @@ class Queue(BaseModel): host: str = "localhost" port: int = 5672 - username: typing.Optional[str] - password: typing.Optional[str] + username: Optional[str] + password: Optional[str] + + +class S3Config(BaseModel): + # TODO: need to think of a way to verify the account and + # keys. Maybe need to be outside of the config because every config + # can have different privilege + # Things to verify: + # 1. Can only be enabled if clp-s is used. + # 2. Does key_prefix need to end with '/'? maybe it doesn't but will cause some issue for list bucket. 
+ + # Required fields + region_name: str + bucket: str + key_prefix: str + + # Optional fields + access_key_id: Optional[str] = None + secret_access_key: Optional[str] = None + + @validator("region_name") + def validate_region_name(cls, field): + if field == "": + raise ValueError("region_name is not provided") + return field + + @validator("bucket") + def validate_bucket(cls, field): + if field == "": + raise ValueError("bucket is not provided") + return field + + @validator("key_prefix") + def validate_key_prefix(cls, field): + if field == "": + raise ValueError("key_prefix is not provided") + return field class ArchiveOutput(BaseModel): @@ -310,6 +346,7 @@ class ArchiveOutput(BaseModel): target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB target_segment_size: int = 256 * 1024 * 1024 # 256 MB + s3_config: Optional[S3Config] = None @validator("target_archive_size") def validate_target_archive_size(cls, field): @@ -408,7 +445,7 @@ def validate_port(cls, field): class CLPConfig(BaseModel): - execution_container: typing.Optional[str] + execution_container: Optional[str] input_logs_directory: pathlib.Path = pathlib.Path("/") diff --git a/components/clp-py-utils/clp_py_utils/result.py b/components/clp-py-utils/clp_py_utils/result.py new file mode 100644 index 000000000..9c72cebf9 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/result.py @@ -0,0 +1,10 @@ +from typing import Optional + + +class Result: + def __init__(self, success: bool, error: Optional[str] = None): + self.success = success + self.error = error + + def __repr__(self): + return f"Result(success={self.success}, error={self.error})" diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index ce88ad185..772a358d7 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -4,6 +4,7 @@ import pathlib import subprocess from contextlib import closing +from typing import Any, Dict, Optional import yaml from celery.app.task import Task @@ -12,11 +13,14 @@ COMPRESSION_JOBS_TABLE_NAME, COMPRESSION_TASKS_TABLE_NAME, Database, + S3Config, StorageEngine, ) from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.result import Result from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app +from job_orchestration.executor.s3_utils import s3_put from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult @@ -71,6 +75,40 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ) +def upload_single_file_archive_to_s3( + archive_stats: Dict[str, Any], + archive_dir: pathlib.Path, + s3_config: S3Config, +) -> Result: + logger.info("Starting to upload to S3...") + archive_id = archive_stats["id"] + src_file = archive_dir / archive_id + result = s3_put(s3_config, src_file, archive_id) + if not result.success: + logger.error(f"Failed to upload to S3: {result.error}") + return result + + logger.info("Finished uploading to S3...") + src_file.unlink() + return Result(success=True) + + +def get_s3_config() -> Optional[S3Config]: + enable_s3_write = 
os.getenv("ENABLE_S3_ARCHIVE") + if enable_s3_write is None: + return None + + # TODO: this method is very errorprone since it doesn't check individual members + # Let's leave this for now before we properly implement the config file. + s3_config = S3Config( + region_name=os.getenv("REGION_NAME"), + bucket=os.getenv("BUCKET"), + key_prefix=os.getenv("KEY_PREFIX"), + access_key_id=os.getenv("ACCESS_KEY_ID"), + secret_access_key=os.getenv("SECRET_ACCESS_KEY") + ) + return s3_config + def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -113,6 +151,7 @@ def make_clp_s_command( compression_cmd = [ str(clp_home / "bin" / "clp-s"), "c", str(archive_output_dir), + "--single-file-archive", "--print-archive-stats", "--target-encoded-size", str(clp_config.output.target_segment_size + clp_config.output.target_dictionaries_size), @@ -212,18 +251,31 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None - total_uncompressed_size = 0 - total_compressed_size = 0 + worker_output = { + "total_uncompressed_size": 0, + "total_compressed_size": 0, + } + + s3_config = get_s3_config() + while True: line = proc.stdout.readline() if not line: break stats = json.loads(line.decode("ascii")) if last_archive_stats is not None and stats["id"] != last_archive_stats["id"]: + if s3_config is not None: + result = upload_single_file_archive_to_s3( + last_archive_stats, archive_output_dir, s3_config + ) + if not result.success: + # TODO: think about how to handle S3 only error + continue + # We've started a new archive so add the previous archive's last # reported size to the total - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] + worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] + worker_output["total_compressed_size"] += last_archive_stats["size"] with closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: @@ -239,9 +291,17 @@ def run_clp( last_archive_stats = stats if last_archive_stats is not None: + if s3_config is not None: + result = upload_single_file_archive_to_s3( + last_archive_stats, archive_output_dir, s3_config + ) + if not result.success: + logger.error(f"THINK ABOUT WHAT TO DO HERE") + # TODO: think about how to handle S3 only error + # Add the last archive's last reported size - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] + worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] + worker_output["total_compressed_size"] += last_archive_stats["size"] with closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: @@ -270,10 +330,6 @@ def run_clp( # Close stderr log file stderr_log_file.close() - worker_output = { - "total_uncompressed_size": total_uncompressed_size, - "total_compressed_size": total_compressed_size, - } if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output else: diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py new file mode 100644 index 000000000..6909150d5 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/s3_utils.py @@ -0,0 +1,35 @@ +from pathlib import Path + +import boto3 +from botocore.exceptions import ClientError +from clp_py_utils.clp_config import 
S3Config +from clp_py_utils.result import Result + +def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: + + if not src_file.exists(): + return Result(success=False, error=f"{src_file} doesn't exist") + if not src_file.is_file(): + return Result(success=False, error=f"{src_file} is not a file") + + s3_client_args = { + "region_name": s3_config.region_name, + "aws_access_key_id": s3_config.access_key_id, + "aws_secret_access_key": s3_config.secret_access_key + } + + my_s3_client = boto3.client("s3", **s3_client_args) + + with open(src_file, "rb") as file_data: + try: + my_s3_client.put_object( + Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name + ) + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Result(success=False, error=f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Result(success=False, error=f"An unexpected error occurred: {e}") + + return Result(success=True) diff --git a/components/job-orchestration/pyproject.toml b/components/job-orchestration/pyproject.toml index c89d84dec..2db22cab0 100644 --- a/components/job-orchestration/pyproject.toml +++ b/components/job-orchestration/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.76" Brotli = "^1.1.0" celery = "^5.3.6" # mariadb version must be compatible with libmariadev installed in runtime env. From b763e8b63c017083adb7097a8465eb87fe8a8341 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:02:10 -0500 Subject: [PATCH 02/43] Small refactor --- .../clp_package_utils/scripts/start_clp.py | 9 ++- .../executor/compress/fs_compression_task.py | 62 ++++++++----------- .../job_orchestration/executor/s3_utils.py | 4 +- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 77f6a006a..0acd3a4c3 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -728,7 +728,6 @@ def generic_start_worker( ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", @@ -751,18 +750,24 @@ def generic_start_worker( "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" ] + else: + container_start_cmd += [ + "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", + ] # fmt: on necessary_mounts = [ mounts.clp_home, mounts.data_dir, mounts.logs_dir, - mounts.archives_output_dir, mounts.input_logs_dir, ] if worker_specific_mount: necessary_mounts.extend(worker_specific_mount) + if s3_config is None: + necessary_mounts.append(mounts.archives_output_dir) + for mount in necessary_mounts: if not mount: raise ValueError(f"Required mount configuration is empty: {necessary_mounts}") diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py 
b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 772a358d7..7eb8e258d 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -105,10 +105,11 @@ def get_s3_config() -> Optional[S3Config]: bucket=os.getenv("BUCKET"), key_prefix=os.getenv("KEY_PREFIX"), access_key_id=os.getenv("ACCESS_KEY_ID"), - secret_access_key=os.getenv("SECRET_ACCESS_KEY") + secret_access_key=os.getenv("SECRET_ACCESS_KEY"), ) return s3_config + def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -205,6 +206,10 @@ def run_clp( yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() + # Get s3 config + s3_config = get_s3_config() + s3_upload_failed = False + if StorageEngine.CLP == clp_storage_engine: compression_cmd = make_clp_command( clp_home=clp_home, @@ -251,26 +256,32 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None + last_line_decoded = False worker_output = { "total_uncompressed_size": 0, "total_compressed_size": 0, } - s3_config = get_s3_config() - - while True: + while not last_line_decoded: + stats = None line = proc.stdout.readline() - if not line: - break - stats = json.loads(line.decode("ascii")) - if last_archive_stats is not None and stats["id"] != last_archive_stats["id"]: + if line: + stats = json.loads(line.decode("ascii")) + else: + last_line_decoded = True + + if last_archive_stats is not None and ( + last_line_decoded or stats["id"] != last_archive_stats["id"] + ): if s3_config is not None: result = upload_single_file_archive_to_s3( last_archive_stats, archive_output_dir, s3_config ) if not result.success: - # TODO: think about how to handle S3 only error - continue + worker_output["error_message"] = result.error + s3_upload_failed = True + # Upon failure, skip updating the archive tags and job metadata. 
+ break # We've started a new archive so add the previous archive's last # reported size to the total @@ -290,30 +301,6 @@ def run_clp( last_archive_stats = stats - if last_archive_stats is not None: - if s3_config is not None: - result = upload_single_file_archive_to_s3( - last_archive_stats, archive_output_dir, s3_config - ) - if not result.success: - logger.error(f"THINK ABOUT WHAT TO DO HERE") - # TODO: think about how to handle S3 only error - - # Add the last archive's last reported size - worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] - worker_output["total_compressed_size"] += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() - # Wait for compression to finish return_code = proc.wait() if 0 != return_code: @@ -330,6 +317,8 @@ def run_clp( # Close stderr log file stderr_log_file.close() + if s3_upload_failed: + return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output else: @@ -349,9 +338,12 @@ def compress( ): clp_home_str = os.getenv("CLP_HOME") data_dir_str = os.getenv("CLP_DATA_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") logs_dir_str = os.getenv("CLP_LOGS_DIR") + archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") + if archive_output_dir_str is None: + archive_output_dir_str = "/tmp/archives" + # Set logging level clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py index 6909150d5..3a3be0ef0 100644 --- a/components/job-orchestration/job_orchestration/executor/s3_utils.py +++ b/components/job-orchestration/job_orchestration/executor/s3_utils.py @@ -5,6 +5,7 @@ from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result + def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: if not src_file.exists(): @@ -15,12 +16,13 @@ def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: s3_client_args = { "region_name": s3_config.region_name, "aws_access_key_id": s3_config.access_key_id, - "aws_secret_access_key": s3_config.secret_access_key + "aws_secret_access_key": s3_config.secret_access_key, } my_s3_client = boto3.client("s3", **s3_client_args) with open(src_file, "rb") as file_data: + # TODO: Consider retry? 
try: my_s3_client.put_object( Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name From 4e9529c4e9d2497e3d23bb90edad9e4afceb4acd Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:51:38 -0500 Subject: [PATCH 03/43] First trial for new config --- .../clp_package_utils/general.py | 12 +-- .../clp_package_utils/scripts/start_clp.py | 17 ++-- .../clp-py-utils/clp_py_utils/clp_config.py | 81 ++++++++++++++++--- .../package-template/src/etc/clp-config.yml | 4 +- 4 files changed, 83 insertions(+), 31 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 5fae8166f..64b684fe1 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -239,17 +239,19 @@ def generate_container_config( DockerMountType.BIND, clp_config.logs_directory, container_clp_config.logs_directory ) - container_clp_config.archive_output.directory = pathlib.Path("/") / "mnt" / "archive-output" + container_clp_config.archive_output.set_archive_directory( + pathlib.Path("/") / "mnt" / "archive-output" + ) if not is_path_already_mounted( clp_home, CONTAINER_CLP_HOME, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.archive_directory(), + container_clp_config.archive_output.archive_directory(), ): docker_mounts.archives_output_dir = DockerMount( DockerMountType.BIND, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.archive_directory(), + container_clp_config.archive_output.archive_directory(), ) container_clp_config.stream_output.directory = pathlib.Path("/") / "mnt" / "stream-output" diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 0acd3a4c3..6e519e63d 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -624,8 +624,6 @@ def start_compression_worker( num_cpus: int, mounts: CLPDockerMounts, ): - print(clp_config) - print(container_clp_config) celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" generic_start_worker( @@ -703,7 +701,7 @@ def generic_start_worker( container_logs_dir = container_clp_config.logs_directory / component_name # Create necessary directories - clp_config.archive_output.directory.mkdir(parents=True, exist_ok=True) + clp_config.archive_output.archive_directory().mkdir(parents=True, exist_ok=True) clp_config.stream_output.directory.mkdir(parents=True, exist_ok=True) clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" @@ -731,14 +729,15 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", + "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.archive_directory()}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): container_start_cmd.append("-e") container_start_cmd.append(f"{env_name}={env_value}") - s3_config = clp_config.archive_output.s3_config - if s3_config is not None: + if "s3" == clp_config.archive_output.storage.type: + s3_config = 
clp_config.archive_output.storage.s3_config container_start_cmd += [ "-e", f"ENABLE_S3_ARCHIVE=1", "-e", f"REGION_NAME={s3_config.region_name}", @@ -750,10 +749,6 @@ def generic_start_worker( "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" ] - else: - container_start_cmd += [ - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", - ] # fmt: on necessary_mounts = [ @@ -761,13 +756,11 @@ def generic_start_worker( mounts.data_dir, mounts.logs_dir, mounts.input_logs_dir, + mounts.archives_output_dir ] if worker_specific_mount: necessary_mounts.extend(worker_specific_mount) - if s3_config is None: - necessary_mounts.append(mounts.archives_output_dir) - for mount in necessary_mounts: if not mount: raise ValueError(f"Required mount configuration is empty: {necessary_mounts}") diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 07983d9f2..56e3caf99 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -1,6 +1,6 @@ import pathlib from enum import auto -from typing import Optional +from typing import Literal, Optional, Union from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator @@ -340,13 +340,64 @@ def validate_key_prefix(cls, field): return field -class ArchiveOutput(BaseModel): +class FSStorage(BaseModel): + type: Literal["fs"] directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" + + @validator("directory") + def validate_directory(cls, field): + if "" == field: + raise ValueError("directory can not be empty") + return field + + def make_config_path_absolute(self, clp_home: pathlib.Path): + self.directory = make_config_path_absolute(clp_home, self.directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["directory"] = str(d["directory"]) + return d + + +class S3Storage(BaseModel): + type: Literal["s3"] + staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" + s3_config: S3Config + + @validator("staging_directory") + def validate_staging_directory(cls, field): + if "" == field: + raise ValueError("staging_directory can not be empty") + return field + + @validator("s3_config") + def validate_s3_config(cls, field): + if None == field: + raise ValueError("s3_config must be provided") + return field + + def make_config_path_absolute(self, clp_home: pathlib.Path): + self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["staging_directory"] = str(d["staging_directory"]) + return d + + +class ArchiveOutput(BaseModel): + # TODO: For whatever weird reason, must first assign it to Null + storage: Union[FSStorage, S3Storage, None] target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB target_segment_size: int = 256 * 1024 * 1024 # 256 MB - s3_config: Optional[S3Config] = None + + @validator("storage") + def validate_storage(cls, field) -> bool: + if None == field: + raise ValueError("storage must be provided") + return field @validator("target_archive_size") def validate_target_archive_size(cls, field): @@ -372,14 +423,18 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def make_config_paths_absolute(self, clp_home: 
pathlib.Path): - self.directory = make_config_path_absolute(clp_home, self.directory) + def set_archive_directory(self, directory: pathlib.Path) -> None: + storage_config = self.storage + if "fs" == storage_config.type: + storage_config.directory = directory + else: + storage_config.staging_directory = directory - def dump_to_primitive_dict(self): - d = self.dict() - # Turn directory (pathlib.Path) into a primitive string - d["directory"] = str(d["directory"]) - return d + def archive_directory(self) -> pathlib.Path: + storage_config = self.storage + if "fs" == storage_config.type: + return storage_config.directory + return storage_config.staging_directory class StreamOutput(BaseModel): @@ -473,7 +528,7 @@ class CLPConfig(BaseModel): def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) - self.archive_output.make_config_paths_absolute(clp_home) + self.archive_output.storage.make_config_path_absolute(clp_home) self.stream_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) @@ -490,7 +545,7 @@ def validate_input_logs_dir(self): def validate_archive_output_dir(self): try: - validate_path_could_be_dir(self.archive_output.directory) + validate_path_could_be_dir(self.archive_output.archive_directory()) except ValueError as ex: raise ValueError(f"archive_output.directory is invalid: {ex}") @@ -566,7 +621,7 @@ def load_redis_credentials_from_file(self): def dump_to_primitive_dict(self): d = self.dict() - d["archive_output"] = self.archive_output.dump_to_primitive_dict() + d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["input_logs_directory"] = str(self.input_logs_directory) diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index f19b93463..22b03b889 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -66,7 +66,9 @@ # ## Where archives should be output to #archive_output: -# directory: "var/data/archives" +# storage: +# type: "fs" +# directory: "var/data/archives" # # # How much data CLP should try to compress into each archive # target_archive_size: 268435456 # 256 MB From e9cdea4055106d486af82b7c7754e43d1b7014c9 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:09:20 -0500 Subject: [PATCH 04/43] Further refactor and polishing --- .../clp_package_utils/general.py | 21 +++++++----- .../clp_package_utils/scripts/start_clp.py | 14 +++----- .../clp-py-utils/clp_py_utils/clp_config.py | 32 +++++++++++++------ .../clp-py-utils/clp_py_utils/s3_utils.py | 13 ++++++++ .../executor/compress/fs_compression_task.py | 7 ++-- 5 files changed, 54 insertions(+), 33 deletions(-) create mode 100644 components/clp-py-utils/clp_py_utils/s3_utils.py diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 64b684fe1..fbeb5de64 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -20,6 
+20,7 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, ) from clp_py_utils.core import ( @@ -28,6 +29,7 @@ read_yaml_config_file, validate_path_could_be_dir, ) +from clp_py_utils.s3_utils import verify_s3_config_for_archive_output from strenum import KebabCaseStrEnum # CONSTANTS @@ -239,19 +241,17 @@ def generate_container_config( DockerMountType.BIND, clp_config.logs_directory, container_clp_config.logs_directory ) - container_clp_config.archive_output.set_archive_directory( - pathlib.Path("/") / "mnt" / "archive-output" - ) + container_clp_config.archive_output.set_directory(pathlib.Path("/") / "mnt" / "archive-output") if not is_path_already_mounted( clp_home, CONTAINER_CLP_HOME, - clp_config.archive_output.archive_directory(), - container_clp_config.archive_output.archive_directory(), + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ): docker_mounts.archives_output_dir = DockerMount( DockerMountType.BIND, - clp_config.archive_output.archive_directory(), - container_clp_config.archive_output.archive_directory(), + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ) container_clp_config.stream_output.directory = pathlib.Path("/") / "mnt" / "stream-output" @@ -484,8 +484,13 @@ def validate_results_cache_config( def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() + storage_config = clp_config.archive_output.storage + if StorageType.S3 == storage_config.type: + result = verify_s3_config_for_archive_output(storage_config.s3_config) + if not result.success: + raise ValueError(f"S3 config verification failed: {result.error}") def validate_webui_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 6e519e63d..da609a23a 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -701,7 +701,7 @@ def generic_start_worker( container_logs_dir = container_clp_config.logs_directory / component_name # Create necessary directories - clp_config.archive_output.archive_directory().mkdir(parents=True, exist_ok=True) + clp_config.archive_output.get_directory().mkdir(parents=True, exist_ok=True) clp_config.stream_output.directory.mkdir(parents=True, exist_ok=True) clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" @@ -729,7 +729,8 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.archive_directory()}", + # need a way to remove this maybe? 
+ "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.get_directory()}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): @@ -755,8 +756,8 @@ def generic_start_worker( mounts.clp_home, mounts.data_dir, mounts.logs_dir, + mounts.archives_output_dir, mounts.input_logs_dir, - mounts.archives_output_dir ] if worker_specific_mount: necessary_mounts.extend(worker_specific_mount) @@ -1140,13 +1141,6 @@ def main(argv): ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) - # Might be a good place to verification for s3 config. - # if target in ( - # ALL_TARGET_NAME, - # COMPRESSION_WORKER_COMPONENT_NAME - # ): - # validate_s3_config(clp_config.archive_output, clp_home) - clp_config.validate_data_dir() clp_config.validate_logs_dir() except: diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 56e3caf99..00d0b6dbb 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -4,7 +4,7 @@ from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator -from strenum import KebabCaseStrEnum +from strenum import KebabCaseStrEnum, LowercaseStrEnum from .clp_logging import get_valid_logging_level, is_valid_logging_level from .core import ( @@ -48,6 +48,11 @@ class StorageEngine(KebabCaseStrEnum): CLP_S = auto() +class StorageType(LowercaseStrEnum): + FS = auto() + S3 = auto() + + VALID_STORAGE_ENGINES = [storage_engine.value for storage_engine in StorageEngine] @@ -341,7 +346,7 @@ def validate_key_prefix(cls, field): class FSStorage(BaseModel): - type: Literal["fs"] + type: Literal[StorageType.FS.value] directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" @validator("directory") @@ -360,7 +365,7 @@ def dump_to_primitive_dict(self): class S3Storage(BaseModel): - type: Literal["s3"] + type: Literal[StorageType.S3.value] staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" s3_config: S3Config @@ -423,16 +428,16 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def set_archive_directory(self, directory: pathlib.Path) -> None: + def set_directory(self, directory: pathlib.Path) -> None: storage_config = self.storage - if "fs" == storage_config.type: + if StorageType.FS == storage_config.type: storage_config.directory = directory else: storage_config.staging_directory = directory - def archive_directory(self) -> pathlib.Path: + def get_directory(self) -> pathlib.Path: storage_config = self.storage - if "fs" == storage_config.type: + if StorageType.FS == storage_config.type: return storage_config.directory return storage_config.staging_directory @@ -543,11 +548,18 @@ def validate_input_logs_dir(self): if not input_logs_dir.is_dir(): raise ValueError(f"input_logs_directory '{input_logs_dir}' is not a directory.") - def validate_archive_output_dir(self): + def validate_archive_output_config(self): + if ( + StorageType.S3 == self.archive_output.storage.type + and StorageEngine.CLP_S != self.package.storage_engine + ): + raise ValueError( + f"S3 storage is only supported with storage engine: {StorageEngine.CLP_S}" + ) try: - validate_path_could_be_dir(self.archive_output.archive_directory()) + validate_path_could_be_dir(self.archive_output.get_directory()) except ValueError as ex: - raise ValueError(f"archive_output.directory is invalid: {ex}") + raise ValueError(f"directory of 
storage config is invalid: {ex}") def validate_stream_output_dir(self): try: diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py new file mode 100644 index 000000000..905e30fac --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -0,0 +1,13 @@ +import boto3 +from botocore.exceptions import ClientError + +from clp_py_utils.clp_config import S3Config +from clp_py_utils.result import Result + + +def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: + # TODO: need to verify: + # 1. Have write priveldge so archive can be compressed + # 2. Have read priviledge so archive can be readed + # 3. bucket and region are the same, this should run into issue but not sure + return Result(success=True) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 7eb8e258d..87a079876 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -263,8 +263,8 @@ def run_clp( } while not last_line_decoded: - stats = None line = proc.stdout.readline() + stats: Dict[str, Any] = None if line: stats = json.loads(line.decode("ascii")) else: @@ -338,11 +338,8 @@ def compress( ): clp_home_str = os.getenv("CLP_HOME") data_dir_str = os.getenv("CLP_DATA_DIR") - logs_dir_str = os.getenv("CLP_LOGS_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") - if archive_output_dir_str is None: - archive_output_dir_str = "/tmp/archives" + logs_dir_str = os.getenv("CLP_LOGS_DIR") # Set logging level clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) From 9ba0a38f6a7134728ccd757274138e8bb338b223 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 22:40:47 -0500 Subject: [PATCH 05/43] Another small refactor --- .../clp_package_utils/general.py | 1 + .../clp_package_utils/scripts/start_clp.py | 12 +++++++++--- .../clp-py-utils/clp_py_utils/clp_config.py | 15 +++++---------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index fbeb5de64..05a67d497 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -486,6 +486,7 @@ def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() + storage_config = clp_config.archive_output.storage if StorageType.S3 == storage_config.type: result = verify_s3_config_for_archive_output(storage_config.s3_config) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index da609a23a..8351a8836 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -694,8 +694,6 @@ def generic_start_worker( if container_exists(container_name): return - validate_worker_config(clp_config) - logs_dir = clp_config.logs_directory / component_name logs_dir.mkdir(parents=True, exist_ok=True) container_logs_dir = container_clp_config.logs_directory / component_name @@ -729,7 
+727,7 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - # need a way to remove this maybe? + # need a way to remove this maybe "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.get_directory()}", ] if worker_specific_env: @@ -756,6 +754,8 @@ def generic_start_worker( mounts.clp_home, mounts.data_dir, mounts.logs_dir, + # need a way to remove this maybe, since reader doesn't need it if it is staged. + # one option is to move it to the worker_specific_mount mounts.archives_output_dir, mounts.input_logs_dir, ] @@ -1140,6 +1140,12 @@ def main(argv): QUERY_WORKER_COMPONENT_NAME, ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) + if target in ( + ALL_TARGET_NAME, + COMPRESSION_WORKER_COMPONENT_NAME, + QUERY_WORKER_COMPONENT_NAME, + ): + validate_worker_config(clp_config) clp_config.validate_data_dir() clp_config.validate_logs_dir() diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 00d0b6dbb..e2398b0be 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -310,12 +310,8 @@ class Queue(BaseModel): class S3Config(BaseModel): - # TODO: need to think of a way to verify the account and - # keys. Maybe need to be outside of the config because every config - # can have different privilege - # Things to verify: - # 1. Can only be enabled if clp-s is used. - # 2. Does key_prefix need to end with '/'? maybe it doesn't but will cause some issue for list bucket. + # Todo: + # Does key_prefix need to end with '/'? maybe it doesn't. # Required fields region_name: str @@ -346,7 +342,7 @@ def validate_key_prefix(cls, field): class FSStorage(BaseModel): - type: Literal[StorageType.FS.value] + type: Literal[StorageType.FS.value] = StorageType.FS.value directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" @validator("directory") @@ -365,7 +361,7 @@ def dump_to_primitive_dict(self): class S3Storage(BaseModel): - type: Literal[StorageType.S3.value] + type: Literal[StorageType.S3.value] = StorageType.S3.value staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" s3_config: S3Config @@ -391,8 +387,7 @@ def dump_to_primitive_dict(self): class ArchiveOutput(BaseModel): - # TODO: For whatever weird reason, must first assign it to Null - storage: Union[FSStorage, S3Storage, None] + storage: Union[FSStorage, S3Storage] = FSStorage() target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB From 58befefe2390ba4655274c851108e36ba4a284de Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:52:23 -0500 Subject: [PATCH 06/43] small refactor again --- .../executor/compress/fs_compression_task.py | 4 ++-- .../job_orchestration/executor/s3_utils.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 87a079876..756c3f9b3 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ 
b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -264,14 +264,14 @@ def run_clp( while not last_line_decoded: line = proc.stdout.readline() - stats: Dict[str, Any] = None + stats: Optional[Dict[str, Any]] = None if line: stats = json.loads(line.decode("ascii")) else: last_line_decoded = True if last_archive_stats is not None and ( - last_line_decoded or stats["id"] != last_archive_stats["id"] + None is stats or stats["id"] != last_archive_stats["id"] ): if s3_config is not None: result = upload_single_file_archive_to_s3( diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py index 3a3be0ef0..dd6a215ba 100644 --- a/components/job-orchestration/job_orchestration/executor/s3_utils.py +++ b/components/job-orchestration/job_orchestration/executor/s3_utils.py @@ -1,12 +1,15 @@ from pathlib import Path import boto3 +from botocore.config import Config from botocore.exceptions import ClientError from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result -def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: +def s3_put( + s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 +) -> Result: if not src_file.exists(): return Result(success=False, error=f"{src_file} doesn't exist") @@ -19,10 +22,11 @@ def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: "aws_secret_access_key": s3_config.secret_access_key, } - my_s3_client = boto3.client("s3", **s3_client_args) + config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) + + my_s3_client = boto3.client("s3", **s3_client_args, config=config) with open(src_file, "rb") as file_data: - # TODO: Consider retry? try: my_s3_client.put_object( Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name From 35ec0c37569b6d558b994c3d4fb41c70eeeeba8c Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:47:17 -0500 Subject: [PATCH 07/43] Combine s3 utils --- .../clp-py-utils/clp_py_utils/s3_utils.py | 38 ++++++++++++++++- .../executor/compress/fs_compression_task.py | 2 +- .../job_orchestration/executor/s3_utils.py | 41 ------------------- 3 files changed, 38 insertions(+), 43 deletions(-) delete mode 100644 components/job-orchestration/job_orchestration/executor/s3_utils.py diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 905e30fac..83346e4de 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -1,6 +1,8 @@ +from pathlib import Path + import boto3 +from botocore.config import Config from botocore.exceptions import ClientError - from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result @@ -11,3 +13,37 @@ def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: # 2. Have read priviledge so archive can be readed # 3. 
bucket and region are the same, this should run into issue but not sure return Result(success=True) + + +def s3_put( + s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 +) -> Result: + + if not src_file.exists(): + return Result(success=False, error=f"{src_file} doesn't exist") + if not src_file.is_file(): + return Result(success=False, error=f"{src_file} is not a file") + + s3_client_args = { + "region_name": s3_config.region_name, + "aws_access_key_id": s3_config.access_key_id, + "aws_secret_access_key": s3_config.secret_access_key, + } + + config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) + + my_s3_client = boto3.client("s3", **s3_client_args, config=config) + + with open(src_file, "rb") as file_data: + try: + my_s3_client.put_object( + Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name + ) + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Result(success=False, error=f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Result(success=False, error=f"An unexpected error occurred: {e}") + + return Result(success=True) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 756c3f9b3..a042a3611 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -19,8 +19,8 @@ from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.result import Result from clp_py_utils.sql_adapter import SQL_Adapter +from clp_py_utils.s3_utils import s3_put from job_orchestration.executor.compress.celery import app -from job_orchestration.executor.s3_utils import s3_put from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py deleted file mode 100644 index dd6a215ba..000000000 --- a/components/job-orchestration/job_orchestration/executor/s3_utils.py +++ /dev/null @@ -1,41 +0,0 @@ -from pathlib import Path - -import boto3 -from botocore.config import Config -from botocore.exceptions import ClientError -from clp_py_utils.clp_config import S3Config -from clp_py_utils.result import Result - - -def s3_put( - s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 -) -> Result: - - if not src_file.exists(): - return Result(success=False, error=f"{src_file} doesn't exist") - if not src_file.is_file(): - return Result(success=False, error=f"{src_file} is not a file") - - s3_client_args = { - "region_name": s3_config.region_name, - "aws_access_key_id": s3_config.access_key_id, - "aws_secret_access_key": s3_config.secret_access_key, - } - - config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) - - my_s3_client = boto3.client("s3", **s3_client_args, config=config) - - with open(src_file, "rb") as file_data: - try: - my_s3_client.put_object( - Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name - ) - except ClientError as 
e: - error_code = e.response["Error"]["Code"] - error_message = e.response["Error"]["Message"] - return Result(success=False, error=f"ClientError: {error_code} - {error_message}") - except Exception as e: - return Result(success=False, error=f"An unexpected error occurred: {e}") - - return Result(success=True) From 5d57b101217be05d54a9bde581a4021686f668d2 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:47:46 -0500 Subject: [PATCH 08/43] Support handling S3 error message --- .../initialize-orchestration-db.py | 2 +- .../compress/compression_scheduler.py | 22 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py index 1ed727367..2c8133e8a 100644 --- a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py +++ b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -52,7 +52,7 @@ def main(argv): CREATE TABLE IF NOT EXISTS `{COMPRESSION_JOBS_TABLE_NAME}` ( `id` INT NOT NULL AUTO_INCREMENT, `status` INT NOT NULL DEFAULT '{CompressionJobStatus.PENDING}', - `status_msg` VARCHAR(255) NOT NULL DEFAULT '', + `status_msg` VARCHAR(512) NOT NULL DEFAULT '', `creation_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), `start_time` DATETIME(3) NULL DEFAULT NULL, `duration` FLOAT NULL DEFAULT NULL, diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 62b7a27fc..2de3e835b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -53,13 +53,14 @@ def update_compression_task_metadata(db_cursor, task_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_TASKS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={task_id} + UPDATE {COMPRESSION_TASKS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = [v for v in kv.values()] + [task_id] + db_cursor.execute(query, values) def update_compression_job_metadata(db_cursor, job_id, kv): @@ -67,13 +68,14 @@ def update_compression_job_metadata(db_cursor, job_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_JOBS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={job_id} + UPDATE {COMPRESSION_JOBS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = [v for v in kv.values()] + [job_id] + db_cursor.execute(query, values) def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection_config): From 999130766a457480a3b0f948ba6ff459715c3797 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:12:06 -0500 Subject: [PATCH 09/43] Slight logging modification --- .../executor/compress/fs_compression_task.py | 8 +++++--- 1 file changed, 5 
insertions(+), 3 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index a042a3611..f808a27ee 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -80,15 +80,16 @@ def upload_single_file_archive_to_s3( archive_dir: pathlib.Path, s3_config: S3Config, ) -> Result: - logger.info("Starting to upload to S3...") archive_id = archive_stats["id"] + + logger.info(f"Starting to upload archive {archive_id} to S3...") src_file = archive_dir / archive_id result = s3_put(s3_config, src_file, archive_id) if not result.success: - logger.error(f"Failed to upload to S3: {result.error}") + logger.error(f"Failed to upload archive {archive_id}: {result.error}") return result - logger.info("Finished uploading to S3...") + logger.info(f"Finished uploading archive {archive_id} to S3...") src_file.unlink() return Result(success=True) @@ -318,6 +319,7 @@ def run_clp( stderr_log_file.close() if s3_upload_failed: + logger.error(f"Failed to upload to S3.") return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output From 5d237907d0e6d679865f71e68bcb0a8d79339109 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:12:32 -0500 Subject: [PATCH 10/43] Linter --- components/clp-py-utils/clp_py_utils/s3_utils.py | 1 + .../job_orchestration/executor/compress/fs_compression_task.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 83346e4de..df2633214 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -3,6 +3,7 @@ import boto3 from botocore.config import Config from botocore.exceptions import ClientError + from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index f808a27ee..120f9179a 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -18,8 +18,8 @@ ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.result import Result -from clp_py_utils.sql_adapter import SQL_Adapter from clp_py_utils.s3_utils import s3_put +from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress From b4bb2af1ecb0002b23049986e90769c9f87e3268 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:15:52 -0500 Subject: [PATCH 11/43] Add extra verification --- components/clp-py-utils/clp_py_utils/clp_config.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index e2398b0be..205a65ec5 
100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -310,15 +310,10 @@ class Queue(BaseModel): class S3Config(BaseModel): - # Todo: - # Does key_prefix need to end with '/'? maybe it doesn't. - - # Required fields region_name: str bucket: str key_prefix: str - # Optional fields access_key_id: Optional[str] = None secret_access_key: Optional[str] = None @@ -338,6 +333,8 @@ def validate_bucket(cls, field): def validate_key_prefix(cls, field): if field == "": raise ValueError("key_prefix is not provided") + if not field.endswith("/"): + raise ValueError(r'key_prefix must end with "/"') return field From f41c5587755fc672430c33e4fcd5a190c12a4ed0 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:17:11 -0500 Subject: [PATCH 12/43] Update components/clp-py-utils/clp_py_utils/clp_config.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- components/clp-py-utils/clp_py_utils/clp_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 205a65ec5..3694c8312 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -497,7 +497,7 @@ def validate_port(cls, field): class CLPConfig(BaseModel): - execution_container: Optional[str] + execution_container: Optional[str] = None input_logs_directory: pathlib.Path = pathlib.Path("/") From ce5a667aa372c906aceca1c2880452f1f10807c7 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:22:27 -0500 Subject: [PATCH 13/43] do nothing for now --- components/clp-py-utils/clp_py_utils/s3_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index df2633214..0c4356407 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -9,10 +9,7 @@ def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: - # TODO: need to verify: - # 1. Have write priveldge so archive can be compressed - # 2. Have read priviledge so archive can be readed - # 3. 
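A cut-down, runnable sketch of the validator style used throughout clp_config.py, showing how the trailing-slash rule on key_prefix surfaces as a pydantic ValidationError; the model below is illustrative, not the real S3Config:

from pydantic import BaseModel, ValidationError, validator


class S3PrefixConfigSketch(BaseModel):
    bucket: str
    key_prefix: str

    @validator("key_prefix")
    def validate_key_prefix(cls, field):
        if field == "":
            raise ValueError("key_prefix cannot be empty")
        if not field.endswith("/"):
            raise ValueError('key_prefix must end with "/"')
        return field


if __name__ == "__main__":
    S3PrefixConfigSketch(bucket="my-bucket", key_prefix="archives/")  # accepted
    try:
        S3PrefixConfigSketch(bucket="my-bucket", key_prefix="archives")  # rejected
    except ValidationError as err:
        print(err)
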
bucket and region are the same, this should run into issue but not sure + # TODO: properly verify the s3 config return Result(success=True) From f05dc889237cc632f72865a13d3a111a09722aae Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:53:54 -0500 Subject: [PATCH 14/43] backup changes for worker config --- .../clp_package_utils/general.py | 11 ++++ .../clp_package_utils/scripts/start_clp.py | 44 ++++----------- .../clp-py-utils/clp_py_utils/clp_config.py | 17 ++++++ .../executor/compress/fs_compression_task.py | 55 ++++++++++--------- 4 files changed, 68 insertions(+), 59 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 05a67d497..507e1e434 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -14,6 +14,7 @@ from clp_py_utils.clp_config import ( CLP_DEFAULT_CREDENTIALS_FILE_PATH, CLPConfig, + WorkerConfig, DB_COMPONENT_NAME, LOG_VIEWER_WEBUI_COMPONENT_NAME, QUEUE_COMPONENT_NAME, @@ -270,6 +271,16 @@ def generate_container_config( return container_clp_config, docker_mounts +def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: + worker_config = WorkerConfig() + worker_config.archive_output = clp_config.archive_output.copy(deep=True) + worker_config.stream_output = clp_config.stream_output.copy(deep=True) + worker_config.package = clp_config.package.copy(deep=True) + worker_config.data_directory = clp_config.data_directory + + return worker_config + + def dump_container_config( container_clp_config: CLPConfig, clp_config: CLPConfig, container_name: str ) -> Tuple[pathlib.Path, pathlib.Path]: diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 8351a8836..f94fefb0b 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -42,6 +42,7 @@ DockerMount, DockerMountType, generate_container_config, + generate_worker_config, get_clp_home, is_container_exited, is_container_running, @@ -638,7 +639,6 @@ def start_compression_worker( num_cpus, mounts, None, - None, ) @@ -653,10 +653,6 @@ def start_query_worker( celery_route = f"{QueueName.QUERY}" query_worker_mount = [mounts.stream_output_dir] - query_worker_env = { - "CLP_STREAM_OUTPUT_DIR": container_clp_config.stream_output.directory, - "CLP_STREAM_COLLECTION_NAME": clp_config.results_cache.stream_collection_name, - } generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -669,7 +665,6 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, - query_worker_env, query_worker_mount, ) @@ -685,8 +680,7 @@ def generic_start_worker( redis_database: int, num_cpus: int, mounts: CLPDockerMounts, - worker_specific_env: Dict[str, Any], - worker_specific_mount: List[Optional[DockerMount]], + worker_specific_mount: Optional[List[Optional[DockerMount]]], ): logger.info(f"Starting {component_name}...") @@ -694,6 +688,12 @@ def generic_start_worker( if container_exists(container_name): return + container_config_filename = f"{container_name}.yml" + container_config_file_path = clp_config.logs_directory / container_config_filename + container_worker_config = generate_worker_config(container_clp_config) + with open(container_config_file_path, "w") as f: + 
yaml.safe_dump(container_worker_config.dump_to_primitive_dict(), f) + logs_dir = clp_config.logs_directory / component_name logs_dir.mkdir(parents=True, exist_ok=True) container_logs_dir = container_clp_config.logs_directory / component_name @@ -723,39 +723,19 @@ def generic_start_worker( f"{container_clp_config.redis.host}:{container_clp_config.redis.port}/{redis_database}" ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", - "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", + "-e", f"WORKER_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", + # Follow the other method. "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", - "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - # need a way to remove this maybe - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.get_directory()}", ] - if worker_specific_env: - for env_name, env_value in worker_specific_env.items(): - container_start_cmd.append("-e") - container_start_cmd.append(f"{env_name}={env_value}") - - if "s3" == clp_config.archive_output.storage.type: - s3_config = clp_config.archive_output.storage.s3_config - container_start_cmd += [ - "-e", f"ENABLE_S3_ARCHIVE=1", - "-e", f"REGION_NAME={s3_config.region_name}", - "-e", f"BUCKET={s3_config.bucket}", - "-e", f"KEY_PREFIX={s3_config.key_prefix}" - ] - if s3_config.secret_access_key is not None and s3_config.secret_access_key is not None: - container_start_cmd += [ - "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", - "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" - ] # fmt: on necessary_mounts = [ mounts.clp_home, mounts.data_dir, mounts.logs_dir, - # need a way to remove this maybe, since reader doesn't need it if it is staged. - # one option is to move it to the worker_specific_mount + # TODO: need a way to remove this maybe, since reader doesn't need it if it + # is staged. one option is to move it to the worker_specific_mount mounts.archives_output_dir, mounts.input_logs_dir, ] diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 3694c8312..e33c29edf 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -633,3 +633,20 @@ def dump_to_primitive_dict(self): d["data_directory"] = str(self.data_directory) d["logs_directory"] = str(self.logs_directory) return d + + +class WorkerConfig(BaseModel): + package: Package = Package() + archive_output: ArchiveOutput = ArchiveOutput() + # Only needed by query worker. Consider inheriting at some point. 
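The config hand-off introduced here (the launcher dumps a pared-down config to YAML, and the worker reads it back through WORKER_CONFIG_PATH) can be sketched in isolation like this; the model, its fields, and the file handling are simplified stand-ins for the real WorkerConfig:

import os
import pathlib

import yaml
from pydantic import BaseModel


class MiniWorkerConfig(BaseModel):
    storage_engine: str = "clp-s"
    data_directory: pathlib.Path = pathlib.Path("var") / "data"

    def dump_to_primitive_dict(self) -> dict:
        d = self.dict()
        # yaml.safe_dump cannot serialize pathlib.Path objects, so paths
        # are turned into plain strings before dumping.
        d["data_directory"] = str(self.data_directory)
        return d


def write_worker_config(config: MiniWorkerConfig, path: pathlib.Path) -> None:
    with open(path, "w") as f:
        yaml.safe_dump(config.dump_to_primitive_dict(), f)


def load_worker_config() -> MiniWorkerConfig:
    path = pathlib.Path(os.environ["WORKER_CONFIG_PATH"])
    with open(path) as f:
        return MiniWorkerConfig.parse_obj(yaml.safe_load(f))
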
+ stream_output: StreamOutput = StreamOutput() + + data_directory: pathlib.Path = pathlib.Path("var") / "data" + + def dump_to_primitive_dict(self): + d = self.dict() + d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() + d["stream_output"] = self.stream_output.dump_to_primitive_dict() + # Turn paths into primitive strings + d["data_directory"] = str(self.data_directory) + return d \ No newline at end of file diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 120f9179a..66a88ad18 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -15,8 +15,11 @@ Database, S3Config, StorageEngine, + StorageType, + WorkerConfig ) from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.core import read_yaml_config_file from clp_py_utils.result import Result from clp_py_utils.s3_utils import s3_put from clp_py_utils.sql_adapter import SQL_Adapter @@ -24,6 +27,7 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult +from pydantic import ValidationError # Setup logging logger = get_task_logger(__name__) @@ -94,23 +98,6 @@ def upload_single_file_archive_to_s3( return Result(success=True) -def get_s3_config() -> Optional[S3Config]: - enable_s3_write = os.getenv("ENABLE_S3_ARCHIVE") - if enable_s3_write is None: - return None - - # TODO: this method is very errorprone since it doesn't check individual members - # Let's leave this for now before we properly implement the config file. 
- s3_config = S3Config( - region_name=os.getenv("REGION_NAME"), - bucket=os.getenv("BUCKET"), - key_prefix=os.getenv("KEY_PREFIX"), - access_key_id=os.getenv("ACCESS_KEY_ID"), - secret_access_key=os.getenv("SECRET_ACCESS_KEY"), - ) - return s3_config - - def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -169,6 +156,7 @@ def make_clp_s_command( def run_clp( + worker_config: WorkerConfig, clp_config: ClpIoConfig, clp_home: pathlib.Path, data_dir: pathlib.Path, @@ -184,6 +172,7 @@ def run_clp( """ Compresses files from an FS into archives on an FS + :param worker_config: WorkerConfig :param clp_config: ClpIoConfig :param clp_home: :param data_dir: @@ -197,7 +186,7 @@ def run_clp( :param clp_metadata_db_connection_config :return: tuple -- (whether compression was successful, output messages) """ - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + clp_storage_engine = worker_config.package.storage_engine instance_id_str = f"compression-job-{job_id}-task-{task_id}" @@ -208,7 +197,8 @@ def run_clp( db_config_file.close() # Get s3 config - s3_config = get_s3_config() + storage_config = worker_config.archive_output.storage + s3_config = storage_config.s3_config if StorageType.S3 == storage_config.type else None s3_upload_failed = False if StorageEngine.CLP == clp_storage_engine: @@ -338,10 +328,20 @@ def compress( paths_to_compress_json: str, clp_metadata_db_connection_config, ): - clp_home_str = os.getenv("CLP_HOME") - data_dir_str = os.getenv("CLP_DATA_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") - logs_dir_str = os.getenv("CLP_LOGS_DIR") + clp_home = pathlib.Path(os.getenv("CLP_HOME")) + logs_dir = pathlib.Path(os.getenv("CLP_LOGS_DIR")) + + # Load configuration + worker_config_path = pathlib.Path(os.getenv("WORKER_CONFIG_PATH")) + try: + worker_config = WorkerConfig.parse_obj(read_yaml_config_file(worker_config_path)) + except ValidationError as err: + logger.error(err) + return -1 + except Exception as ex: + logger.error(ex) + return -1 + # Set logging level clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) @@ -355,11 +355,12 @@ def compress( start_time = datetime.datetime.now() logger.info(f"[job_id={job_id} task_id={task_id}] COMPRESSION STARTED.") compression_task_status, worker_output = run_clp( + worker_config, clp_io_config, - pathlib.Path(clp_home_str), - pathlib.Path(data_dir_str), - pathlib.Path(archive_output_dir_str), - pathlib.Path(logs_dir_str), + clp_home, + worker_config.data_directory, + worker_config.archive_output.get_directory(), + logs_dir, job_id, task_id, tag_ids, From abf5dde5d337136b2acffc100808a92b79c06e88 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:18:12 -0500 Subject: [PATCH 15/43] More support --- .../clp_package_utils/general.py | 1 + .../clp-py-utils/clp_py_utils/clp_config.py | 7 +- .../executor/compress/fs_compression_task.py | 64 +++++++++++-------- .../executor/query/extract_stream_task.py | 46 ++++++++----- .../executor/query/fs_search_task.py | 38 +++++++---- .../job_orchestration/executor/query/utils.py | 5 +- .../job_orchestration/executor/utils.py | 18 ++++++ 7 files changed, 117 insertions(+), 62 deletions(-) create mode 100644 components/job-orchestration/job_orchestration/executor/utils.py diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 507e1e434..81bc238aa 100644 --- a/components/clp-package-utils/clp_package_utils/general.py 
+++ b/components/clp-package-utils/clp_package_utils/general.py @@ -276,6 +276,7 @@ def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: worker_config.archive_output = clp_config.archive_output.copy(deep=True) worker_config.stream_output = clp_config.stream_output.copy(deep=True) worker_config.package = clp_config.package.copy(deep=True) + worker_config.results_cache = clp_config.results_cache.copy(deep=True) worker_config.data_directory = clp_config.data_directory return worker_config diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index e33c29edf..a4ca08a7d 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -638,11 +638,12 @@ def dump_to_primitive_dict(self): class WorkerConfig(BaseModel): package: Package = Package() archive_output: ArchiveOutput = ArchiveOutput() - # Only needed by query worker. Consider inheriting at some point. - stream_output: StreamOutput = StreamOutput() - data_directory: pathlib.Path = pathlib.Path("var") / "data" + # Only needed by query worker. Consider inheriting at some point. + stream_output = StreamOutput() + results_cache: ResultsCache = ResultsCache() + def dump_to_primitive_dict(self): d = self.dict() d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 66a88ad18..244b77ebe 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -135,12 +135,12 @@ def make_clp_s_command( archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, + enable_s3_write: bool, ): # fmt: off compression_cmd = [ str(clp_home / "bin" / "clp-s"), "c", str(archive_output_dir), - "--single-file-archive", "--print-archive-stats", "--target-encoded-size", str(clp_config.output.target_segment_size + clp_config.output.target_dictionaries_size), @@ -148,6 +148,9 @@ def make_clp_s_command( ] # fmt: on + if enable_s3_write: + compression_cmd.append("--single-file-archive") + if clp_config.input.timestamp_key is not None: compression_cmd.append("--timestamp-key") compression_cmd.append(clp_config.input.timestamp_key) @@ -159,8 +162,6 @@ def run_clp( worker_config: WorkerConfig, clp_config: ClpIoConfig, clp_home: pathlib.Path, - data_dir: pathlib.Path, - archive_output_dir: pathlib.Path, logs_dir: pathlib.Path, job_id: int, task_id: int, @@ -175,8 +176,6 @@ def run_clp( :param worker_config: WorkerConfig :param clp_config: ClpIoConfig :param clp_home: - :param data_dir: - :param archive_output_dir: :param logs_dir: :param job_id: :param task_id: @@ -186,10 +185,12 @@ def run_clp( :param clp_metadata_db_connection_config :return: tuple -- (whether compression was successful, output messages) """ - clp_storage_engine = worker_config.package.storage_engine - instance_id_str = f"compression-job-{job_id}-task-{task_id}" + clp_storage_engine = worker_config.package.storage_engine + data_dir = worker_config.data_directory + archive_output_dir = worker_config.archive_output.get_directory() + # Generate database config file for clp db_config_file_path = data_dir / f"{instance_id_str}-db-config.yml" db_config_file = open(db_config_file_path, "w") @@ 
-197,9 +198,18 @@ def run_clp( db_config_file.close() # Get s3 config - storage_config = worker_config.archive_output.storage - s3_config = storage_config.s3_config if StorageType.S3 == storage_config.type else None - s3_upload_failed = False + s3_config = None + enable_s3_write = False + s3_write_failed = False + storage_type = worker_config.archive_output.storage.type + if StorageType.S3 == storage_type: + # This should be caught by start-clp and could be redundant, but let's be safe for now. + if StorageEngine.CLP == clp_storage_engine: + logger.error(f"S3 is not supported for {clp_storage_engine}") + return False, {"error_message": f"S3 is not supported for {clp_storage_engine}"} + + s3_config = worker_config.archive_output.storage.s3_config + enable_s3_write = True if StorageEngine.CLP == clp_storage_engine: compression_cmd = make_clp_command( @@ -214,6 +224,7 @@ def run_clp( archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, + enable_s3_write=enable_s3_write ) else: logger.error(f"Unsupported storage engine {clp_storage_engine}") @@ -264,13 +275,13 @@ def run_clp( if last_archive_stats is not None and ( None is stats or stats["id"] != last_archive_stats["id"] ): - if s3_config is not None: + if enable_s3_write: result = upload_single_file_archive_to_s3( last_archive_stats, archive_output_dir, s3_config ) if not result.success: worker_output["error_message"] = result.error - s3_upload_failed = True + s3_write_failed = True # Upon failure, skip updating the archive tags and job metadata. break @@ -308,7 +319,7 @@ def run_clp( # Close stderr log file stderr_log_file.close() - if s3_upload_failed: + if s3_write_failed: logger.error(f"Failed to upload to S3.") return CompressionTaskStatus.FAILED, worker_output if compression_successful: @@ -329,23 +340,24 @@ def compress( clp_metadata_db_connection_config, ): clp_home = pathlib.Path(os.getenv("CLP_HOME")) + + # Set logging level logs_dir = pathlib.Path(os.getenv("CLP_LOGS_DIR")) + clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) + set_logging_level(logger, clp_logging_level) # Load configuration - worker_config_path = pathlib.Path(os.getenv("WORKER_CONFIG_PATH")) try: - worker_config = WorkerConfig.parse_obj(read_yaml_config_file(worker_config_path)) - except ValidationError as err: - logger.error(err) - return -1 + worker_config = WorkerConfig.parse_obj(read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH")))) except Exception as ex: - logger.error(ex) - return -1 - - - # Set logging level - clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) - set_logging_level(logger, clp_logging_level) + error_msg = "Failed to load worker config" + logger.exception(error_msg) + return CompressionTaskResult( + task_id=task_id, + status=CompressionTaskStatus.FAILED, + duration=0, + error_message=error_msg + ) clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) paths_to_compress = PathsToCompress.parse_raw(paths_to_compress_json) @@ -358,8 +370,6 @@ def compress( worker_config, clp_io_config, clp_home, - worker_config.data_directory, - worker_config.archive_output.get_directory(), logs_dir, job_id, task_id, diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 423ebb757..10c98b457 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ 
b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -5,12 +5,13 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter +from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig @@ -21,15 +22,17 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, - stream_output_dir: Path, job_config: dict, results_cache_uri: str, - stream_collection_name: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + stream_output_dir = worker_config.stream_output.directory + stream_collection_name = worker_config.results_cache.stream_collection_name + if StorageEngine.CLP == storage_engine: logger.info("Starting IR extraction") extract_ir_config = ExtractIrJobConfig.parse_obj(job_config) @@ -97,28 +100,37 @@ def extract_stream( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + logger.error(f"Extraction is not supported for S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") - stream_output_dir = Path(os.getenv("CLP_STREAM_OUTPUT_DIR")) - stream_collection_name = os.getenv("CLP_STREAM_COLLECTION_NAME") task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, - stream_output_dir=stream_output_dir, job_config=job_config, results_cache_uri=results_cache_uri, - stream_collection_name=stream_collection_name, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 598bfdcfc..6f0337b08 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -5,14 +5,15 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, 
StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -21,14 +22,16 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, search_config: SearchJobConfig, results_cache_uri: str, results_collection: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + if StorageEngine.CLP == storage_engine: command = [str(clp_home / "bin" / "clo"), "s", str(archives_dir / archive_id)] if search_config.path_filter is not None: @@ -116,26 +119,39 @@ def search( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + logger.error(f"Search is not supported for S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") search_config = SearchJobConfig.parse_obj(job_config) task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, search_config=search_config, results_cache_uri=results_cache_uri, results_collection=job_id, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 69d22398e..523abbe00 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -19,14 +19,11 @@ def get_task_log_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Pat return worker_logs_dir / f"{task_id}-clo.log" -def report_command_creation_failure( +def report_task_failure( sql_adapter: SQL_Adapter, - logger: Logger, - task_name: str, task_id: int, start_time: datetime.datetime, ): - logger.error(f"Error creating {task_name} command") task_status = QueryTaskStatus.FAILED update_query_task_metadata( sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py new file mode 100644 index 000000000..fd5cc27ed --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -0,0 +1,18 @@ +from typing import 
Optional +from logging import Logger +from clp_py_utils.clp_config import WorkerConfig +from clp_py_utils.core import read_yaml_config_file +from pathlib import Path +def try_load_worker_config( + config_path_str: str, + logger: Logger, +) -> Optional[WorkerConfig]: + if config_path_str is None: + logger.error("config_path_str can't be empty") + return None + + try: + return WorkerConfig.parse_obj(read_yaml_config_file(Path(config_path_str))) + except Exception: + logger.exception("Failed to load worker config") + return None \ No newline at end of file From 7d344561054f4f306ce56270f4796b7fe34a7a97 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:22:44 -0500 Subject: [PATCH 16/43] Remove unnecssary change --- .../clp-package-utils/clp_package_utils/scripts/start_clp.py | 2 +- .../job_orchestration/executor/compress/fs_compression_task.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index f94fefb0b..0b665c717 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -724,9 +724,9 @@ def generic_start_worker( ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", "-e", f"WORKER_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", - # Follow the other method. "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", + "-u", f"{os.getuid()}:{os.getgid()}", ] # fmt: on diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 244b77ebe..b13a2257b 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -27,7 +27,6 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult -from pydantic import ValidationError # Setup logging logger = get_task_logger(__name__) From a7afd0d87805e141e0ec837de533ca54b59fdb10 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:24:25 -0500 Subject: [PATCH 17/43] Linter --- .../clp-package-utils/clp_package_utils/general.py | 2 +- components/clp-py-utils/clp_py_utils/clp_config.py | 2 +- .../executor/compress/fs_compression_task.py | 10 ++++++---- .../executor/query/extract_stream_task.py | 2 +- .../job_orchestration/executor/utils.py | 9 ++++++--- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 81bc238aa..e1ffee795 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -14,7 +14,6 @@ from clp_py_utils.clp_config import ( CLP_DEFAULT_CREDENTIALS_FILE_PATH, CLPConfig, - WorkerConfig, DB_COMPONENT_NAME, LOG_VIEWER_WEBUI_COMPONENT_NAME, QUEUE_COMPONENT_NAME, @@ -23,6 +22,7 @@ RESULTS_CACHE_COMPONENT_NAME, StorageType, WEBUI_COMPONENT_NAME, + WorkerConfig, ) from 
clp_py_utils.core import ( get_config_value, diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index a4ca08a7d..226d49d2d 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -650,4 +650,4 @@ def dump_to_primitive_dict(self): d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) - return d \ No newline at end of file + return d diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index b13a2257b..f96ba72f1 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -16,7 +16,7 @@ S3Config, StorageEngine, StorageType, - WorkerConfig + WorkerConfig, ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.core import read_yaml_config_file @@ -223,7 +223,7 @@ def run_clp( archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, - enable_s3_write=enable_s3_write + enable_s3_write=enable_s3_write, ) else: logger.error(f"Unsupported storage engine {clp_storage_engine}") @@ -347,7 +347,9 @@ def compress( # Load configuration try: - worker_config = WorkerConfig.parse_obj(read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH")))) + worker_config = WorkerConfig.parse_obj( + read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH"))) + ) except Exception as ex: error_msg = "Failed to load worker config" logger.exception(error_msg) @@ -355,7 +357,7 @@ def compress( task_id=task_id, status=CompressionTaskStatus.FAILED, duration=0, - error_message=error_msg + error_message=error_msg, ) clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 10c98b457..a1df2d5bc 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -8,12 +8,12 @@ from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter -from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index fd5cc27ed..e00e7ca21 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -1,8 +1,11 @@ -from typing import Optional from logging import Logger +from pathlib import Path +from typing import Optional + 
from clp_py_utils.clp_config import WorkerConfig from clp_py_utils.core import read_yaml_config_file -from pathlib import Path + + def try_load_worker_config( config_path_str: str, logger: Logger, @@ -15,4 +18,4 @@ def try_load_worker_config( return WorkerConfig.parse_obj(read_yaml_config_file(Path(config_path_str))) except Exception: logger.exception("Failed to load worker config") - return None \ No newline at end of file + return None From 99d30940391b0a603221d89d7133990070efd2d2 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:41:48 -0500 Subject: [PATCH 18/43] Handle mount for fs & S3 --- .../clp_package_utils/scripts/start_clp.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 0b665c717..7e33f9f55 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -29,6 +29,7 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, ) from job_orchestration.scheduler.constants import QueueName @@ -627,6 +628,9 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" + + compression_worker_mount = [mounts.archives_output_dir] + generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, @@ -638,7 +642,7 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, - None, + compression_worker_mount, ) @@ -653,6 +657,9 @@ def start_query_worker( celery_route = f"{QueueName.QUERY}" query_worker_mount = [mounts.stream_output_dir] + if clp_config.archive_output.storage.type == StorageType.FS: + query_worker_mount.append(mounts.archives_output_dir) + generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -734,9 +741,6 @@ def generic_start_worker( mounts.clp_home, mounts.data_dir, mounts.logs_dir, - # TODO: need a way to remove this maybe, since reader doesn't need it if it - # is staged. 
one option is to move it to the worker_specific_mount - mounts.archives_output_dir, mounts.input_logs_dir, ] if worker_specific_mount: From 1afed1a1a35556dd3db35b44398145cba5559335 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:44:48 -0500 Subject: [PATCH 19/43] Linter --- .../clp-package-utils/clp_package_utils/scripts/start_clp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 7e33f9f55..e708b45ec 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -660,7 +660,6 @@ def start_query_worker( if clp_config.archive_output.storage.type == StorageType.FS: query_worker_mount.append(mounts.archives_output_dir) - generic_start_worker( QUERY_WORKER_COMPONENT_NAME, instance_id, From 1de661a2576bd8cde4f59d0437439c91c0253ec8 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:21:01 -0500 Subject: [PATCH 20/43] Remove unused functions --- components/clp-package-utils/clp_package_utils/general.py | 7 ------- components/clp-py-utils/clp_py_utils/s3_utils.py | 5 ----- 2 files changed, 12 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index e1ffee795..7cfa396f3 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -30,7 +30,6 @@ read_yaml_config_file, validate_path_could_be_dir, ) -from clp_py_utils.s3_utils import verify_s3_config_for_archive_output from strenum import KebabCaseStrEnum # CONSTANTS @@ -499,12 +498,6 @@ def validate_worker_config(clp_config: CLPConfig): clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() - storage_config = clp_config.archive_output.storage - if StorageType.S3 == storage_config.type: - result = verify_s3_config_for_archive_output(storage_config.s3_config) - if not result.success: - raise ValueError(f"S3 config verification failed: {result.error}") - def validate_webui_config( clp_config: CLPConfig, logs_dir: pathlib.Path, settings_json_path: pathlib.Path diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 0c4356407..8003beb8c 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -8,11 +8,6 @@ from clp_py_utils.result import Result -def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: - # TODO: properly verify the s3 config - return Result(success=True) - - def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 ) -> Result: From ce3de982fa5e98df5dcd87ea9a1629f7159281cb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:23:56 -0500 Subject: [PATCH 21/43] Update components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py --- .../job_orchestration/executor/compress/fs_compression_task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 
f96ba72f1..1057527e1 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -281,7 +281,6 @@ def run_clp( if not result.success: worker_output["error_message"] = result.error s3_write_failed = True - # Upon failure, skip updating the archive tags and job metadata. break # We've started a new archive so add the previous archive's last From f49664fcba7d046693a76b460d80173bdda83141 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:33:24 -0500 Subject: [PATCH 22/43] simplify worker config --- .../clp-package-utils/clp_package_utils/general.py | 7 ++++--- components/clp-py-utils/clp_py_utils/clp_config.py | 10 +++++----- .../executor/query/extract_stream_task.py | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 7cfa396f3..60f1053f8 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -272,12 +272,13 @@ def generate_container_config( def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: worker_config = WorkerConfig() - worker_config.archive_output = clp_config.archive_output.copy(deep=True) - worker_config.stream_output = clp_config.stream_output.copy(deep=True) worker_config.package = clp_config.package.copy(deep=True) - worker_config.results_cache = clp_config.results_cache.copy(deep=True) + worker_config.archive_output = clp_config.archive_output.copy(deep=True) worker_config.data_directory = clp_config.data_directory + worker_config.stream_output_dir = clp_config.stream_output.directory + worker_config.stream_collection_name = clp_config.results_cache.stream_collection_name + return worker_config diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 226d49d2d..cb2c07f18 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -638,16 +638,16 @@ def dump_to_primitive_dict(self): class WorkerConfig(BaseModel): package: Package = Package() archive_output: ArchiveOutput = ArchiveOutput() - data_directory: pathlib.Path = pathlib.Path("var") / "data" + data_directory: pathlib.Path = CLPConfig().data_directory - # Only needed by query worker. Consider inheriting at some point. - stream_output = StreamOutput() - results_cache: ResultsCache = ResultsCache() + # Only needed by query workers. 
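The FS/S3 split that the worker config carries around reduces to a Literal-tagged union that callers dispatch on; a rough standalone sketch of that shape, with simplified class and field names rather than the package's real models:

import pathlib
from typing import Literal, Union

from pydantic import BaseModel


class FsStorageSketch(BaseModel):
    type: Literal["fs"] = "fs"
    directory: pathlib.Path = pathlib.Path("var") / "data" / "archives"


class S3StorageSketch(BaseModel):
    type: Literal["s3"] = "s3"
    staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged-archives"
    bucket: str = ""
    key_prefix: str = ""


class ArchiveOutputSketch(BaseModel):
    storage: Union[FsStorageSketch, S3StorageSketch] = FsStorageSketch()

    def get_directory(self) -> pathlib.Path:
        # FS archives live in their final directory; S3 archives are staged
        # locally before being uploaded.
        if isinstance(self.storage, FsStorageSketch):
            return self.storage.directory
        return self.storage.staging_directory
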
+ stream_output_dir: pathlib.Path = StreamOutput().directory + stream_collection_name: str = ResultsCache().stream_collection_name def dump_to_primitive_dict(self): d = self.dict() d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() - d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) + d["stream_output_dir"] = str(self.stream_output_dir) return d diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index a1df2d5bc..bd3c720d3 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -30,8 +30,8 @@ def make_command( ) -> Optional[List[str]]: storage_engine = worker_config.package.storage_engine archives_dir = worker_config.archive_output.get_directory() - stream_output_dir = worker_config.stream_output.directory - stream_collection_name = worker_config.results_cache.stream_collection_name + stream_output_dir = worker_config.stream_output_dir + stream_collection_name = worker_config.stream_collection_name if StorageEngine.CLP == storage_engine: logger.info("Starting IR extraction") From 046cdcb2cf0c1ca8bdc633cfbaac31b538a5fdba Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:37:27 -0500 Subject: [PATCH 23/43] polishing --- components/clp-py-utils/clp_py_utils/clp_config.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index cb2c07f18..ed607c75c 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -433,6 +433,11 @@ def get_directory(self) -> pathlib.Path: return storage_config.directory return storage_config.staging_directory + def dump_to_primitive_dict(self): + d = self.dict() + # Turn directory (pathlib.Path) into a primitive string + d["storage"] = self.storage.dump_to_primitive_dict() + return d class StreamOutput(BaseModel): directory: pathlib.Path = pathlib.Path("var") / "data" / "streams" @@ -625,7 +630,7 @@ def load_redis_credentials_from_file(self): def dump_to_primitive_dict(self): d = self.dict() - d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() + d["archive_output"] = self.archive_output.dump_to_primitive_dict() d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["input_logs_directory"] = str(self.input_logs_directory) @@ -646,7 +651,7 @@ class WorkerConfig(BaseModel): def dump_to_primitive_dict(self): d = self.dict() - d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() + d["archive_output"] = self.archive_output.dump_to_primitive_dict() # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) d["stream_output_dir"] = str(self.stream_output_dir) From 242dec2d9676a9a87002166b74d0a51220497bb7 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Sat, 14 Dec 2024 12:42:09 -0500 Subject: [PATCH 24/43] linter --- components/clp-py-utils/clp_py_utils/clp_config.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index ed607c75c..2d9be98f0 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -439,6 +439,7 @@ def dump_to_primitive_dict(self): d["storage"] = self.storage.dump_to_primitive_dict() return d + class StreamOutput(BaseModel): directory: pathlib.Path = pathlib.Path("var") / "data" / "streams" target_uncompressed_size: int = 128 * 1024 * 1024 From ed280cb2a9b7a3d89f2e491c1418a33e467939fb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:33:07 -0500 Subject: [PATCH 25/43] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../clp_package_utils/scripts/start_clp.py | 2 -- .../clp-py-utils/clp_py_utils/clp_config.py | 15 +++++++++------ .../executor/compress/fs_compression_task.py | 13 ++++++------- .../executor/query/extract_stream_task.py | 2 +- .../executor/query/fs_search_task.py | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index e708b45ec..5f12659d8 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -628,9 +628,7 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" - compression_worker_mount = [mounts.archives_output_dir] - generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 2d9be98f0..79d2a9588 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -334,7 +334,7 @@ def validate_key_prefix(cls, field): if field == "": raise ValueError("key_prefix is not provided") if not field.endswith("/"): - raise ValueError(r'key_prefix must end with "/"') + raise ValueError('key_prefix must end with "/"') return field @@ -345,7 +345,7 @@ class FSStorage(BaseModel): @validator("directory") def validate_directory(cls, field): if "" == field: - raise ValueError("directory can not be empty") + raise ValueError("directory cannot be empty") return field def make_config_path_absolute(self, clp_home: pathlib.Path): @@ -365,7 +365,7 @@ class S3Storage(BaseModel): @validator("staging_directory") def validate_staging_directory(cls, field): if "" == field: - raise ValueError("staging_directory can not be empty") + raise ValueError("staging_directory cannot be empty") return field @validator("s3_config") @@ -420,7 +420,7 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def set_directory(self, directory: pathlib.Path) -> None: + def set_directory(self, directory: pathlib.Path): storage_config = self.storage if StorageType.FS == storage_config.type: storage_config.directory = directory @@ -552,12 +552,13 @@ def validate_archive_output_config(self): and StorageEngine.CLP_S != self.package.storage_engine ): raise ValueError( - f"S3 storage is only supported with storage engine: {StorageEngine.CLP_S}" + f"archive_output.storage.type = 's3' is only supported with package.storage_engine" + f" = 
'{StorageEngine.CLP_S}'" ) try: validate_path_could_be_dir(self.archive_output.get_directory()) except ValueError as ex: - raise ValueError(f"directory of storage config is invalid: {ex}") + raise ValueError(f"archive_output.storage's directory is invalid: {ex}") def validate_stream_output_dir(self): try: @@ -653,7 +654,9 @@ class WorkerConfig(BaseModel): def dump_to_primitive_dict(self): d = self.dict() d["archive_output"] = self.archive_output.dump_to_primitive_dict() + # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) d["stream_output_dir"] = str(self.stream_output_dir) + return d diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 1057527e1..d3a97223b 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -85,14 +85,14 @@ def upload_single_file_archive_to_s3( ) -> Result: archive_id = archive_stats["id"] - logger.info(f"Starting to upload archive {archive_id} to S3...") + logger.info(f"Uploading archive {archive_id} to S3...") src_file = archive_dir / archive_id result = s3_put(s3_config, src_file, archive_id) if not result.success: logger.error(f"Failed to upload archive {archive_id}: {result.error}") return result - logger.info(f"Finished uploading archive {archive_id} to S3...") + logger.info(f"Finished uploading archive {archive_id} to S3.") src_file.unlink() return Result(success=True) @@ -197,15 +197,15 @@ def run_clp( db_config_file.close() # Get s3 config - s3_config = None + s3_config: S3Config enable_s3_write = False s3_write_failed = False storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: - # This should be caught by start-clp and could be redundant, but let's be safe for now. if StorageEngine.CLP == clp_storage_engine: - logger.error(f"S3 is not supported for {clp_storage_engine}") - return False, {"error_message": f"S3 is not supported for {clp_storage_engine}"} + error_msg = f"S3 storage is not supported for the {clp_storage_engine} storage engine." 
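For context, uploading a single-file archive with boto3 looks roughly like the sketch below; this is not the package's s3_put helper, and the bucket, key prefix, region, and retry settings are placeholders (credentials are left to boto3's default lookup chain):

from pathlib import Path

import boto3
from botocore.config import Config
from botocore.exceptions import ClientError


def upload_archive_sketch(src_file: Path, bucket: str, key_prefix: str, region: str) -> None:
    s3_client = boto3.client(
        "s3",
        region_name=region,
        config=Config(retries={"total_max_attempts": 3, "mode": "adaptive"}),
    )
    try:
        with open(src_file, "rb") as f:
            # Streams the archive file as the object body under <key_prefix><file name>
            s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}{src_file.name}", Body=f)
    except ClientError as e:
        raise RuntimeError(f"S3 upload failed: {e.response['Error']['Code']}") from e
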
+ logger.error(error_msg) + return False, {"error_message": error_msg} s3_config = worker_config.archive_output.storage.s3_config enable_s3_write = True @@ -262,7 +262,6 @@ def run_clp( "total_uncompressed_size": 0, "total_compressed_size": 0, } - while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index bd3c720d3..7b88d796d 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -110,7 +110,7 @@ def extract_stream( ) if worker_config.archive_output.storage.type == StorageType.S3: - logger.error(f"Extraction is not supported for S3 storage type") + logger.error(f"Stream extraction is not supported for the S3 storage type") return report_task_failure( sql_adapter=sql_adapter, task_id=task_id, diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 6f0337b08..b4c2f269b 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -129,7 +129,7 @@ def search( ) if worker_config.archive_output.storage.type == StorageType.S3: - logger.error(f"Search is not supported for S3 storage type") + logger.error(f"Search is not supported for the S3 storage type") return report_task_failure( sql_adapter=sql_adapter, task_id=task_id, From 0788e5953d458fadf39a01ed1a1e251655f78052 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:40:58 -0500 Subject: [PATCH 26/43] Fix easier ones --- .../scripts/native/decompress.py | 2 +- .../scripts/native/del_archives.py | 2 +- .../clp_package_utils/scripts/start_clp.py | 14 +++--- .../clp-py-utils/clp_py_utils/clp_config.py | 47 ++++++++---------- .../clp-py-utils/clp_py_utils/s3_utils.py | 14 +++--- .../executor/compress/fs_compression_task.py | 48 ++++++++++--------- .../executor/query/extract_stream_task.py | 5 +- .../executor/query/fs_search_task.py | 5 +- .../job_orchestration/executor/utils.py | 10 ++-- .../compress/compression_scheduler.py | 4 +- 10 files changed, 74 insertions(+), 77 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index d16cdcb6f..f1183e10e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -207,7 +207,7 @@ def handle_extract_file_cmd( list_path = parsed_args.files_from logs_dir = clp_config.logs_directory - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() # Generate database config file for clp db_config_file_path = logs_dir / f".decompress-db-config-{uuid.uuid4()}.yml" diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py index 735bf299d..c489c3806 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py +++ 
b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py @@ -54,7 +54,7 @@ def main(argv): return -1 database_config = clp_config.database - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() if not archives_dir.exists(): logger.error("`archive_output.directory` doesn't exist.") return -1 diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 5f12659d8..f4da04090 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -628,7 +628,7 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" - compression_worker_mount = [mounts.archives_output_dir] + compression_worker_mounts = [mounts.archives_output_dir] generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, @@ -640,7 +640,7 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, - compression_worker_mount, + compression_worker_mounts, ) @@ -654,9 +654,9 @@ def start_query_worker( celery_method = "job_orchestration.executor.query" celery_route = f"{QueueName.QUERY}" - query_worker_mount = [mounts.stream_output_dir] + query_worker_mounts = [mounts.stream_output_dir] if clp_config.archive_output.storage.type == StorageType.FS: - query_worker_mount.append(mounts.archives_output_dir) + query_worker_mounts.append(mounts.archives_output_dir) generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -669,7 +669,7 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, - query_worker_mount, + query_worker_mounts, ) @@ -727,13 +727,13 @@ def generic_start_worker( f"{container_clp_config.redis.host}:{container_clp_config.redis.port}/{redis_database}" ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", - "-e", f"WORKER_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", + "-e", f"CLP_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-u", f"{os.getuid()}:{os.getgid()}", ] - # fmt: on + necessary_mounts = [ mounts.clp_home, mounts.data_dir, diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 79d2a9588..4d5ba7a47 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -310,35 +310,35 @@ class Queue(BaseModel): class S3Config(BaseModel): - region_name: str + region_code: str bucket: str key_prefix: str access_key_id: Optional[str] = None secret_access_key: Optional[str] = None - @validator("region_name") - def validate_region_name(cls, field): + @validator("region_code") + def validate_region_code(cls, field): if field == "": - raise ValueError("region_name is not provided") + raise ValueError("region_code can not be empty") return field @validator("bucket") def validate_bucket(cls, field): if field == "": - raise ValueError("bucket is not provided") + raise ValueError("bucket can not be empty") return field @validator("key_prefix") def validate_key_prefix(cls, field): if field == "": - raise ValueError("key_prefix is not provided") + raise ValueError("key_prefix can not be empty") if not field.endswith("/"): raise 
ValueError('key_prefix must end with "/"') return field -class FSStorage(BaseModel): +class FsStorage(BaseModel): type: Literal[StorageType.FS.value] = StorageType.FS.value directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" @@ -348,7 +348,7 @@ def validate_directory(cls, field): raise ValueError("directory cannot be empty") return field - def make_config_path_absolute(self, clp_home: pathlib.Path): + def make_config_paths_absolute(self, clp_home: pathlib.Path): self.directory = make_config_path_absolute(clp_home, self.directory) def dump_to_primitive_dict(self): @@ -368,13 +368,7 @@ def validate_staging_directory(cls, field): raise ValueError("staging_directory cannot be empty") return field - @validator("s3_config") - def validate_s3_config(cls, field): - if None == field: - raise ValueError("s3_config must be provided") - return field - - def make_config_path_absolute(self, clp_home: pathlib.Path): + def make_config_paths_absolute(self, clp_home: pathlib.Path): self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory) def dump_to_primitive_dict(self): @@ -384,18 +378,12 @@ def dump_to_primitive_dict(self): class ArchiveOutput(BaseModel): - storage: Union[FSStorage, S3Storage] = FSStorage() + storage: Union[FsStorage, S3Storage] = FsStorage() target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB target_segment_size: int = 256 * 1024 * 1024 # 256 MB - @validator("storage") - def validate_storage(cls, field) -> bool: - if None == field: - raise ValueError("storage must be provided") - return field - @validator("target_archive_size") def validate_target_archive_size(cls, field): if field <= 0: @@ -422,16 +410,23 @@ def validate_target_segment_size(cls, field): def set_directory(self, directory: pathlib.Path): storage_config = self.storage - if StorageType.FS == storage_config.type: + storage_type = storage_config.type + if StorageType.FS == storage_type: storage_config.directory = directory - else: + elif StorageType.S3 == storage_type: storage_config.staging_directory = directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") def get_directory(self) -> pathlib.Path: storage_config = self.storage + storage_type = storage_config.type if StorageType.FS == storage_config.type: return storage_config.directory - return storage_config.staging_directory + elif StorageType.S3 == storage_type: + return storage_config.staging_directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") def dump_to_primitive_dict(self): d = self.dict() @@ -531,7 +526,7 @@ class CLPConfig(BaseModel): def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) - self.archive_output.storage.make_config_path_absolute(clp_home) + self.archive_output.storage.make_config_paths_absolute(clp_home) self.stream_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 8003beb8c..3271cf985 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ 
b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -17,15 +17,15 @@ def s3_put( if not src_file.is_file(): return Result(success=False, error=f"{src_file} is not a file") - s3_client_args = { - "region_name": s3_config.region_name, - "aws_access_key_id": s3_config.access_key_id, - "aws_secret_access_key": s3_config.secret_access_key, - } - config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) - my_s3_client = boto3.client("s3", **s3_client_args, config=config) + my_s3_client = boto3.client( + "s3", + region_name=s3_config.region_code, + aws_access_key_id=s3_config.access_key_id, + aws_secret_access_key=s3_config.secret_access_key, + config=config, + ) with open(src_file, "rb") as file_data: try: diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index d3a97223b..3a16c33ed 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -79,21 +79,17 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi def upload_single_file_archive_to_s3( - archive_stats: Dict[str, Any], - archive_dir: pathlib.Path, + archive_id: str, + src_archive_file: pathlib.Path, s3_config: S3Config, ) -> Result: - archive_id = archive_stats["id"] - logger.info(f"Uploading archive {archive_id} to S3...") - src_file = archive_dir / archive_id - result = s3_put(s3_config, src_file, archive_id) + result = s3_put(s3_config, src_archive_file, archive_id) if not result.success: logger.error(f"Failed to upload archive {archive_id}: {result.error}") return result logger.info(f"Finished uploading archive {archive_id} to S3.") - src_file.unlink() return Result(success=True) @@ -199,7 +195,7 @@ def run_clp( # Get s3 config s3_config: S3Config enable_s3_write = False - s3_write_failed = False + storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: if StorageEngine.CLP == clp_storage_engine: @@ -258,10 +254,11 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None last_line_decoded = False - worker_output = { - "total_uncompressed_size": 0, - "total_compressed_size": 0, - } + total_uncompressed_size = 0 + total_compressed_size = 0 + + # Handle job metadata update and s3 write if enabled + s3_error_msg = None while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None @@ -274,18 +271,19 @@ def run_clp( None is stats or stats["id"] != last_archive_stats["id"] ): if enable_s3_write: - result = upload_single_file_archive_to_s3( - last_archive_stats, archive_output_dir, s3_config - ) + archive_id = last_archive_stats["id"] + src_archive_file = archive_output_dir / archive_id + + result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) if not result.success: - worker_output["error_message"] = result.error - s3_write_failed = True + s3_error_msg = result.error break + src_archive_file.unlink() # We've started a new archive so add the previous archive's last # reported size to the total - worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] - worker_output["total_compressed_size"] += last_archive_stats["size"] + total_uncompressed_size += last_archive_stats["uncompressed_size"] + total_compressed_size += last_archive_stats["size"] with 
closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: @@ -316,8 +314,14 @@ def run_clp( # Close stderr log file stderr_log_file.close() - if s3_write_failed: - logger.error(f"Failed to upload to S3.") + worker_output = { + "total_uncompressed_size": total_uncompressed_size, + "total_compressed_size": total_compressed_size, + } + + # TODO: think about how to deal with error messages + if s3_error_msg is not None: + worker_output["error_message"] = s3_error_msg return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output @@ -346,7 +350,7 @@ def compress( # Load configuration try: worker_config = WorkerConfig.parse_obj( - read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH"))) + read_yaml_config_file(pathlib.Path(os.getenv("CLP_CONFIG_PATH"))) ) except Exception as ex: error_msg = "Failed to load worker config" diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 7b88d796d..58ae43450 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -13,7 +13,7 @@ report_task_failure, run_query_task, ) -from job_orchestration.executor.utils import try_load_worker_config +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -101,7 +101,8 @@ def extract_stream( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) # Load configuration - worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) if worker_config is None: return report_task_failure( sql_adapter=sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index b4c2f269b..7cf7b330f 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -13,7 +13,7 @@ report_task_failure, run_query_task, ) -from job_orchestration.executor.utils import try_load_worker_config +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -120,7 +120,8 @@ def search( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) # Load configuration - worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) if worker_config is None: return report_task_failure( sql_adapter=sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index e00e7ca21..c9a0beaac 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ 
b/components/job-orchestration/job_orchestration/executor/utils.py @@ -6,16 +6,12 @@ from clp_py_utils.core import read_yaml_config_file -def try_load_worker_config( - config_path_str: str, +def load_worker_config( + config_path: Path, logger: Logger, ) -> Optional[WorkerConfig]: - if config_path_str is None: - logger.error("config_path_str can't be empty") - return None - try: - return WorkerConfig.parse_obj(read_yaml_config_file(Path(config_path_str))) + return WorkerConfig.parse_obj(read_yaml_config_file(config_path)) except Exception: logger.exception("Failed to load worker config") return None diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 2de3e835b..bd793686b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -59,7 +59,7 @@ def update_compression_task_metadata(db_cursor, task_id, kv): SET {", ".join(field_set_expressions)} WHERE id = %s """ - values = [v for v in kv.values()] + [task_id] + values = list(kv.values()) + [task_id] db_cursor.execute(query, values) @@ -74,7 +74,7 @@ def update_compression_job_metadata(db_cursor, job_id, kv): SET {", ".join(field_set_expressions)} WHERE id = %s """ - values = [v for v in kv.values()] + [job_id] + values = list(kv.values()) + [job_id] db_cursor.execute(query, values) From c198f27899b0deb73993a58548d959567b759284 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:49:00 -0500 Subject: [PATCH 27/43] Backup changes --- .../clp-py-utils/clp_py_utils/result.py | 10 --------- .../clp-py-utils/clp_py_utils/s3_utils.py | 14 ++++++------ components/clp-py-utils/pyproject.toml | 2 ++ .../executor/compress/fs_compression_task.py | 22 +++++++++++-------- components/job-orchestration/pyproject.toml | 1 - 5 files changed, 22 insertions(+), 27 deletions(-) delete mode 100644 components/clp-py-utils/clp_py_utils/result.py diff --git a/components/clp-py-utils/clp_py_utils/result.py b/components/clp-py-utils/clp_py_utils/result.py deleted file mode 100644 index 9c72cebf9..000000000 --- a/components/clp-py-utils/clp_py_utils/result.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import Optional - - -class Result: - def __init__(self, success: bool, error: Optional[str] = None): - self.success = success - self.error = error - - def __repr__(self): - return f"Result(success={self.success}, error={self.error})" diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 3271cf985..ed77d9a4a 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -5,17 +5,17 @@ from botocore.exceptions import ClientError from clp_py_utils.clp_config import S3Config -from clp_py_utils.result import Result +from result import Ok, Err, Result def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 -) -> Result: +) -> Result[bool, str]: if not src_file.exists(): - return Result(success=False, error=f"{src_file} doesn't exist") + return Err(f"{src_file} doesn't exist") if not src_file.is_file(): - return Result(success=False, error=f"{src_file} is not a file") + return Err(f"{src_file} is not a file") config = 
Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) @@ -35,8 +35,8 @@ def s3_put( except ClientError as e: error_code = e.response["Error"]["Code"] error_message = e.response["Error"]["Message"] - return Result(success=False, error=f"ClientError: {error_code} - {error_message}") + return Err(f"ClientError: {error_code} - {error_message}") except Exception as e: - return Result(success=False, error=f"An unexpected error occurred: {e}") + return Err(f"An unexpected error occurred: {e}") - return Result(success=True) + return Ok(True) diff --git a/components/clp-py-utils/pyproject.toml b/components/clp-py-utils/pyproject.toml index 4e827b926..34a97e7bb 100644 --- a/components/clp-py-utils/pyproject.toml +++ b/components/clp-py-utils/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.81" # mariadb version must be compatible with libmariadev installed in runtime env. # See https://mariadb.com/docs/server/connect/programming-languages/python/install/#Dependencies mariadb = "~1.0.11" @@ -19,6 +20,7 @@ python-dotenv = "^1.0.1" python-Levenshtein = "~0.22" sqlalchemy = "~2.0" PyYAML = "^6.0.1" +result = "~0.17.0" StrEnum = "^0.4.15" [build-system] diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 3a16c33ed..7c18ea287 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -2,6 +2,7 @@ import json import os import pathlib +from result import Result, is_err import subprocess from contextlib import closing from typing import Any, Dict, Optional @@ -20,7 +21,6 @@ ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.core import read_yaml_config_file -from clp_py_utils.result import Result from clp_py_utils.s3_utils import s3_put from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app @@ -82,15 +82,15 @@ def upload_single_file_archive_to_s3( archive_id: str, src_archive_file: pathlib.Path, s3_config: S3Config, -) -> Result: +) -> Result[bool, str]: logger.info(f"Uploading archive {archive_id} to S3...") result = s3_put(s3_config, src_archive_file, archive_id) - if not result.success: - logger.error(f"Failed to upload archive {archive_id}: {result.error}") + if result.is_err(): + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") return result logger.info(f"Finished uploading archive {archive_id} to S3.") - return Result(success=True) + return result def make_clp_command( @@ -257,8 +257,11 @@ def run_clp( total_uncompressed_size = 0 total_compressed_size = 0 - # Handle job metadata update and s3 write if enabled + # Variables to track s3 write status. 
s3_error_msg = None + s3_write_failed = False + + # Handle job metadata update and s3 write if enabled while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None @@ -275,8 +278,9 @@ def run_clp( src_archive_file = archive_output_dir / archive_id result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) - if not result.success: - s3_error_msg = result.error + if result.is_err(): + s3_write_failed = False + s3_error_msg = result.err_value break src_archive_file.unlink() @@ -320,7 +324,7 @@ def run_clp( } # TODO: think about how to deal with error messages - if s3_error_msg is not None: + if s3_write_failed: worker_output["error_message"] = s3_error_msg return CompressionTaskStatus.FAILED, worker_output if compression_successful: diff --git a/components/job-orchestration/pyproject.toml b/components/job-orchestration/pyproject.toml index 2db22cab0..c89d84dec 100644 --- a/components/job-orchestration/pyproject.toml +++ b/components/job-orchestration/pyproject.toml @@ -10,7 +10,6 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" -boto3 = "^1.35.76" Brotli = "^1.1.0" celery = "^5.3.6" # mariadb version must be compatible with libmariadev installed in runtime env. From 4819f7666ad037b7ae0606a524653ac03f92d70f Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:55:43 -0500 Subject: [PATCH 28/43] Small fixes --- .../clp-package-utils/clp_package_utils/scripts/start_clp.py | 1 - .../executor/compress/fs_compression_task.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index f4da04090..6de3174ff 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -715,7 +715,6 @@ def generic_start_worker( "-w", str(CONTAINER_CLP_HOME), "--name", container_name, "--log-driver", "local", - "-u", f"{os.getuid()}:{os.getgid()}", "-e", f"PYTHONPATH={clp_site_packages_dir}", "-e", ( f"BROKER_URL=amqp://" diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 7c18ea287..0678d05ea 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -2,7 +2,6 @@ import json import os import pathlib -from result import Result, is_err import subprocess from contextlib import closing from typing import Any, Dict, Optional @@ -27,6 +26,7 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult +from result import is_err, Result # Setup logging logger = get_task_logger(__name__) @@ -279,7 +279,7 @@ def run_clp( result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) if result.is_err(): - s3_write_failed = False + s3_write_failed = True s3_error_msg = result.err_value break src_archive_file.unlink() From e5f43fbf58ca5a42513287ce0988d0d318654a1d Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:04:39 
-0500 Subject: [PATCH 29/43] fixes --- .../clp-py-utils/clp_py_utils/s3_utils.py | 2 +- .../executor/compress/fs_compression_task.py | 34 ++++++------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index ed77d9a4a..cb98d24ab 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -3,9 +3,9 @@ import boto3 from botocore.config import Config from botocore.exceptions import ClientError +from result import Err, Ok, Result from clp_py_utils.clp_config import S3Config -from result import Ok, Err, Result def s3_put( diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 0678d05ea..f2b4a2268 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -78,21 +78,6 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ) -def upload_single_file_archive_to_s3( - archive_id: str, - src_archive_file: pathlib.Path, - s3_config: S3Config, -) -> Result[bool, str]: - logger.info(f"Uploading archive {archive_id} to S3...") - result = s3_put(s3_config, src_archive_file, archive_id) - if result.is_err(): - logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") - return result - - logger.info(f"Finished uploading archive {archive_id} to S3.") - return result - - def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -257,11 +242,8 @@ def run_clp( total_uncompressed_size = 0 total_compressed_size = 0 - # Variables to track s3 write status. 
- s3_error_msg = None - s3_write_failed = False - # Handle job metadata update and s3 write if enabled + s3_error = None while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None @@ -277,11 +259,15 @@ def run_clp( archive_id = last_archive_stats["id"] src_archive_file = archive_output_dir / archive_id - result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) + logger.info(f"Uploading archive {archive_id} to S3...") + result = s3_put(s3_config, src_archive_file, archive_id) + if result.is_err(): - s3_write_failed = True - s3_error_msg = result.err_value + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") + s3_error = result.err_value break + + logger.info(f"Finished uploading archive {archive_id} to S3.") src_archive_file.unlink() # We've started a new archive so add the previous archive's last @@ -324,8 +310,8 @@ def run_clp( } # TODO: think about how to deal with error messages - if s3_write_failed: - worker_output["error_message"] = s3_error_msg + if s3_error is not None: + worker_output["error_message"] = s3_error return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output From 12460629816c4ce070c430bfb6759b10bdfa9251 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 20:08:34 -0500 Subject: [PATCH 30/43] add safeguard for archive update failure --- .../executor/compress/fs_compression_task.py | 65 ++++++++++--------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index f2b4a2268..4fc949ab9 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -259,32 +259,37 @@ def run_clp( archive_id = last_archive_stats["id"] src_archive_file = archive_output_dir / archive_id - logger.info(f"Uploading archive {archive_id} to S3...") - result = s3_put(s3_config, src_archive_file, archive_id) + if s3_error is None: + logger.info(f"Uploading archive {archive_id} to S3...") + result = s3_put(s3_config, src_archive_file, archive_id) + + if result.is_err(): + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") + s3_error = result.err_value + # Note: it's possible proc finishes before we call terminate() on it. In + # Which case the process will still return success. 
+ proc.terminate() + else: + logger.info(f"Finished uploading archive {archive_id} to S3.") - if result.is_err(): - logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") - s3_error = result.err_value - break - - logger.info(f"Finished uploading archive {archive_id} to S3.") src_archive_file.unlink() - # We've started a new archive so add the previous archive's last - # reported size to the total - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() + if s3_error is None: + # We've started a new archive so add the previous archive's last + # reported size to the total + total_uncompressed_size += last_archive_stats["uncompressed_size"] + total_compressed_size += last_archive_stats["size"] + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + update_job_metadata_and_tags( + db_cursor, + job_id, + clp_metadata_db_connection_config["table_prefix"], + tag_ids, + last_archive_stats, + ) + db_conn.commit() last_archive_stats = stats @@ -309,14 +314,16 @@ def run_clp( "total_compressed_size": total_compressed_size, } - # TODO: think about how to deal with error messages - if s3_error is not None: - worker_output["error_message"] = s3_error - return CompressionTaskStatus.FAILED, worker_output - if compression_successful: + if compression_successful and s3_error is None: return CompressionTaskStatus.SUCCEEDED, worker_output else: - worker_output["error_message"] = f"See logs {stderr_log_path}" + worker_output["error_message"] = "" + if compression_successful is False: + worker_output["error_message"] += f"See logs {stderr_log_path}" + if s3_error is not None: + if worker_output["error_message"]: + worker_output["error_message"] += "\n" + worker_output["error_message"] += s3_error return CompressionTaskStatus.FAILED, worker_output From 3b870a44438a191b02dbe5bb832a51d67a430718 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 20:24:32 -0500 Subject: [PATCH 31/43] Add docstrings --- components/clp-py-utils/clp_py_utils/s3_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index cb98d24ab..ddf386268 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -11,11 +11,21 @@ def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 ) -> Result[bool, str]: - + """ + Upload a local file to S3 bucket using AWS's PutObject operation. + :param s3_config: S3 configuration specifies the upload destination and credentials. + :param src_file: local file to upload. + :param dest_file_name: The name for uploaded file in the S3 bucket. + :param total_max_attempts: Maximum number of retry attempts for the upload. Defaults to 3. + :return: Result.OK(bool) on success. + Result.Err(str) with the error message if put operation fails. 
+ """ if not src_file.exists(): return Err(f"{src_file} doesn't exist") if not src_file.is_file(): return Err(f"{src_file} is not a file") + if src_file.stat().st_size > 5 * 1024 * 1024 * 1024: + return Err("File size exceeds 5GB limit for single put_object operation") config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) From 214ae3f94523f5e4cfc842c3308c140053afc4fd Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:06:57 -0500 Subject: [PATCH 32/43] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- components/clp-py-utils/clp_py_utils/s3_utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index ddf386268..03717a445 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -12,20 +12,19 @@ def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 ) -> Result[bool, str]: """ - Upload a local file to S3 bucket using AWS's PutObject operation. - :param s3_config: S3 configuration specifies the upload destination and credentials. - :param src_file: local file to upload. - :param dest_file_name: The name for uploaded file in the S3 bucket. - :param total_max_attempts: Maximum number of retry attempts for the upload. Defaults to 3. - :return: Result.OK(bool) on success. - Result.Err(str) with the error message if put operation fails. + Uploads a local file to an S3 bucket using AWS's PutObject operation. + :param s3_config: S3 configuration specifying the upload destination and credentials. + :param src_file: Local file to upload. + :param dest_file_name: The name for the uploaded file in the S3 bucket. + :param total_max_attempts: Maximum number of retry attempts for the upload. + :return: Result.OK(bool) on success, or Result.Err(str) with the error message otherwise. 
""" if not src_file.exists(): return Err(f"{src_file} doesn't exist") if not src_file.is_file(): return Err(f"{src_file} is not a file") if src_file.stat().st_size > 5 * 1024 * 1024 * 1024: - return Err("File size exceeds 5GB limit for single put_object operation") + return Err(f"{src_file} is larger than the limit (5GiB) for a single PutObject operation.") config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) From 6ff92fc312476a0a12f2e48b1a7e3449aaca7a4e Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:07:34 -0500 Subject: [PATCH 33/43] Clean up --- components/clp-py-utils/clp_py_utils/clp_config.py | 8 ++++---- .../executor/compress/fs_compression_task.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 4d5ba7a47..f59de7647 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -320,19 +320,19 @@ class S3Config(BaseModel): @validator("region_code") def validate_region_code(cls, field): if field == "": - raise ValueError("region_code can not be empty") + raise ValueError("region_code cannot be empty") return field @validator("bucket") def validate_bucket(cls, field): if field == "": - raise ValueError("bucket can not be empty") + raise ValueError("bucket cannot be empty") return field @validator("key_prefix") def validate_key_prefix(cls, field): if field == "": - raise ValueError("key_prefix can not be empty") + raise ValueError("key_prefix cannot be empty") if not field.endswith("/"): raise ValueError('key_prefix must end with "/"') return field @@ -442,7 +442,7 @@ class StreamOutput(BaseModel): @validator("directory") def validate_directory(cls, field): if "" == field: - raise ValueError("directory can not be empty") + raise ValueError("directory cannot be empty") return field @validator("target_uncompressed_size") diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 4fc949ab9..57b3ca3d5 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -26,7 +26,6 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult -from result import is_err, Result # Setup logging logger = get_task_logger(__name__) From 9e07d37e355551f0c20596442217e7cc7590faba Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:08:06 -0500 Subject: [PATCH 34/43] update pyproject.toml --- components/clp-py-utils/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/clp-py-utils/pyproject.toml b/components/clp-py-utils/pyproject.toml index 34a97e7bb..6d68ceebe 100644 --- a/components/clp-py-utils/pyproject.toml +++ b/components/clp-py-utils/pyproject.toml @@ -20,7 +20,7 @@ python-dotenv = "^1.0.1" python-Levenshtein = "~0.22" sqlalchemy = "~2.0" PyYAML = "^6.0.1" -result = "~0.17.0" +result = "^0.17.0" StrEnum = "^0.4.15" [build-system] From 915b49d02610ea94a76e62633f2b7564e2622a29 Mon Sep 17 
00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:21:27 -0500 Subject: [PATCH 35/43] Add docstrings --- .../job-orchestration/job_orchestration/executor/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index c9a0beaac..e558cc704 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -10,6 +10,12 @@ def load_worker_config( config_path: Path, logger: Logger, ) -> Optional[WorkerConfig]: + """ + Loads a WorkerConfig object from the specified configuration file. + :param config_path: Path to the configuration file. + :param logger: Logger instance for reporting error if loading fails. + :return: The loaded WorkerConfig object on success, None otherwise. + """ try: return WorkerConfig.parse_obj(read_yaml_config_file(config_path)) except Exception: From a061a29abbfc7958c326e2d9df1c15564ebb3a5f Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:20:13 -0500 Subject: [PATCH 36/43] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../executor/compress/fs_compression_task.py | 18 ++++++++---------- .../job_orchestration/executor/utils.py | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 57b3ca3d5..4e0f70d19 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -179,7 +179,6 @@ def run_clp( # Get s3 config s3_config: S3Config enable_s3_write = False - storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: if StorageEngine.CLP == clp_storage_engine: @@ -265,8 +264,8 @@ def run_clp( if result.is_err(): logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") s3_error = result.err_value - # Note: it's possible proc finishes before we call terminate() on it. In - # Which case the process will still return success. + # NOTE: It's possible `proc` finishes before we call `terminate` on it, in + # which case the process will still return success. 
proc.terminate() else: logger.info(f"Finished uploading archive {archive_id} to S3.") @@ -274,8 +273,8 @@ def run_clp( src_archive_file.unlink() if s3_error is None: - # We've started a new archive so add the previous archive's last - # reported size to the total + # We've started a new archive so add the previous archive's last reported size to + # the total total_uncompressed_size += last_archive_stats["uncompressed_size"] total_compressed_size += last_archive_stats["size"] with closing(sql_adapter.create_connection(True)) as db_conn, closing( @@ -316,13 +315,12 @@ def run_clp( if compression_successful and s3_error is None: return CompressionTaskStatus.SUCCEEDED, worker_output else: - worker_output["error_message"] = "" + error_msgs = [] if compression_successful is False: - worker_output["error_message"] += f"See logs {stderr_log_path}" + error_msgs.append(f"See logs {stderr_log_path}") if s3_error is not None: - if worker_output["error_message"]: - worker_output["error_message"] += "\n" - worker_output["error_message"] += s3_error + error_msgs.append(s3_error) + worker_output["error_message"] = "\n".join(error_msgs) return CompressionTaskStatus.FAILED, worker_output diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index e558cc704..47ea702ae 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -13,7 +13,7 @@ def load_worker_config( """ Loads a WorkerConfig object from the specified configuration file. :param config_path: Path to the configuration file. - :param logger: Logger instance for reporting error if loading fails. + :param logger: Logger instance for reporting errors if loading fails. :return: The loaded WorkerConfig object on success, None otherwise. 
""" try: From 83017480290c676a319eb5112389c9c2996ea28b Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:24:11 -0500 Subject: [PATCH 37/43] Update name as suggested by the code review --- .../executor/compress/fs_compression_task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 4e0f70d19..d140d3777 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -255,11 +255,11 @@ def run_clp( ): if enable_s3_write: archive_id = last_archive_stats["id"] - src_archive_file = archive_output_dir / archive_id + archive_path = archive_output_dir / archive_id if s3_error is None: logger.info(f"Uploading archive {archive_id} to S3...") - result = s3_put(s3_config, src_archive_file, archive_id) + result = s3_put(s3_config, archive_path, archive_id) if result.is_err(): logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") @@ -270,7 +270,7 @@ def run_clp( else: logger.info(f"Finished uploading archive {archive_id} to S3.") - src_archive_file.unlink() + archive_path.unlink() if s3_error is None: # We've started a new archive so add the previous archive's last reported size to From 2ada4646f7db277d3adab585f687a394c82e62c8 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:54:55 -0500 Subject: [PATCH 38/43] a few small fixes to ensure other scripts still work --- .../clp_package_utils/scripts/decompress.py | 12 +++++++++++- .../clp_package_utils/scripts/del_archives.py | 7 +++++++ .../clp_package_utils/scripts/native/decompress.py | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 325f2add6..9fdbeee34 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -5,7 +5,7 @@ import sys from typing import Optional -from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_config import CLPConfig, StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -81,6 +81,11 @@ def handle_extract_file_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"File extraction is not supported for storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( @@ -156,6 +161,11 @@ def handle_extract_stream_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Stream extraction is not supported for storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.IR_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, 
generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py index 54d959771..5b9bc6d97 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py @@ -4,6 +4,8 @@ import sys from pathlib import Path +from clp_py_utils.clp_config import StorageType + from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, dump_container_config, @@ -57,6 +59,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Archive deletion is not supported for storage type: {storage_type}.") + return -1 + # Validate the input timestamp begin_ts = parsed_args.begin_ts end_ts = parsed_args.end_ts diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index f1183e10e..7e3c7da6e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -167,7 +167,7 @@ def validate_and_load_config_file( """ try: clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_logs_dir() return clp_config except Exception: From 6e5aad5ed4164528fb3e6862539b294fdc21471a Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:52:11 -0500 Subject: [PATCH 39/43] adding safeguard for empty stdout line from clp. 
--- .../executor/compress/fs_compression_task.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index d140d3777..fdd05263b 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -245,7 +245,11 @@ def run_clp( while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None - if line: + if "" == line: + # Skip empty lines that could be caused by potential errors in printing archive stats + continue + + if line is not None: stats = json.loads(line.decode("ascii")) else: last_line_decoded = True From 55c0f36a0ba0d794234016bfbc04a51060388696 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:23:54 -0500 Subject: [PATCH 40/43] Add safeguard for search --- .../clp-package-utils/clp_package_utils/scripts/search.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index beb7fb0b0..af3693b16 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -7,6 +7,7 @@ import uuid import yaml +from clp_py_utils.clp_config import StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -74,6 +75,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Search is not supported for storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.SEARCH)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) From 2d7443e72ce0e29c110813a62f14ddc6f47d8cda Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:36:42 -0500 Subject: [PATCH 41/43] Polish error messages. --- .../clp_package_utils/scripts/decompress.py | 6 ++++-- .../clp-package-utils/clp_package_utils/scripts/search.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 9fdbeee34..092c339a6 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -83,7 +83,7 @@ def handle_extract_file_cmd( storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error(f"File extraction is not supported for storage type: {storage_type}.") + logger.error(f"File extraction is not supported for archive storage type: {storage_type}.") return -1 container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) @@ -163,7 +163,9 @@ def handle_extract_stream_cmd( storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error(f"Stream extraction is not supported for storage type: {storage_type}.") + logger.error( + f"Stream extraction is not supported for archive storage type: {storage_type}."
+ ) return -1 container_name = generate_container_name(str(JobType.IR_EXTRACTION)) diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index af3693b16..ee50ecf02 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -77,7 +77,9 @@ def main(argv): storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error(f"Search is not supported for archive storage type: {storage_type}.") + logger.error( + f"Search is not supported for archive storage type: {storage_type}." + ) return -1 container_name = generate_container_name(str(JobType.SEARCH)) From 6f907b22a388de32a5a56b195a86c67aacada355 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:51:30 -0500 Subject: [PATCH 42/43] Linter --- .../clp-package-utils/clp_package_utils/scripts/search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index ee50ecf02..38d528528 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -77,9 +77,7 @@ def main(argv): storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error( - f"Search is not supported for archive storage type: {storage_type}." - ) + logger.error(f"Search is not supported for archive storage type: {storage_type}.") return -1 container_name = generate_container_name(str(JobType.SEARCH)) From 120ffec2ea3f5611b322521b1b092953ea46f732 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:12:16 -0500 Subject: [PATCH 43/43] Slightly improve the error message --- .../executor/compress/fs_compression_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index fdd05263b..a5dbc0e35 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -181,8 +181,8 @@ def run_clp( enable_s3_write = False storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: - if StorageEngine.CLP == clp_storage_engine: - error_msg = f"S3 storage is not supported for the {clp_storage_engine} storage engine." + if StorageEngine.CLP_S != clp_storage_engine: + error_msg = f"S3 storage is not supported for storage engine: {clp_storage_engine}." logger.error(error_msg) return False, {"error_message": error_msg}