From 2778084cf3388779115ec079c2436fa78ad1efb9 Mon Sep 17 00:00:00 2001 From: ThejasNU Date: Wed, 12 Feb 2025 15:33:44 +0530 Subject: [PATCH 1/4] add model download --- libs/agentc/agentc/auditor.py | 3 ++- libs/agentc_cli/agentc_cli/cmds/init.py | 5 +++++ libs/agentc_cli/agentc_cli/cmds/util.py | 19 +++++++++++++++++++ libs/agentc_cli/agentc_cli/main.py | 13 +++++++++---- .../agentc_core/learned/embedding.py | 17 +++++++++++------ 5 files changed, 46 insertions(+), 11 deletions(-) diff --git a/libs/agentc/agentc/auditor.py b/libs/agentc/agentc/auditor.py index 7015aa5..68ee493 100644 --- a/libs/agentc/agentc/auditor.py +++ b/libs/agentc/agentc/auditor.py @@ -149,7 +149,8 @@ def _initialize_auditor(self) -> typing.Self: if self.auditor_output is None and self.conn_string is None: error_message = textwrap.dedent(""" Could not initialize a local or remote auditor! - If this is a new project, please run the command `agentc index` before instantiating an auditor. + If this is a new project, please run the command `agentc init activity` before instantiating an auditor. + Execute `agentc init --help` for more information. If you are intending to use a remote-only auditor, please ensure that all of the relevant variables (i.e., conn_string, username, password, and bucket) are set. """) diff --git a/libs/agentc_cli/agentc_cli/cmds/init.py b/libs/agentc_cli/agentc_cli/cmds/init.py index 5c1188d..bd24f4d 100644 --- a/libs/agentc_cli/agentc_cli/cmds/init.py +++ b/libs/agentc_cli/agentc_cli/cmds/init.py @@ -5,6 +5,7 @@ from .util import init_db_catalog from .util import init_local_activity from .util import init_local_catalog +from .util import init_local_embedding_model from agentc_core.util.models import CouchbaseConnect from agentc_core.util.models import Keyspace from agentc_core.util.publish import get_connection @@ -25,6 +26,10 @@ def cmd_init( initialize_db = "db" in catalog_type initialize_catalog = "catalog" in type_metadata initialize_auditor = "auditor" in type_metadata + initialize_model = "model" in type_metadata + + if initialize_model: + init_local_embedding_model() if initialize_local: if initialize_catalog: diff --git a/libs/agentc_cli/agentc_cli/cmds/util.py b/libs/agentc_cli/agentc_cli/cmds/util.py index dde9564..71f7d9d 100644 --- a/libs/agentc_cli/agentc_cli/cmds/util.py +++ b/libs/agentc_cli/agentc_cli/cmds/util.py @@ -23,8 +23,10 @@ from agentc_core.defaults import DEFAULT_AUDIT_SCOPE from agentc_core.defaults import DEFAULT_CATALOG_COLLECTION_NAME from agentc_core.defaults import DEFAULT_CATALOG_NAME +from agentc_core.defaults import DEFAULT_EMBEDDING_MODEL from agentc_core.defaults import DEFAULT_MAX_ERRS from agentc_core.defaults import DEFAULT_META_COLLECTION_NAME +from agentc_core.defaults import DEFAULT_MODEL_CACHE_FOLDER from agentc_core.defaults import DEFAULT_SCAN_DIRECTORY_OPTS from agentc_core.learned.embedding import EmbeddingModel from agentc_core.util.ddl import create_gsi_indexes @@ -273,3 +275,20 @@ def init_db_auditor(ctx: Context, cluster: Cluster, keyspace_details: Keyspace): except CouchbaseException as e: click.secho("Analytics views could not be created.", fg="red") logger.warning("Analytics views could not be created: %s", e) + + +def init_local_embedding_model(): + # import only in this function to avoid large import times + import sentence_transformers + + try: + sentence_transformers.SentenceTransformer( + os.getenv("AGENT_CATALOG_EMBEDDING_MODEL_NAME", DEFAULT_EMBEDDING_MODEL), + tokenizer_kwargs={"clean_up_tokenization_spaces": True}, + cache_folder=DEFAULT_MODEL_CACHE_FOLDER, + local_files_only=False, + ) + except Exception as e: + raise RuntimeError( + f"Unable to download model {os.getenv("AGENT_CATALOG_EMBEDDING_MODEL_NAME", DEFAULT_EMBEDDING_MODEL)}!!\n{e}" + ) from None diff --git a/libs/agentc_cli/agentc_cli/main.py b/libs/agentc_cli/agentc_cli/main.py index c0f603a..a896bb4 100644 --- a/libs/agentc_cli/agentc_cli/main.py +++ b/libs/agentc_cli/agentc_cli/main.py @@ -131,7 +131,7 @@ def click_main(ctx, catalog, activity, verbose, interactive): ) @click.argument( "type_metadata", - type=click.Choice(["catalog", "auditor", "all"], case_sensitive=False), + type=click.Choice(["model", "catalog", "auditor", "all"], case_sensitive=False), ) @click.option( "--bucket", @@ -141,18 +141,23 @@ def click_main(ctx, catalog, activity, verbose, interactive): show_default=False, ) def init(ctx, catalog_type, type_metadata, bucket): - """Initialize the necessary files/collections for local/database catalog.""" + """Initialize the necessary files/collections for local/database catalog or download sentence-transformer model required for embedding.""" ctx_obj: Context = ctx.obj if not catalog_type: catalog_type = ["local", "db"] - type_metadata = ["catalog", "auditor"] if type_metadata == "all" else [type_metadata] + type_metadata = ["model", "catalog", "auditor"] if type_metadata == "all" else [type_metadata] connection_details_env = None keyspace_details = None if "db" in catalog_type: + if "model" in type_metadata: + raise ValueError( + "Model initialization can be used only with local keyword, db is not supported!!\nPlease execute separately 'agentc init local model' to download the model." + ) + # Load all Couchbase connection related data from env connection_details_env = CouchbaseConnect( connection_url=os.getenv("AGENT_CATALOG_CONN_STRING"), @@ -529,7 +534,7 @@ def find( @click.option( "-em", "--embedding-model-name", - default=DEFAULT_EMBEDDING_MODEL, + default=os.getenv("AGENT_CATALOG_EMBEDDING_MODEL_NAME", DEFAULT_EMBEDDING_MODEL), help="Name of the embedding model used when indexing source files into the local catalog.", show_default=True, ) diff --git a/libs/agentc_core/agentc_core/learned/embedding.py b/libs/agentc_core/agentc_core/learned/embedding.py index 0e49de4..34c8b11 100644 --- a/libs/agentc_core/agentc_core/learned/embedding.py +++ b/libs/agentc_core/agentc_core/learned/embedding.py @@ -182,12 +182,17 @@ def _encode(_text: str) -> list[float]: else: import sentence_transformers - sentence_transformers_model = sentence_transformers.SentenceTransformer( - self.embedding_model_name, - tokenizer_kwargs={"clean_up_tokenization_spaces": True}, - cache_folder=DEFAULT_MODEL_CACHE_FOLDER, - local_files_only=False, - ) + try: + sentence_transformers_model = sentence_transformers.SentenceTransformer( + self.embedding_model_name, + tokenizer_kwargs={"clean_up_tokenization_spaces": True}, + cache_folder=DEFAULT_MODEL_CACHE_FOLDER, + local_files_only=True, + ) + except OSError: + raise ValueError( + f"Unable to find local embedding model {self.embedding_model_name}!!!\nPlease execute 'agentc init local model' to download the model." + ) from None def _encode(_text: str) -> list[float]: return sentence_transformers_model.encode(_text, convert_to_tensor=False).tolist() From 5a51d6f9935a6337588913a25df218a2abdbbbb0 Mon Sep 17 00:00:00 2001 From: ThejasNU Date: Wed, 12 Feb 2025 15:44:22 +0530 Subject: [PATCH 2/4] fix core tests --- .../tests/embedding/test_embedding_local.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/libs/agentc_core/tests/embedding/test_embedding_local.py b/libs/agentc_core/tests/embedding/test_embedding_local.py index c1848da..1a9204a 100644 --- a/libs/agentc_core/tests/embedding/test_embedding_local.py +++ b/libs/agentc_core/tests/embedding/test_embedding_local.py @@ -1,11 +1,19 @@ import pytest +import sentence_transformers from agentc_core.defaults import DEFAULT_EMBEDDING_MODEL +from agentc_core.defaults import DEFAULT_MODEL_CACHE_FOLDER from agentc_core.learned.embedding import EmbeddingModel @pytest.mark.smoke def test_embedding_local_default(): + # download the model + sentence_transformers.SentenceTransformer( + DEFAULT_EMBEDDING_MODEL, cache_folder=DEFAULT_MODEL_CACHE_FOLDER, local_files_only=False + ) + + # execute the model embedding_model = EmbeddingModel( embedding_model_name=DEFAULT_EMBEDDING_MODEL, ) @@ -16,6 +24,12 @@ def test_embedding_local_default(): @pytest.mark.smoke def test_embedding_local_pretrained(): + # download the model + sentence_transformers.SentenceTransformer( + "paraphrase-albert-small-v2", cache_folder=DEFAULT_MODEL_CACHE_FOLDER, local_files_only=False + ) + + # execute the model embedding_model = EmbeddingModel( embedding_model_name="paraphrase-albert-small-v2", ) From d598f233d09fa9d9c48a916807de55a39fa366ca Mon Sep 17 00:00:00 2001 From: ThejasNU Date: Wed, 12 Feb 2025 15:55:34 +0530 Subject: [PATCH 3/4] fix cli tests --- libs/agentc_cli/tests/test_click.py | 23 +++++++++++++++++++--- libs/agentc_testing/agentc_testing/repo.py | 1 + 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/libs/agentc_cli/tests/test_click.py b/libs/agentc_cli/tests/test_click.py index 4b9447f..90c43b2 100644 --- a/libs/agentc_cli/tests/test_click.py +++ b/libs/agentc_cli/tests/test_click.py @@ -49,6 +49,7 @@ def test_index(tmp_path): pathlib.Path(tool_folder / tool.parent.name).mkdir(exist_ok=True) shutil.copy(tool, tool_folder / tool.parent.name / (uuid.uuid4().hex + tool.suffix)) shutil.copy(resources_folder / "_good_spec.json", tool_folder / "_good_spec.json") + runner.invoke(click_main, ["init", "local", "model"]) invocation = runner.invoke(click_main, ["index", str(tool_folder.absolute()), "--no-prompts"]) # We should see 11 files scanned and 12 tools indexed. @@ -489,11 +490,25 @@ def test_init_local(tmp_path): runner.invoke(click_main, ["init", "local", "catalog"]) files_present = os.listdir() - assert ".agent-catalog" in files_present and ".agent-activity" not in files_present + assert ( + ".agent-catalog" in files_present + and ".agent-activity" not in files_present + and ".model-cache" not in files_present + ) runner.invoke(click_main, ["init", "local", "auditor"]) files_present = os.listdir() - assert ".agent-catalog" in files_present and ".agent-activity" in files_present + assert ( + ".agent-catalog" in files_present + and ".agent-activity" in files_present + and ".model-cache" not in files_present + ) + + runner.invoke(click_main, ["init", "local", "model"]) + files_present = os.listdir() + assert ( + ".agent-catalog" in files_present and ".agent-activity" in files_present and ".model-cache" in files_present + ) @pytest.mark.smoke @@ -505,4 +520,6 @@ def test_init_local_all(tmp_path): runner.invoke(click_main, ["init", "local", "all"]) files_present = os.listdir() - assert ".agent-catalog" in files_present and ".agent-activity" in files_present + assert ( + ".agent-catalog" in files_present and ".agent-activity" in files_present and ".model-cache" in files_present + ) diff --git a/libs/agentc_testing/agentc_testing/repo.py b/libs/agentc_testing/agentc_testing/repo.py index c56d48c..0c0875f 100644 --- a/libs/agentc_testing/agentc_testing/repo.py +++ b/libs/agentc_testing/agentc_testing/repo.py @@ -75,6 +75,7 @@ def initialize_repo( # Initialize the local catalog. click_runner.invoke(click_command, ["init", "local", "catalog"]) + click_runner.invoke(click_command, ["init", "local", "model"]) # If we are not using the index command, we can return early... if repo_kind == ExampleRepoKind.EMPTY or repo_kind == ExampleRepoKind.NON_INDEXED_ALL_TRAVEL: From 8e18df6f5bf2f6a5383bb15071ac425f9d2a66ef Mon Sep 17 00:00:00 2001 From: ThejasNU Date: Wed, 12 Feb 2025 16:23:06 +0530 Subject: [PATCH 4/4] add query udfs creation --- libs/agentc_cli/agentc_cli/cmds/util.py | 17 ++++++++++++---- .../agentc_core/analytics/create.py | 20 ++++++++++++++++++- .../analytics/udfs/all_sessions.sqlpp | 10 +++++----- .../analytics/udfs/exchanges.sqlpp | 17 +++++++++------- .../analytics/udfs/last_session.sqlpp | 2 +- .../analytics/udfs/tool_calls.sqlpp | 8 +++++--- .../analytics/udfs/trajectories.sqlpp | 2 +- 7 files changed, 54 insertions(+), 22 deletions(-) diff --git a/libs/agentc_cli/agentc_cli/cmds/util.py b/libs/agentc_cli/agentc_cli/cmds/util.py index 71f7d9d..4f85178 100644 --- a/libs/agentc_cli/agentc_cli/cmds/util.py +++ b/libs/agentc_cli/agentc_cli/cmds/util.py @@ -10,7 +10,8 @@ import typing from ..models.context import Context -from agentc_core.analytics.create import create_analytics_udfs +from agentc_core.analytics.create import create_analytics_views +from agentc_core.analytics.create import create_query_udfs from agentc_core.catalog import CatalogChain from agentc_core.catalog import CatalogDB from agentc_core.catalog import CatalogMem @@ -268,10 +269,18 @@ def init_db_auditor(ctx: Context, cluster: Cluster, keyspace_details: Keyspace): else: click.secho("Scope and collection for the auditor have been successfully created!\n", fg="green") - click.secho("Now creating the analytics UDFs for the auditor.", fg="yellow") + click.secho("Now creating query UDFs for the auditor.", fg="yellow") try: - create_analytics_udfs(cluster, keyspace_details.bucket) - click.secho("All analytics UDFs for the auditor have been successfully created!\n", fg="green") + create_query_udfs(cluster, keyspace_details.bucket) + click.secho("All query UDFs for the auditor have been successfully created!\n", fg="green") + except CouchbaseException as e: + click.secho("Query UDFs could not be created.", fg="red") + logger.warning("Query UDFs could not be created: %s", e) + + click.secho("Now creating the analytics views for the auditor.", fg="yellow") + try: + create_analytics_views(cluster, keyspace_details.bucket) + click.secho("All analytics views for the auditor have been successfully created!\n", fg="green") except CouchbaseException as e: click.secho("Analytics views could not be created.", fg="red") logger.warning("Analytics views could not be created: %s", e) diff --git a/libs/agentc_core/agentc_core/analytics/create.py b/libs/agentc_core/agentc_core/analytics/create.py index 4b7f8b5..6fb0eb3 100644 --- a/libs/agentc_core/agentc_core/analytics/create.py +++ b/libs/agentc_core/agentc_core/analytics/create.py @@ -28,7 +28,7 @@ # pass -def create_analytics_udfs(cluster: couchbase.cluster.Cluster, bucket: str) -> None: +def create_analytics_views(cluster: couchbase.cluster.Cluster, bucket: str) -> None: logger.debug("Creating analytics log scope.") ddl_result = cluster.analytics_query(f""" CREATE ANALYTICS SCOPE `{bucket}`.`{DEFAULT_AUDIT_SCOPE}` @@ -63,3 +63,21 @@ def create_analytics_udfs(cluster: couchbase.cluster.Cluster, bucket: str) -> No ddl_result = cluster.analytics_query(ddl_string) for _ in ddl_result.rows(): pass + + +def create_query_udfs(cluster: couchbase.cluster.Cluster, bucket: str) -> None: + udfs_folder = pathlib.Path(__file__).parent / "udfs" + udfs_files = sorted(file for file in udfs_folder.iterdir()) + for udf_file in udfs_files: + with open(udf_file, "r") as fp: + raw_udf_string = fp.read() + udf_string = ( + raw_udf_string.replace("[BUCKET_NAME]", bucket) + .replace("[SCOPE_NAME]", DEFAULT_AUDIT_SCOPE) + .replace("[LOG_COLLECTION_NAME]", DEFAULT_AUDIT_COLLECTION) + ) + logger.debug(f"Issuing the following statement: {udf_string}") + + ddl_result = cluster.query(udf_string) + for _ in ddl_result.rows(): + pass diff --git a/libs/agentc_core/agentc_core/analytics/udfs/all_sessions.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/all_sessions.sqlpp index a24f39f..eb8db15 100644 --- a/libs/agentc_core/agentc_core/analytics/udfs/all_sessions.sqlpp +++ b/libs/agentc_core/agentc_core/analytics/udfs/all_sessions.sqlpp @@ -18,7 +18,7 @@ WITH PARTITION BY _id ORDER BY - rsli.timestamp ASC + STR_TO_MILLIS(rsli.timestamp) ASC ) AS rn ) SELECT @@ -29,7 +29,7 @@ WITH SELECT VALUE gi.rsl.timestamp ORDER BY - gi.rsl.timestamp ASC + STR_TO_MILLIS(gi.rsl.timestamp) ASC LIMIT 1 )[0] AS start_t, vid, @@ -50,19 +50,19 @@ LETTING SELECT VALUE gii ORDER BY - gii.rsl.timestamp ASC + STR_TO_MILLIS(gii.rsl.timestamp) ASC ) AS gi SELECT gi.rsl.content AS content, gi.rsl.timestamp AS timestamp, gi.rsl.kind AS kind, - ROW_NUMBER() OVER(ORDER BY gi.rsl.timestamp ASC) AS msg_num + ROW_NUMBER() OVER(ORDER BY STR_TO_MILLIS(gi.rsl.timestamp) ASC) AS msg_num ), vid = { "identifier": ver.identifier, "timestamp": ver.timestamp } ORDER BY - start_t DESC + STR_TO_MILLIS(start_t) DESC ) }; \ No newline at end of file diff --git a/libs/agentc_core/agentc_core/analytics/udfs/exchanges.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/exchanges.sqlpp index 12bf5ab..ea6fddc 100644 --- a/libs/agentc_core/agentc_core/analytics/udfs/exchanges.sqlpp +++ b/libs/agentc_core/agentc_core/analytics/udfs/exchanges.sqlpp @@ -1,5 +1,6 @@ -- Note: all_sessions.sqlpp should be run before this script. -CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryLatestMessages() AS +CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryLatestMessages() { +( WITH LM AS ( WITH msgs AS ( @@ -10,7 +11,7 @@ WITH msgs AS ( s.timestamp, s.grouping, s.session, - ROW_NUMBER() OVER (PARTITION BY s.grouping, s.session ORDER BY s.timestamp) AS row_num + ROW_NUMBER() OVER (PARTITION BY s.grouping, s.session ORDER BY STR_TO_MILLIS(s.timestamp)) AS row_num FROM `[BUCKET_NAME]`.`[SCOPE_NAME]`.`[LOG_COLLECTION_NAME]` AS s WHERE s.grouping IS NOT NULL ), @@ -18,13 +19,13 @@ first_human AS ( SELECT msgsi.row_num FROM msgs AS msgsi WHERE msgsi.kind = "human" - ORDER BY msgsi.timestamp ASC + ORDER BY STR_TO_MILLIS(msgsi.timestamp) ASC LIMIT 1 ), last_message AS ( SELECT msgsi.row_num FROM msgs AS msgsi - ORDER BY msgsi.timestamp DESC + ORDER BY STR_TO_MILLIS(msgsi.timestamp) DESC LIMIT 1 ) SELECT @@ -33,14 +34,14 @@ SELECT WHERE msgsi.kind = "human" AND msgsi.grouping = g.grouping AND msgsi.session = g.session - ORDER BY msgsi.timestamp ASC + ORDER BY STR_TO_MILLIS(msgsi.timestamp) ASC LIMIT 1) AS question, (SELECT RAW COALESCE(msgsi.content, msgsi.tool_calls) FROM msgs AS msgsi WHERE msgsi.grouping = g.grouping AND msgsi.session = g.session - ORDER BY msgsi.timestamp DESC + ORDER BY STR_TO_MILLIS(msgsi.timestamp) DESC LIMIT 1) AS answer, (SELECT RAW COALESCE(msgsi.content, msgsi.tool_calls) @@ -57,4 +58,6 @@ FROM (SELECT DISTINCT grouping, session FROM msgs) AS g SELECT (FROM LM e SELECT VALUE e.question ORDER BY e.row_num) AS question, (FROM LM e SELECT VALUE e.answer ORDER BY e.row_num) AS answer, -(FROM LM e SELECT VALUE e.contexts ORDER BY e.row_num) AS contexts; \ No newline at end of file +(FROM LM e SELECT VALUE e.contexts ORDER BY e.row_num) AS contexts +) +}; \ No newline at end of file diff --git a/libs/agentc_core/agentc_core/analytics/udfs/last_session.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/last_session.sqlpp index 072e2ea..6277774 100644 --- a/libs/agentc_core/agentc_core/analytics/udfs/last_session.sqlpp +++ b/libs/agentc_core/agentc_core/analytics/udfs/last_session.sqlpp @@ -7,7 +7,7 @@ CREATE OR REPLACE FUNCTION SELECT VALUE s.sid ORDER BY - s.start_t DESC + STR_TO_MILLIS(s.start_t) DESC LIMIT 1 )[0] }; diff --git a/libs/agentc_core/agentc_core/analytics/udfs/tool_calls.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/tool_calls.sqlpp index a79ceb8..3c8c44d 100644 --- a/libs/agentc_core/agentc_core/analytics/udfs/tool_calls.sqlpp +++ b/libs/agentc_core/agentc_core/analytics/udfs/tool_calls.sqlpp @@ -1,5 +1,6 @@ -- Note: all_sessions.sqlpp should be run before this script. -CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryToolCalls() +CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryToolCalls(){ +( SELECT s1.sid AS sid, s1.vid AS vid, @@ -30,5 +31,6 @@ LETTING gi.m2.content.dump.kwargs.status AS tool_status ) ORDER BY - vid.timestamp DESC -; \ No newline at end of file + STR_TO_MILLIS(vid.timestamp) DESC +) +}; \ No newline at end of file diff --git a/libs/agentc_core/agentc_core/analytics/udfs/trajectories.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/trajectories.sqlpp index 3070e7b..70990f7 100644 --- a/libs/agentc_core/agentc_core/analytics/udfs/trajectories.sqlpp +++ b/libs/agentc_core/agentc_core/analytics/udfs/trajectories.sqlpp @@ -17,7 +17,7 @@ CREATE OR REPLACE FUNCTION SELECT VALUE gi.msg ORDER BY - gi.msg.timestamp ASC + STR_TO_MILLIS(gi.msg.timestamp) ASC ) SELECT s.sid AS sid,