couchbaselabs · TJ202 · Feb 13, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025
diff --git a/libs/agentc/agentc/auditor.py b/libs/agentc/agentc/auditor.py
@@ -149,7 +149,8 @@ def _initialize_auditor(self) -> typing.Self:
         if self.auditor_output is None and self.conn_string is None:
             error_message = textwrap.dedent("""
                 Could not initialize a local or remote auditor!
-                If this is a new project, please run the command `agentc index` before instantiating an auditor.
+                If this is a new project, please run the command `agentc init activity` before instantiating an auditor.
+                Execute `agentc init --help` for more information.
                 If you are intending to use a remote-only auditor, please ensure that all of the relevant variables
                 (i.e., conn_string, username, password, and bucket) are set.
             """)

diff --git a/libs/agentc_cli/agentc_cli/cmds/init.py b/libs/agentc_cli/agentc_cli/cmds/init.py
@@ -5,6 +5,7 @@
 from .util import init_db_catalog
 from .util import init_local_activity
 from .util import init_local_catalog
+from .util import init_local_embedding_model
 from agentc_core.util.models import CouchbaseConnect
 from agentc_core.util.models import Keyspace
 from agentc_core.util.publish import get_connection
@@ -25,6 +26,10 @@ def cmd_init(
     initialize_db = "db" in catalog_type
     initialize_catalog = "catalog" in type_metadata
     initialize_auditor = "auditor" in type_metadata
+    initialize_model = "model" in type_metadata
+
+    if initialize_model:
+        init_local_embedding_model()
 
     if initialize_local:
         if initialize_catalog:

diff --git a/libs/agentc_cli/agentc_cli/cmds/util.py b/libs/agentc_cli/agentc_cli/cmds/util.py
@@ -10,7 +10,8 @@
 import typing
 
 from ..models.context import Context
-from agentc_core.analytics.create import create_analytics_udfs
+from agentc_core.analytics.create import create_analytics_views
+from agentc_core.analytics.create import create_query_udfs
 from agentc_core.catalog import CatalogChain
 from agentc_core.catalog import CatalogDB
 from agentc_core.catalog import CatalogMem
@@ -23,8 +24,10 @@
 from agentc_core.defaults import DEFAULT_AUDIT_SCOPE
 from agentc_core.defaults import DEFAULT_CATALOG_COLLECTION_NAME
 from agentc_core.defaults import DEFAULT_CATALOG_NAME
+from agentc_core.defaults import DEFAULT_EMBEDDING_MODEL
 from agentc_core.defaults import DEFAULT_MAX_ERRS
 from agentc_core.defaults import DEFAULT_META_COLLECTION_NAME
+from agentc_core.defaults import DEFAULT_MODEL_CACHE_FOLDER
 from agentc_core.defaults import DEFAULT_SCAN_DIRECTORY_OPTS
 from agentc_core.learned.embedding import EmbeddingModel
 from agentc_core.util.ddl import create_gsi_indexes
@@ -266,10 +269,35 @@ def init_db_auditor(ctx: Context, cluster: Cluster, keyspace_details: Keyspace):
     else:
         click.secho("Scope and collection for the auditor have been successfully created!\n", fg="green")
 
-    click.secho("Now creating the analytics UDFs for the auditor.", fg="yellow")
+    click.secho("Now creating query UDFs for the auditor.", fg="yellow")
     try:
-        create_analytics_udfs(cluster, keyspace_details.bucket)
-        click.secho("All analytics UDFs for the auditor have been successfully created!\n", fg="green")
+        create_query_udfs(cluster, keyspace_details.bucket)
+        click.secho("All query UDFs for the auditor have been successfully created!\n", fg="green")
+    except CouchbaseException as e:
+        click.secho("Query UDFs could not be created.", fg="red")
+        logger.warning("Query UDFs could not be created: %s", e)
+
+    click.secho("Now creating the analytics views for the auditor.", fg="yellow")
+    try:
+        create_analytics_views(cluster, keyspace_details.bucket)
+        click.secho("All analytics views for the auditor have been successfully created!\n", fg="green")
     except CouchbaseException as e:
         click.secho("Analytics views could not be created.", fg="red")
         logger.warning("Analytics views could not be created: %s", e)
+
+
+def init_local_embedding_model():
+    """Fetch the embedding model into DEFAULT_MODEL_CACHE_FOLDER; raises RuntimeError if it cannot be loaded."""
+    import sentence_transformers  # imported here, not at module level, to avoid large import times
+
+    # Resolved once: keeps nested quotes out of the f-string below (a SyntaxError before Python 3.12).
+    model_name = os.getenv("AGENT_CATALOG_EMBEDDING_MODEL_NAME", DEFAULT_EMBEDDING_MODEL)
+    try:
+        sentence_transformers.SentenceTransformer(
+            model_name,
+            tokenizer_kwargs={"clean_up_tokenization_spaces": True},
+            cache_folder=DEFAULT_MODEL_CACHE_FOLDER,
+            local_files_only=False,
+        )
+    except Exception as e:
+        raise RuntimeError(f"Unable to download model {model_name}!!\n{e}") from None
diff --git a/libs/agentc_cli/agentc_cli/main.py b/libs/agentc_cli/agentc_cli/main.py
@@ -131,7 +131,7 @@ def click_main(ctx, catalog, activity, verbose, interactive):
 )
 @click.argument(
     "type_metadata",
-    type=click.Choice(["catalog", "auditor", "all"], case_sensitive=False),
+    type=click.Choice(["model", "catalog", "auditor", "all"], case_sensitive=False),
 )
 @click.option(
     "--bucket",
@@ -141,18 +141,23 @@ def click_main(ctx, catalog, activity, verbose, interactive):
     show_default=False,
 )
 def init(ctx, catalog_type, type_metadata, bucket):
-    """Initialize the necessary files/collections for local/database catalog."""
+    """Initialize the necessary files/collections for local/database catalog or download sentence-transformer model required for embedding."""
     ctx_obj: Context = ctx.obj
 
     if not catalog_type:
         catalog_type = ["local", "db"]
 
-    type_metadata = ["catalog", "auditor"] if type_metadata == "all" else [type_metadata]
+    type_metadata = ["model", "catalog", "auditor"] if type_metadata == "all" else [type_metadata]
 
     connection_details_env = None
     keyspace_details = None
 
     if "db" in catalog_type:
+        if "model" in type_metadata:
+            raise ValueError(
+                "Model initialization can be used only with local keyword, db is not supported!!\nPlease execute separately 'agentc init local model' to download the model."
+            )
+
         # Load all Couchbase connection related data from env
         connection_details_env = CouchbaseConnect(
             connection_url=os.getenv("AGENT_CATALOG_CONN_STRING"),
@@ -529,7 +534,7 @@ def find(
 @click.option(
     "-em",
     "--embedding-model-name",
-    default=DEFAULT_EMBEDDING_MODEL,
+    default=os.getenv("AGENT_CATALOG_EMBEDDING_MODEL_NAME", DEFAULT_EMBEDDING_MODEL),
     help="Name of the embedding model used when indexing source files into the local catalog.",
     show_default=True,
 )

diff --git a/libs/agentc_cli/tests/test_click.py b/libs/agentc_cli/tests/test_click.py
@@ -49,6 +49,7 @@ def test_index(tmp_path):
             pathlib.Path(tool_folder / tool.parent.name).mkdir(exist_ok=True)
             shutil.copy(tool, tool_folder / tool.parent.name / (uuid.uuid4().hex + tool.suffix))
         shutil.copy(resources_folder / "_good_spec.json", tool_folder / "_good_spec.json")
+        runner.invoke(click_main, ["init", "local", "model"])
         invocation = runner.invoke(click_main, ["index", str(tool_folder.absolute()), "--no-prompts"])
 
         # We should see 11 files scanned and 12 tools indexed.
@@ -489,11 +490,25 @@ def test_init_local(tmp_path):
 
         runner.invoke(click_main, ["init", "local", "catalog"])
         files_present = os.listdir()
-        assert ".agent-catalog" in files_present and ".agent-activity" not in files_present
+        assert (
+            ".agent-catalog" in files_present
+            and ".agent-activity" not in files_present
+            and ".model-cache" not in files_present
+        )
 
         runner.invoke(click_main, ["init", "local", "auditor"])
         files_present = os.listdir()
-        assert ".agent-catalog" in files_present and ".agent-activity" in files_present
+        assert (
+            ".agent-catalog" in files_present
+            and ".agent-activity" in files_present
+            and ".model-cache" not in files_present
+        )
+
+        runner.invoke(click_main, ["init", "local", "model"])
+        files_present = os.listdir()
+        assert (
+            ".agent-catalog" in files_present and ".agent-activity" in files_present and ".model-cache" in files_present
+        )
 
 
 @pytest.mark.smoke
@@ -505,4 +520,6 @@ def test_init_local_all(tmp_path):
 
         runner.invoke(click_main, ["init", "local", "all"])
         files_present = os.listdir()
-        assert ".agent-catalog" in files_present and ".agent-activity" in files_present
+        assert (
+            ".agent-catalog" in files_present and ".agent-activity" in files_present and ".model-cache" in files_present
+        )
diff --git a/libs/agentc_core/agentc_core/analytics/create.py b/libs/agentc_core/agentc_core/analytics/create.py
@@ -28,7 +28,7 @@
 #                 pass
 
 
-def create_analytics_udfs(cluster: couchbase.cluster.Cluster, bucket: str) -> None:
+def create_analytics_views(cluster: couchbase.cluster.Cluster, bucket: str) -> None:
     logger.debug("Creating analytics log scope.")
     ddl_result = cluster.analytics_query(f"""
         CREATE ANALYTICS SCOPE `{bucket}`.`{DEFAULT_AUDIT_SCOPE}`
@@ -63,3 +63,21 @@ def create_analytics_udfs(cluster: couchbase.cluster.Cluster, bucket: str) -> No
             ddl_result = cluster.analytics_query(ddl_string)
             for _ in ddl_result.rows():
                 pass
+
+
+def create_query_udfs(cluster: couchbase.cluster.Cluster, bucket: str) -> None:
+    """Run each file in the sibling ``udfs`` folder, in sorted order, as a query on `cluster` for `bucket`."""
+    udfs_folder = pathlib.Path(__file__).parent / "udfs"
+    # Only regular files are statements; a sub-directory in the folder would otherwise fail on open.
+    udfs_files = sorted(file for file in udfs_folder.iterdir() if file.is_file())
+    for udf_file in udfs_files:
+        raw_udf_string = udf_file.read_text(encoding="utf-8")  # explicit: do not rely on the locale default
+        udf_string = (
+            raw_udf_string.replace("[BUCKET_NAME]", bucket)
+            .replace("[SCOPE_NAME]", DEFAULT_AUDIT_SCOPE)
+            .replace("[LOG_COLLECTION_NAME]", DEFAULT_AUDIT_COLLECTION)
+        )
+        logger.debug("Issuing the following statement: %s", udf_string)
+        # Rows are consumed and discarded -- presumably to force the statement to run; confirm against the SDK.
+        for _ in cluster.query(udf_string).rows():
+            pass
diff --git a/libs/agentc_core/agentc_core/analytics/udfs/all_sessions.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/all_sessions.sqlpp
@@ -18,7 +18,7 @@ WITH
                     PARTITION BY
                         _id
                     ORDER BY
-                        rsli.timestamp ASC
+                        STR_TO_MILLIS(rsli.timestamp) ASC
                 ) AS rn
     )
     SELECT
@@ -29,7 +29,7 @@ WITH
         SELECT VALUE
             gi.rsl.timestamp
         ORDER BY
-            gi.rsl.timestamp ASC
+            STR_TO_MILLIS(gi.rsl.timestamp) ASC
         LIMIT 1
     )[0] AS start_t,
     vid,
@@ -50,19 +50,19 @@ LETTING
             SELECT VALUE
                 gii
             ORDER BY
-                gii.rsl.timestamp ASC
+                STR_TO_MILLIS(gii.rsl.timestamp) ASC
         ) AS gi
         SELECT
             gi.rsl.content AS content,
             gi.rsl.timestamp AS timestamp,
             gi.rsl.kind AS kind,
-            ROW_NUMBER() OVER(ORDER BY gi.rsl.timestamp ASC) AS msg_num
+            ROW_NUMBER() OVER(ORDER BY STR_TO_MILLIS(gi.rsl.timestamp) ASC) AS msg_num
     ),
     vid = {
         "identifier": ver.identifier,
         "timestamp": ver.timestamp
     }
     ORDER BY
-    start_t DESC
+    STR_TO_MILLIS(start_t) DESC
 )
 };
diff --git a/libs/agentc_core/agentc_core/analytics/udfs/exchanges.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/exchanges.sqlpp
@@ -1,5 +1,6 @@
 -- Note: all_sessions.sqlpp should be run before this script.
-CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryLatestMessages() AS
+CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryLatestMessages() {
+(
 WITH LM AS
 (
 WITH msgs AS (
@@ -10,21 +11,21 @@ WITH msgs AS (
         s.timestamp,
         s.grouping,
         s.session,
-        ROW_NUMBER() OVER (PARTITION BY s.grouping, s.session ORDER BY s.timestamp) AS row_num
+        ROW_NUMBER() OVER (PARTITION BY s.grouping, s.session ORDER BY STR_TO_MILLIS(s.timestamp)) AS row_num
     FROM `[BUCKET_NAME]`.`[SCOPE_NAME]`.`[LOG_COLLECTION_NAME]` AS s
     WHERE s.grouping IS NOT NULL
 ),
 first_human AS (
     SELECT msgsi.row_num
     FROM msgs AS msgsi
     WHERE msgsi.kind = "human"
-    ORDER BY msgsi.timestamp ASC
+    ORDER BY STR_TO_MILLIS(msgsi.timestamp) ASC
     LIMIT 1
 ),
 last_message AS (
     SELECT msgsi.row_num
     FROM msgs AS msgsi
-    ORDER BY msgsi.timestamp DESC
+    ORDER BY STR_TO_MILLIS(msgsi.timestamp) DESC
     LIMIT 1
 )
 SELECT
@@ -33,14 +34,14 @@ SELECT
      WHERE msgsi.kind = "human"
      AND msgsi.grouping = g.grouping
      AND msgsi.session = g.session
-     ORDER BY msgsi.timestamp ASC
+     ORDER BY STR_TO_MILLIS(msgsi.timestamp) ASC
      LIMIT 1) AS question,
 
     (SELECT RAW COALESCE(msgsi.content, msgsi.tool_calls)
      FROM msgs AS msgsi
      WHERE msgsi.grouping = g.grouping
      AND msgsi.session = g.session
-     ORDER BY msgsi.timestamp DESC
+     ORDER BY STR_TO_MILLIS(msgsi.timestamp) DESC
      LIMIT 1) AS answer,
 
     (SELECT RAW COALESCE(msgsi.content, msgsi.tool_calls)
@@ -57,4 +58,6 @@ FROM (SELECT DISTINCT grouping, session FROM msgs) AS g
 SELECT
 (FROM LM e SELECT VALUE e.question ORDER BY e.row_num) AS question,
 (FROM LM e SELECT VALUE e.answer ORDER BY e.row_num) AS answer,
-(FROM LM e SELECT VALUE e.contexts ORDER BY e.row_num) AS contexts;
+(FROM LM e SELECT VALUE e.contexts ORDER BY e.row_num) AS contexts
+)
+};
diff --git a/libs/agentc_core/agentc_core/analytics/udfs/last_session.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/last_session.sqlpp
@@ -7,7 +7,7 @@ CREATE OR REPLACE FUNCTION
             SELECT VALUE
                 s.sid
             ORDER BY
-                s.start_t DESC
+                STR_TO_MILLIS(s.start_t) DESC
             LIMIT 1
         )[0]
     };
diff --git a/libs/agentc_core/agentc_core/analytics/udfs/tool_calls.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/tool_calls.sqlpp
@@ -1,5 +1,6 @@
 -- Note: all_sessions.sqlpp should be run before this script.
-CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryToolCalls()
+CREATE OR REPLACE FUNCTION `[BUCKET_NAME]`.`[SCOPE_NAME]`.QueryToolCalls(){
+(
 SELECT
     s1.sid AS sid,
     s1.vid AS vid,
@@ -30,5 +31,6 @@ LETTING
             gi.m2.content.dump.kwargs.status AS tool_status
     )
 ORDER BY
-    vid.timestamp DESC
-;
+    STR_TO_MILLIS(vid.timestamp) DESC
+)
+};
diff --git a/libs/agentc_core/agentc_core/analytics/udfs/trajectories.sqlpp b/libs/agentc_core/agentc_core/analytics/udfs/trajectories.sqlpp
@@ -17,7 +17,7 @@ CREATE OR REPLACE FUNCTION
                 SELECT VALUE
                     gi.msg
                 ORDER BY
-                    gi.msg.timestamp ASC
+                    STR_TO_MILLIS(gi.msg.timestamp) ASC
             )
         SELECT
             s.sid AS sid,

diff --git a/libs/agentc_core/agentc_core/learned/embedding.py b/libs/agentc_core/agentc_core/learned/embedding.py
@@ -182,12 +182,17 @@ def _encode(_text: str) -> list[float]:
             else:
                 import sentence_transformers
 
-                sentence_transformers_model = sentence_transformers.SentenceTransformer(
-                    self.embedding_model_name,
-                    tokenizer_kwargs={"clean_up_tokenization_spaces": True},
-                    cache_folder=DEFAULT_MODEL_CACHE_FOLDER,
-                    local_files_only=False,
-                )
+                try:
+                    sentence_transformers_model = sentence_transformers.SentenceTransformer(
+                        self.embedding_model_name,
+                        tokenizer_kwargs={"clean_up_tokenization_spaces": True},
+                        cache_folder=DEFAULT_MODEL_CACHE_FOLDER,
+                        local_files_only=True,
+                    )
+                except OSError:
+                    raise ValueError(
+                        f"Unable to find local embedding model {self.embedding_model_name}!!!\nPlease execute 'agentc init local model' to download the model."
+                    ) from None
 
                 def _encode(_text: str) -> list[float]:
                     return sentence_transformers_model.encode(_text, convert_to_tensor=False).tolist()