Merge pull request #17 from databricks/fix_dashes
Update dashes fix to work with vector search index
epec254 authored Jul 25, 2024
2 parents 1e205f1 + 71e3752 commit 277344a
Showing 14 changed files with 34 additions and 24 deletions.
@@ -101,8 +101,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

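For context on the change above: the backticked three-level name is kept for Spark SQL and Delta operations, while a backtick-free copy is derived for the Vector Search index, which appears to be the form the index requires (per the known issue this PR addresses). A minimal sketch of what this produces, using made-up catalog, schema, and app names rather than values from this repo:

# Hypothetical values, for illustration only
UC_CATALOG = "my-catalog"      # dashes require backticks in Spark SQL
UC_SCHEMA = "my-schema"
RAG_APP_NAME = "my_rag_app"

destination_tables_config = {
    "vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
# Strip the backticks for the Vector Search API
destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

print(destination_tables_config["vectorsearch_index_table_name"])
# `my-catalog`.`my-schema`.`my_rag_app_poc_chunked_docs_gold_index`
print(destination_tables_config["vectorsearch_index_name"])
# my-catalog.my-schema.my_rag_app_poc_chunked_docs_gold_index
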
@@ -457,7 +457,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -476,13 +476,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

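The find_index change in this file makes the helper honor the index_name argument it receives instead of always reading the global destination_tables_config; the endpoint is still taken from VECTOR_SEARCH_ENDPOINT inside the helper. A rough usage sketch under those assumptions, with a hypothetical index name:

# Hypothetical plain (backtick-free) index name, for illustration only
candidate_index = "my-catalog.my-schema.my_rag_app_poc_chunked_docs_gold_index"

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=candidate_index):
    print(f"Index {candidate_index} already exists")
else:
    print(f"Index {candidate_index} not found; it will be created")
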
3 changes: 2 additions & 1 deletion rag_app_sample_code/A_POC_app/docx_uc_volume/00_config.py
@@ -94,8 +94,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

@@ -451,7 +451,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -470,13 +470,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

3 changes: 2 additions & 1 deletion rag_app_sample_code/A_POC_app/html_uc_volume/00_config.py
@@ -94,8 +94,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

@@ -406,7 +406,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -425,13 +425,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)
mlflow.log_input(mlflow.data.load_delta(table_name=destination_tables_config.get("chunked_docs_table_name")), context="chunked_docs")

# COMMAND ----------
4 changes: 2 additions & 2 deletions rag_app_sample_code/A_POC_app/pdf_uc_volume/00_config.py
@@ -94,9 +94,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
-
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")
# COMMAND ----------

# MAGIC %md
@@ -423,13 +423,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

3 changes: 2 additions & 1 deletion rag_app_sample_code/A_POC_app/pptx_uc_volume/00_config.py
@@ -103,8 +103,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

@@ -553,7 +553,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -572,13 +572,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

@@ -1 +1,4 @@
-View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+
+Known issues:
+* Catalog and Schema names with dashes result in an error loading the vector index
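
In other words, dashed catalog and schema names must be backtick-quoted for Spark SQL, but the quoted form seems to fail when passed to Vector Search as an index or source table name; the fix in this PR keeps the backticked name for Delta/SQL use and strips the backticks before calling the Vector Search APIs. A tiny illustration with made-up names:

# Hypothetical names, for illustration only
sql_safe_name = "`my-catalog`.`my-schema`.`chunked_docs_gold_index`"  # quoted form for Spark SQL
vector_search_name = sql_safe_name.replace("`", "")                   # "my-catalog.my-schema.chunked_docs_gold_index"
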
@@ -27,7 +27,7 @@ def get_strategy_packed_json_string(baseline_strategy, strategy_to_try):
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{strategy_name}_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{strategy_name}_chunked_docs_gold_index`",
+"vectorsearch_index_name": f"{UC_CATALOG}.{UC_SCHEMA}.{strategy_name}_chunked_docs_gold_index",
# Streaming checkpoints, used to only process each file once
"checkpoint_path": f"{CHECKPOINTS_VOLUME_PATH}/{strategy_name}/",
}
@@ -203,7 +203,7 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{config_short_name}_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{config_short_name}_chunked_docs_gold_index`",
+"vectorsearch_index_name": f"{UC_CATALOG}.{UC_SCHEMA}.{config_short_name}_chunked_docs_gold_index",
# Streaming checkpoints, used to only process each file once
"checkpoint_path": f"{CHECKPOINTS_VOLUME_PATH}/{config_short_name}",
}
@@ -1 +1,4 @@
-View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+
+Known issues:
+* Catalog and Schema names with dashes result in an error loading the vector index
