Merge pull request #17 from databricks/fix_dashes
Update dashes fix to work with vector search index
epec254 authored Jul 25, 2024
2 parents 1e205f1 + 71e3752 commit 277344a
Showing 14 changed files with 34 additions and 24 deletions.
@@ -101,8 +101,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

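For context on the change above: the backticked three-level name is kept for Spark SQL and Delta operations, while a backtick-free copy is derived for the Vector Search index, which appears to be the form the index requires (per the known issue this PR addresses). A minimal sketch of what this produces, using made-up catalog, schema, and app names rather than values from this repo:

# Hypothetical values, for illustration only
UC_CATALOG = "my-catalog"      # dashes require backticks in Spark SQL
UC_SCHEMA = "my-schema"
RAG_APP_NAME = "my_rag_app"

destination_tables_config = {
    "vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
# Strip the backticks for the Vector Search API
destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

print(destination_tables_config["vectorsearch_index_table_name"])
# `my-catalog`.`my-schema`.`my_rag_app_poc_chunked_docs_gold_index`
print(destination_tables_config["vectorsearch_index_name"])
# my-catalog.my-schema.my_rag_app_poc_chunked_docs_gold_index
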
@@ -457,7 +457,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -476,13 +476,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

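The find_index change in this file makes the helper honor the index_name argument it receives instead of always reading the global destination_tables_config; the endpoint is still taken from VECTOR_SEARCH_ENDPOINT inside the helper. A rough usage sketch under those assumptions, with a hypothetical index name:

# Hypothetical plain (backtick-free) index name, for illustration only
candidate_index = "my-catalog.my-schema.my_rag_app_poc_chunked_docs_gold_index"

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=candidate_index):
    print(f"Index {candidate_index} already exists")
else:
    print(f"Index {candidate_index} not found; it will be created")
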
3 changes: 2 additions & 1 deletion rag_app_sample_code/A_POC_app/docx_uc_volume/00_config.py
@@ -94,8 +94,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

@@ -451,7 +451,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -470,13 +470,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

3 changes: 2 additions & 1 deletion rag_app_sample_code/A_POC_app/html_uc_volume/00_config.py
@@ -94,8 +94,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

@@ -406,7 +406,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -425,13 +425,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)
mlflow.log_input(mlflow.data.load_delta(table_name=destination_tables_config.get("chunked_docs_table_name")), context="chunked_docs")

# COMMAND ----------
4 changes: 2 additions & 2 deletions rag_app_sample_code/A_POC_app/pdf_uc_volume/00_config.py
@@ -94,9 +94,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
-
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")
# COMMAND ----------

# MAGIC %md
@@ -423,13 +423,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

3 changes: 2 additions & 1 deletion rag_app_sample_code/A_POC_app/pptx_uc_volume/00_config.py
@@ -103,8 +103,9 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
+"vectorsearch_index_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{RAG_APP_NAME}_poc_chunked_docs_gold_index`",
}
+destination_tables_config["vectorsearch_index_name"] = destination_tables_config["vectorsearch_index_table_name"].replace("`", "")

# COMMAND ----------

@@ -553,7 +553,7 @@ def chunk_parsed_content_langrecchar(

def find_index(endpoint_name, index_name):
all_indexes = vsc.list_indexes(name=VECTOR_SEARCH_ENDPOINT).get("vector_indexes", [])
-return destination_tables_config["vectorsearch_index_name"] in map(lambda i: i.get("name"), all_indexes)
+return index_name in map(lambda i: i.get("name"), all_indexes)

if find_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=destination_tables_config["vectorsearch_index_name"]):
if force_delete:
@@ -572,13 +572,13 @@ def find_index(endpoint_name, index_name):
endpoint_name=VECTOR_SEARCH_ENDPOINT,
index_name=destination_tables_config["vectorsearch_index_name"],
primary_key="chunk_id",
-source_table_name=destination_tables_config["chunked_docs_table_name"],
+source_table_name=destination_tables_config["chunked_docs_table_name"].replace("`", ""),
pipeline_type=vectorsearch_config['pipeline_type'],
embedding_source_column="chunked_text",
embedding_model_endpoint_name=embedding_config['embedding_endpoint_name']
)

-tag_delta_table(destination_tables_config["vectorsearch_index_name"], data_pipeline_config)
+tag_delta_table(destination_tables_config["vectorsearch_index_table_name"], data_pipeline_config)

# COMMAND ----------

@@ -1 +1,4 @@
-View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+
+Known issues:
+* Catalog and Schema names with dashes result in an error loading the vector index
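
In other words, dashed catalog and schema names must be backtick-quoted for Spark SQL, but the quoted form seems to fail when passed to Vector Search as an index or source table name; the fix in this PR keeps the backticked name for Delta/SQL use and strips the backticks before calling the Vector Search APIs. A tiny illustration with made-up names:

# Hypothetical names, for illustration only
sql_safe_name = "`my-catalog`.`my-schema`.`chunked_docs_gold_index`"  # quoted form for Spark SQL
vector_search_name = sql_safe_name.replace("`", "")                   # "my-catalog.my-schema.chunked_docs_gold_index"
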
@@ -27,7 +27,7 @@ def get_strategy_packed_json_string(baseline_strategy, strategy_to_try):
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{strategy_name}_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{strategy_name}_chunked_docs_gold_index`",
+"vectorsearch_index_name": f"{UC_CATALOG}.{UC_SCHEMA}.{strategy_name}_chunked_docs_gold_index",
# Streaming checkpoints, used to only process each file once
"checkpoint_path": f"{CHECKPOINTS_VOLUME_PATH}/{strategy_name}/",
}
@@ -203,7 +203,7 @@
# Chunked documents that are loaded into the Vector Index
"chunked_docs_table_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{config_short_name}_chunked_docs_gold`",
# Destination Vector Index
-"vectorsearch_index_name": f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{config_short_name}_chunked_docs_gold_index`",
+"vectorsearch_index_name": f"{UC_CATALOG}.{UC_SCHEMA}.{config_short_name}_chunked_docs_gold_index",
# Streaming checkpoints, used to only process each file once
"checkpoint_path": f"{CHECKPOINTS_VOLUME_PATH}/{config_short_name}",
}
@@ -1 +1,4 @@
-View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+View the instructions for using this pipeline at http://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-2-data-pipeline.html
+
+Known issues:
+* Catalog and Schema names with dashes result in an error loading the vector index
