From fe6e73e83126ff5f8082c5340ae557eae5da8b29 Mon Sep 17 00:00:00 2001 From: "Jack Boswell (boswelja)" Date: Wed, 18 Dec 2024 13:12:45 +1100 Subject: [PATCH] Embeder -> Embedder --- README.md | 12 +-- docs/blog/posts/embed-anything.md | 2 +- docs/blog/posts/vector-streaming.md | 4 +- docs/index.md | 8 +- examples/adapters/elastic.py | 2 +- examples/adapters/pinecone_db.py | 2 +- examples/adapters/weaviate_db.py | 4 +- examples/audio.py | 4 +- examples/clip.py | 4 +- examples/hybridsearch.py | 8 +- examples/onnx_models.py | 2 +- examples/semantic_chunking.py | 2 +- examples/splade.py | 2 +- examples/text.py | 8 +- examples/text_ocr.py | 2 +- examples/web.py | 2 +- python/python/embed_anything/__init__.py | 12 +-- .../python/embed_anything/_embed_anything.pyi | 34 ++++---- python/src/lib.rs | 36 ++++----- rust/examples/web_embed.rs | 8 +- rust/src/embeddings/embed.rs | 34 ++++---- rust/src/embeddings/local/clip.rs | 18 ++--- rust/src/embeddings/local/jina.rs | 4 +- rust/src/embeddings/mod.rs | 4 +- rust/src/file_processor/html_processor.rs | 12 +-- rust/src/file_processor/website_processor.rs | 12 +-- rust/src/lib.rs | 78 +++++++++---------- rust/src/text_loader.rs | 10 +-- tests/model_tests/test_adapter.py | 6 +- 29 files changed, 168 insertions(+), 168 deletions(-) diff --git a/README.md b/README.md index 36cf7e80..fb89f98f 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ model = EmbeddingModel.from_pretrained_hf( WhichModel.Bert, model_id="model link from huggingface" ) config = TextEmbedConfig(chunk_size=200, batch_size=32) -data = embed_anything.embed_file("file_address", embeder=model, config=config) +data = embed_anything.embed_file("file_address", embedder=model, config=config) ``` @@ -190,7 +190,7 @@ pip install embed-anything-gpu model = EmbeddingModel.from_pretrained_local( WhichModel.Bert, model_id="Hugging_face_link" ) -data = embed_anything.embed_file("test_files/test.pdf", embeder=model) +data = embed_anything.embed_file("test_files/test.pdf", embedder=model) ``` @@ -206,11 +206,11 @@ model = embed_anything.EmbeddingModel.from_pretrained_local( model_id="openai/clip-vit-base-patch16", # revision="refs/pr/15", ) -data: list[EmbedData] = embed_anything.embed_directory("test_files", embeder=model) +data: list[EmbedData] = embed_anything.embed_directory("test_files", embedder=model) embeddings = np.array([data.embedding for data in data]) query = ["Photo of a monkey?"] query_embedding = np.array( - embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) similarities = np.dot(embeddings, query_embedding) max_index = np.argmax(similarities) @@ -233,7 +233,7 @@ from embed_anything import ( audio_decoder = AudioDecoderModel.from_pretrained_hf( "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) -embeder = EmbeddingModel.from_pretrained_hf( +embedder = EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -242,7 +242,7 @@ config = TextEmbedConfig(chunk_size=200, batch_size=32) data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) print(data[0].metadata) diff --git a/docs/blog/posts/embed-anything.md b/docs/blog/posts/embed-anything.md index d032db7c..9c545325 100644 --- a/docs/blog/posts/embed-anything.md +++ b/docs/blog/posts/embed-anything.md @@ -115,7 +115,7 @@ model = EmbeddingModel.from_pretrained_hf( WhichModel.Bert, model_id="model link from huggingface" ) config = TextEmbedConfig(chunk_size=200, batch_size=32) -data = embed_anything.embed_file("file_address", embeder=model, config=config) +data = embed_anything.embed_file("file_address", embedder=model, config=config) ``` You can check out the documentation at https://starlight-search.com/references/ diff --git a/docs/blog/posts/vector-streaming.md b/docs/blog/posts/vector-streaming.md index f768ecbd..abc2c35e 100644 --- a/docs/blog/posts/vector-streaming.md +++ b/docs/blog/posts/vector-streaming.md @@ -114,7 +114,7 @@ model = embed_anything.EmbeddingModel.from_pretrained_cloud( data = embed_anything.embed_image_directory( "\image_directory", - embeder=model, + embedder=model, adapter=weaviate_adapter, config=embed_anything.ImageEmbedConfig(buffer_size=100), ) @@ -124,7 +124,7 @@ data = embed_anything.embed_image_directory( #### Step 4: Query the Vector Database ```python -query_vector = embed_anything.embed_query(["image of a cat"], embeder=model)[0].embedding +query_vector = embed_anything.embed_query(["image of a cat"], embedder=model)[0].embedding ``` #### Step 5: Query the Vector Database diff --git a/docs/index.md b/docs/index.md index 02643ed6..e45cf78e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -119,7 +119,7 @@ pip install embed-anything-gpu model = EmbeddingModel.from_pretrained_local( WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2" ) -data = embed_anything.embed_file("test_files/test.pdf", embeder=model) +data = embed_anything.embed_file("test_files/test.pdf", embedder=model) ``` @@ -162,11 +162,11 @@ model = embed_anything.EmbeddingModel.from_pretrained_local( model_id="openai/clip-vit-base-patch16", # revision="refs/pr/15", ) -data: list[EmbedData] = embed_anything.embed_directory("test_files", embeder=model) +data: list[EmbedData] = embed_anything.embed_directory("test_files", embedder=model) embeddings = np.array([data.embedding for data in data]) query = ["Photo of a monkey?"] query_embedding = np.array( - embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) similarities = np.dot(embeddings, query_embedding) max_index = np.argmax(similarities) @@ -199,7 +199,7 @@ jina_config = JinaConfig( config = EmbedConfig(jina=jina_config, audio_decoder=audio_decoder_config) data = embed_anything.embed_file( - "test_files/audio/samples_hp0.wav", embeder="Audio", config=config + "test_files/audio/samples_hp0.wav", embedder="Audio", config=config ) print(data[0].metadata) end_time = time.time() diff --git a/examples/adapters/elastic.py b/examples/adapters/elastic.py index 0600090f..4b70f606 100644 --- a/examples/adapters/elastic.py +++ b/examples/adapters/elastic.py @@ -77,7 +77,7 @@ def upsert(self, data: List[Dict]): data = embed_anything.embed_file( "/path/to/my-file.pdf", - embeder="Bert", + embedder="Bert", adapter=elasticsearch_adapter, config=embed_config, ) diff --git a/examples/adapters/pinecone_db.py b/examples/adapters/pinecone_db.py index de054f7e..7545bc7e 100644 --- a/examples/adapters/pinecone_db.py +++ b/examples/adapters/pinecone_db.py @@ -123,7 +123,7 @@ def upsert(self, data: List[Dict]): data = embed_anything.embed_image_directory( "test_files", - embeder=clip_model, + embedder=clip_model, adapter=pinecone_adapter, config=embed_config, ) diff --git a/examples/adapters/weaviate_db.py b/examples/adapters/weaviate_db.py index e0dbf456..1fc54780 100644 --- a/examples/adapters/weaviate_db.py +++ b/examples/adapters/weaviate_db.py @@ -65,10 +65,10 @@ def delete_index(self, index_name: str): data = embed_anything.embed_directory( - "test_files", embeder=model, adapter=weaviate_adapter + "test_files", embedder=model, adapter=weaviate_adapter ) -query_vector = embed_anything.embed_query(["What is attention"], embeder=model)[ +query_vector = embed_anything.embed_query(["What is attention"], embedder=model)[ 0 ].embedding diff --git a/examples/audio.py b/examples/audio.py index 5277d0da..ba000aee 100644 --- a/examples/audio.py +++ b/examples/audio.py @@ -14,7 +14,7 @@ "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) -embeder = EmbeddingModel.from_pretrained_hf( +embedder = EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -24,7 +24,7 @@ data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) print(data[0].metadata) diff --git a/examples/clip.py b/examples/clip.py index ca4b14cd..61dcd80c 100644 --- a/examples/clip.py +++ b/examples/clip.py @@ -11,7 +11,7 @@ model_id="openai/clip-vit-base-patch16", ) data: list[EmbedData] = embed_anything.embed_image_directory( - "test_files", embeder=model + "test_files", embedder=model ) # Convert the embeddings to a numpy array @@ -22,7 +22,7 @@ # Embed a query query = ["Photo of a monkey?"] query_embedding = np.array( - embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) # Calculate the similarities between the query embedding and all the embeddings diff --git a/examples/hybridsearch.py b/examples/hybridsearch.py index c3d635d4..855cabc8 100644 --- a/examples/hybridsearch.py +++ b/examples/hybridsearch.py @@ -50,16 +50,16 @@ WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en" ) -jina_embedddings = embed_anything.embed_query(sentences, embeder=jina_model) -jina_query = embed_anything.embed_query(query_text, embeder=jina_model)[0] +jina_embedddings = embed_anything.embed_query(sentences, embedder=jina_model) +jina_query = embed_anything.embed_query(query_text, embedder=jina_model)[0] splade_model = EmbeddingModel.from_pretrained_hf( WhichModel.SparseBert, "prithivida/Splade_PP_en_v1" ) -jina_embedddings = embed_anything.embed_query(sentences, embeder=jina_model) +jina_embedddings = embed_anything.embed_query(sentences, embedder=jina_model) -splade_query = embed_anything.embed_query(query_text, embeder=splade_model) +splade_query = embed_anything.embed_query(query_text, embedder=splade_model) client.query_points( collection_name="my-hybrid-collection", diff --git a/examples/onnx_models.py b/examples/onnx_models.py index 312556cc..a6752f8f 100644 --- a/examples/onnx_models.py +++ b/examples/onnx_models.py @@ -25,7 +25,7 @@ "The dog is sitting in the park", ] -embedddings = embed_query(sentences, embeder=model) +embedddings = embed_query(sentences, embedder=model) embed_vector = np.array([e.embedding for e in embedddings]) diff --git a/examples/semantic_chunking.py b/examples/semantic_chunking.py index 7575f60a..20e59a5c 100644 --- a/examples/semantic_chunking.py +++ b/examples/semantic_chunking.py @@ -16,7 +16,7 @@ semantic_encoder=semantic_encoder, ) -data = embed_anything.embed_file("test_files/bank.txt", embeder=model, config=config) +data = embed_anything.embed_file("test_files/bank.txt", embedder=model, config=config) for d in data: print(d.text) diff --git a/examples/splade.py b/examples/splade.py index 4f806614..40b7738f 100644 --- a/examples/splade.py +++ b/examples/splade.py @@ -22,7 +22,7 @@ "Do you like pizza?", ] -embedddings = embed_query(sentences, embeder=model) +embedddings = embed_query(sentences, embedder=model) embed_vector = np.array([e.embedding for e in embedddings]) diff --git a/examples/text.py b/examples/text.py index 15225727..296bff83 100644 --- a/examples/text.py +++ b/examples/text.py @@ -21,7 +21,7 @@ def embed_directory_example(): # Embed all files in a directory data: list[EmbedData] = embed_anything.embed_directory( - "bench", embeder=model, config=config + "bench", embedder=model, config=config ) # End timing @@ -39,7 +39,7 @@ def embed_query_example(): # Embed a query embeddings: EmbedData = embed_anything.embed_query( - ["Hello world my"], embeder=model, config=config + ["Hello world my"], embedder=model, config=config )[0] # Print the shape of the embedding @@ -48,7 +48,7 @@ def embed_query_example(): # Embed another query and print the result print( embed_anything.embed_query( - ["What is the capital of India?"], embeder=model, config=config + ["What is the capital of India?"], embedder=model, config=config ) ) @@ -62,7 +62,7 @@ def embed_file_example(): # Embed a single file data: list[EmbedData] = embed_anything.embed_file( - "test_files/bank.txt", embeder=model, config=config + "test_files/bank.txt", embedder=model, config=config ) # Print the embedded data diff --git a/examples/text_ocr.py b/examples/text_ocr.py index a0db094d..f01a68bf 100644 --- a/examples/text_ocr.py +++ b/examples/text_ocr.py @@ -22,7 +22,7 @@ data: list[EmbedData] = embed_anything.embed_file( "/home/akshay/projects/starlaw/src-server/test_files/court.pdf", # Replace with your file path - embeder=model, + embedder=model, config=config, ) end = time() diff --git a/examples/web.py b/examples/web.py index e877e084..dcb55d88 100644 --- a/examples/web.py +++ b/examples/web.py @@ -1,3 +1,3 @@ import embed_anything -data = embed_anything.embed_webpage("https://www.akshaymakes.com/", embeder="Bert") +data = embed_anything.embed_webpage("https://www.akshaymakes.com/", embedder="Bert") diff --git a/python/python/embed_anything/__init__.py b/python/python/embed_anything/__init__.py index 95f0f29f..4c8506b5 100644 --- a/python/python/embed_anything/__init__.py +++ b/python/python/embed_anything/__init__.py @@ -21,7 +21,7 @@ model = EmbeddingModel.from_pretrained_local( WhichModel.Bert, model_id="Hugging_face_link" ) -data = embed_anything.embed_file("test_files/test.pdf", embeder=model) +data = embed_anything.embed_file("test_files/test.pdf", embedder=model) #For images @@ -30,11 +30,11 @@ model_id="openai/clip-vit-base-patch16", # revision="refs/pr/15", ) -data: list[EmbedData] = embed_anything.embed_directory("test_files", embeder=model) +data: list[EmbedData] = embed_anything.embed_directory("test_files", embedder=model) embeddings = np.array([data.embedding for data in data]) query = ["Photo of a monkey?"] query_embedding = np.array( - embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) # For audio files from embed_anything import ( @@ -47,7 +47,7 @@ audio_decoder = AudioDecoderModel.from_pretrained_hf( "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) -embeder = EmbeddingModel.from_pretrained_hf( +embedder = EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -56,7 +56,7 @@ data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) @@ -98,7 +98,7 @@ data = embed_anything.embed_image_directory( "test_files", - embeder=clip_model, + embedder=clip_model, adapter=pinecone_adapter, # config=embed_config, ``` diff --git a/python/python/embed_anything/_embed_anything.pyi b/python/python/embed_anything/_embed_anything.pyi index 50b32783..00290dd8 100644 --- a/python/python/embed_anything/_embed_anything.pyi +++ b/python/python/embed_anything/_embed_anything.pyi @@ -53,14 +53,14 @@ class Adapter(ABC): """ def embed_query( - query: list[str], embeder: EmbeddingModel, config: TextEmbedConfig | None = None + query: list[str], embedder: EmbeddingModel, config: TextEmbedConfig | None = None ) -> list[EmbedData]: """ Embeds the given query and returns a list of EmbedData objects. Args: query: The query to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. config: The configuration for the embedding model. Returns: @@ -80,7 +80,7 @@ def embed_query( def embed_file( file_path: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, config: TextEmbedConfig | None = None, adapter: Adapter | None = None, ) -> list[EmbedData]: @@ -89,7 +89,7 @@ def embed_file( Args: file_path: The path to the file to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings in a vector database. @@ -104,13 +104,13 @@ def embed_file( model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", ) - data = embed_anything.embed_file("test_files/test.pdf", embeder=model) + data = embed_anything.embed_file("test_files/test.pdf", embedder=model) ``` """ def embed_directory( file_path: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, extensions: list[str], config: TextEmbedConfig | None = None, adapter: Adapter | None = None, @@ -120,7 +120,7 @@ def embed_directory( Args: file_path: The path to the directory containing the files to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. extensions: The list of file extensions to consider for embedding. config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings in a vector database. @@ -136,13 +136,13 @@ def embed_directory( model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", ) - data = embed_anything.embed_directory("test_files", embeder=model, extensions=[".pdf"]) + data = embed_anything.embed_directory("test_files", embedder=model, extensions=[".pdf"]) ``` """ def embed_image_directory( file_path: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, config: ImageEmbedConfig | None = None, adapter: Adapter | None = None, ) -> list[EmbedData]: @@ -151,7 +151,7 @@ def embed_image_directory( Args: file_path: The path to the directory containing the images to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings in a vector database. @@ -161,7 +161,7 @@ def embed_image_directory( def embed_webpage( url: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, config: TextEmbedConfig | None, adapter: Adapter | None, ) -> list[EmbedData] | None: @@ -170,7 +170,7 @@ def embed_webpage( Args: url: The URL of the webpage to embed. - embeder: The name of the embedding model to use. Choose between "OpenAI", "Jina", "Bert" + embedder: The name of the embedding model to use. Choose between "OpenAI", "Jina", "Bert" config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings. @@ -185,7 +185,7 @@ def embed_webpage( openai_config=embed_anything.OpenAIConfig(model="text-embedding-3-small") ) data = embed_anything.embed_webpage( - "https://www.akshaymakes.com/", embeder="OpenAI", config=config + "https://www.akshaymakes.com/", embedder="OpenAI", config=config ) ``` """ @@ -193,7 +193,7 @@ def embed_webpage( def embed_audio_file( file_path: str, audio_decoder: AudioDecoderModel, - embeder: EmbeddingModel, + embedder: EmbeddingModel, text_embed_config: TextEmbedConfig | None = TextEmbedConfig( chunk_size=200, batch_size=32 ), @@ -204,7 +204,7 @@ def embed_audio_file( Args: file_path: The path to the audio file to embed. audio_decoder: The audio decoder model to use. - embeder: The embedding model to use. + embedder: The embedding model to use. text_embed_config: The configuration for the embedding model. Returns: @@ -218,7 +218,7 @@ def embed_audio_file( "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) - embeder = embed_anything.EmbeddingModel.from_pretrained_hf( + embedder = embed_anything.EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -228,7 +228,7 @@ def embed_audio_file( data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) ``` diff --git a/python/src/lib.rs b/python/src/lib.rs index 3fd6b1e2..d5afba28 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -353,14 +353,14 @@ impl AudioDecoderModel { } #[pyfunction] -#[pyo3(signature = (query, embeder, config=None))] +#[pyo3(signature = (query, embedder, config=None))] pub fn embed_query( query: Vec, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::TextEmbedConfig>, ) -> PyResult> { let config = config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); Ok(rt.block_on(async { embed_anything::embed_query( @@ -378,15 +378,15 @@ pub fn embed_query( } #[pyfunction] -#[pyo3(signature = (file_name, embeder, config=None, adapter=None))] +#[pyo3(signature = (file_name, embedder, config=None, adapter=None))] pub fn embed_file( file_name: &str, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::TextEmbedConfig>, adapter: Option, ) -> PyResult>> { let config = config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); if !Path::new(file_name).exists() { // check if the file exists other wise return a "File not found" error with PyValueError @@ -437,15 +437,15 @@ pub fn embed_file( } #[pyfunction] -#[pyo3(signature = (audio_file, audio_decoder, embeder, text_embed_config=None))] +#[pyo3(signature = (audio_file, audio_decoder, embedder, text_embed_config=None))] pub fn embed_audio_file( audio_file: String, audio_decoder: &mut AudioDecoderModel, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, text_embed_config: Option<&config::TextEmbedConfig>, ) -> PyResult>> { let config = text_embed_config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let audio_decoder = &mut audio_decoder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); let data = rt.block_on(async { @@ -463,16 +463,16 @@ pub fn embed_audio_file( } #[pyfunction] -#[pyo3(signature = (directory, embeder, extensions=None, config=None, adapter = None))] +#[pyo3(signature = (directory, embedder, extensions=None, config=None, adapter = None))] pub fn embed_directory( directory: PathBuf, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, extensions: Option>, config: Option<&config::TextEmbedConfig>, adapter: Option, ) -> PyResult>> { let config = config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); println!("Runtime created"); @@ -517,14 +517,14 @@ pub fn embed_directory( } #[pyfunction] -#[pyo3(signature = (directory, embeder, config=None, adapter = None))] +#[pyo3(signature = (directory, embedder, config=None, adapter = None))] pub fn embed_image_directory( directory: PathBuf, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::ImageEmbedConfig>, adapter: Option, ) -> PyResult>> { - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let config = config.map(|c| &c.inner); let rt = Builder::new_multi_thread().enable_all().build().unwrap(); println!("Runtime created"); @@ -563,14 +563,14 @@ pub fn embed_image_directory( Ok(data) } #[pyfunction] -#[pyo3(signature = (url, embeder, config=None, adapter = None))] +#[pyo3(signature = (url, embedder, config=None, adapter = None))] pub fn embed_webpage( url: String, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::TextEmbedConfig>, adapter: Option, ) -> PyResult>> { - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let config = config.map(|c| &c.inner); let rt = Builder::new_multi_thread().enable_all().build().unwrap(); let adapter = match adapter { diff --git a/rust/examples/web_embed.rs b/rust/examples/web_embed.rs index 3bdfa668..4adf5eea 100644 --- a/rust/examples/web_embed.rs +++ b/rust/examples/web_embed.rs @@ -13,7 +13,7 @@ async fn main() { let start_time = std::time::Instant::now(); let url = "https://www.scrapingbee.com/blog/web-scraping-rust/".to_string(); - let embeder = Arc::new( + let embedder = Arc::new( Embedder::from_pretrained_hf("bert", "sentence-transformers/all-MiniLM-L6-v2", None) .unwrap(), ); @@ -23,11 +23,11 @@ async fn main() { .with_batch_size(32) .with_buffer_size(100) .with_splitting_strategy(SplittingStrategy::Sentence) - .with_semantic_encoder(Arc::clone(&embeder)); + .with_semantic_encoder(Arc::clone(&embedder)); let embed_data = embed_webpage( url, - &embeder, + &embedder, Some(&embed_config), None::)>, ) @@ -48,7 +48,7 @@ async fn main() { .unwrap(); let query = vec!["Rust for web scraping".to_string()]; - let query_embedding: Vec = embed_query(query, &embeder, Some(&embed_config)) + let query_embedding: Vec = embed_query(query, &embedder, Some(&embed_config)) .await .unwrap() .iter() diff --git a/rust/src/embeddings/embed.rs b/rust/src/embeddings/embed.rs index 8dcf1759..72762fa3 100644 --- a/rust/src/embeddings/embed.rs +++ b/rust/src/embeddings/embed.rs @@ -99,10 +99,10 @@ impl TextEmbedder { batch_size: Option, ) -> Result, anyhow::Error> { match self { - TextEmbedder::OpenAI(embeder) => embeder.embed(text_batch).await, - TextEmbedder::Cohere(embeder) => embeder.embed(text_batch).await, - TextEmbedder::Jina(embeder) => embeder.embed(text_batch, batch_size), - TextEmbedder::Bert(embeder) => embeder.embed(text_batch, batch_size), + TextEmbedder::OpenAI(embedder) => embedder.embed(text_batch).await, + TextEmbedder::Cohere(embedder) => embedder.embed(text_batch).await, + TextEmbedder::Jina(embedder) => embedder.embed(text_batch, batch_size), + TextEmbedder::Bert(embedder) => embedder.embed(text_batch, batch_size), } } @@ -142,7 +142,7 @@ impl TextEmbedder { } } - /// Creates a new instance of a cloud api based `Embeder` with the specified model and API key. + /// Creates a new instance of a cloud api based `Embedder` with the specified model and API key. /// /// # Arguments /// @@ -159,7 +159,7 @@ impl TextEmbedder { /// /// # Returns /// - /// A new instance of `Embeder`. + /// A new instance of `Embedder`. pub fn from_pretrained_cloud( model: &str, model_id: &str, @@ -239,8 +239,8 @@ impl Embedder { batch_size: Option, ) -> Result, anyhow::Error> { match self { - Self::Text(embeder) => embeder.embed(text_batch, batch_size).await, - Self::Vision(embeder) => embeder.embed(text_batch, batch_size), + Self::Text(embedder) => embedder.embed(text_batch, batch_size).await, + Self::Vision(embedder) => embedder.embed(text_batch, batch_size), } } @@ -302,7 +302,7 @@ impl EmbedImage for Embedder { metadata: Option>, ) -> anyhow::Result { match self { - Self::Vision(embeder) => embeder.embed_image(image_path, metadata), + Self::Vision(embedder) => embedder.embed_image(image_path, metadata), _ => Err(anyhow::anyhow!("Model not supported for vision embedding")), } } @@ -312,7 +312,7 @@ impl EmbedImage for Embedder { image_paths: &[T], ) -> anyhow::Result> { match self { - Self::Vision(embeder) => embeder.embed_image_batch(image_paths), + Self::Vision(embedder) => embedder.embed_image_batch(image_paths), _ => Err(anyhow::anyhow!("Model not supported for vision embedding")), } } @@ -333,8 +333,8 @@ impl TextEmbed for VisionEmbedder { batch_size: Option, ) -> Result, anyhow::Error> { match self { - Self::Clip(embeder) => embeder.embed(text_batch, batch_size), - Self::ColPali(embeder) => embeder.embed(text_batch, batch_size), + Self::Clip(embedder) => embedder.embed(text_batch, batch_size), + Self::ColPali(embedder) => embedder.embed(text_batch, batch_size), } } } @@ -358,9 +358,9 @@ impl EmbedImage for VisionEmbedder { metadata: Option>, ) -> anyhow::Result { match self { - Self::Clip(embeder) => embeder.embed_image(image_path, metadata), - Self::ColPali(embeder) => { - embeder.embed_image(PathBuf::from(image_path.as_ref()), metadata) + Self::Clip(embedder) => embedder.embed_image(image_path, metadata), + Self::ColPali(embedder) => { + embedder.embed_image(PathBuf::from(image_path.as_ref()), metadata) } } } @@ -370,8 +370,8 @@ impl EmbedImage for VisionEmbedder { image_paths: &[T], ) -> anyhow::Result> { match self { - Self::Clip(embeder) => embeder.embed_image_batch(image_paths), - Self::ColPali(embeder) => embeder.embed_image_batch( + Self::Clip(embedder) => embedder.embed_image_batch(image_paths), + Self::ColPali(embedder) => embedder.embed_image_batch( &image_paths .iter() .map(|p| PathBuf::from(p.as_ref())) diff --git a/rust/src/embeddings/local/clip.rs b/rust/src/embeddings/local/clip.rs index 2f84ca43..db73f001 100644 --- a/rust/src/embeddings/local/clip.rs +++ b/rust/src/embeddings/local/clip.rs @@ -288,13 +288,13 @@ mod tests { // Tests the tokenize_sequences method. #[test] fn test_tokenize_sequences() { - let clip_embeder = ClipEmbedder::default(); + let clip_embedder = ClipEmbedder::default(); let sequences = Some(vec![ "Hey there how are you?".to_string(), "EmbedAnything is the best!".to_string(), ]); - let (input_ids, vec_seq) = clip_embeder - .tokenize_sequences(sequences, &clip_embeder.tokenizer) + let (input_ids, vec_seq) = clip_embedder + .tokenize_sequences(sequences, &clip_embedder.tokenizer) .unwrap(); assert_eq!( vec_seq, @@ -309,8 +309,8 @@ mod tests { // Tests the load_image method. #[test] fn test_load_image() { - let clip_embeder = ClipEmbedder::default(); - let image = clip_embeder + let clip_embedder = ClipEmbedder::default(); + let image = clip_embedder .load_image("test_files/clip/cat1.jpg", 224) .unwrap(); assert_eq!(image.shape().clone().into_dims(), &[3, 224, 224]); @@ -319,8 +319,8 @@ mod tests { // Tests the load_images method. #[test] fn test_load_images() { - let clip_embeder = ClipEmbedder::default(); - let images = clip_embeder + let clip_embedder = ClipEmbedder::default(); + let images = clip_embedder .load_images( &["test_files/clip/cat1.jpg", "test_files/clip/cat2.jpeg"], 224, @@ -332,8 +332,8 @@ mod tests { // Tests the embed_image_batch method. #[test] fn test_embed_image_batch() { - let clip_embeder = ClipEmbedder::default(); - let embeddings = clip_embeder + let clip_embedder = ClipEmbedder::default(); + let embeddings = clip_embedder .embed_image_batch(&["test_files/clip/cat1.jpg", "test_files/clip/cat2.jpeg"]) .unwrap(); assert_eq!(embeddings.len(), 2); diff --git a/rust/src/embeddings/local/jina.rs b/rust/src/embeddings/local/jina.rs index ee4c252b..98c3a4b5 100644 --- a/rust/src/embeddings/local/jina.rs +++ b/rust/src/embeddings/local/jina.rs @@ -332,10 +332,10 @@ mod tests { #[test] fn test_embed() { - let embeder = JinaEmbedder::new("jinaai/jina-embeddings-v2-small-en", None).unwrap(); + let embedder = JinaEmbedder::new("jinaai/jina-embeddings-v2-small-en", None).unwrap(); let text_batch = vec!["Hello, world!".to_string()]; - let encodings = embeder.embed(&text_batch, None).unwrap(); + let encodings = embedder.embed(&text_batch, None).unwrap(); println!("{:?}", encodings); } } diff --git a/rust/src/embeddings/mod.rs b/rust/src/embeddings/mod.rs index 73e9f639..4215ac57 100644 --- a/rust/src/embeddings/mod.rs +++ b/rust/src/embeddings/mod.rs @@ -64,13 +64,13 @@ pub fn text_batch_from_audio(segments: &[Segment]) -> Vec { } pub async fn embed_audio>( - embeder: &Embedder, + embedder: &Embedder, segments: Vec, audio_file: T, batch_size: Option, ) -> Result, anyhow::Error> { let text_batch = text_batch_from_audio(&segments); - let encodings = embeder.embed(&text_batch, batch_size).await?; + let encodings = embedder.embed(&text_batch, batch_size).await?; get_audio_metadata(encodings, segments, audio_file) } diff --git a/rust/src/file_processor/html_processor.rs b/rust/src/file_processor/html_processor.rs index e5ba21a1..fa0db810 100644 --- a/rust/src/file_processor/html_processor.rs +++ b/rust/src/file_processor/html_processor.rs @@ -21,7 +21,7 @@ pub struct HtmlDocument { impl HtmlDocument { pub async fn embed_webpage( &self, - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -33,7 +33,7 @@ impl HtmlDocument { self.embed_tag( "p", paragraphs, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -47,7 +47,7 @@ impl HtmlDocument { self.embed_tag( "h1", headers, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -61,7 +61,7 @@ impl HtmlDocument { self.embed_tag( "code", codes, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -77,7 +77,7 @@ impl HtmlDocument { &self, tag: &str, tag_content: &[String], - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -113,7 +113,7 @@ impl HtmlDocument { let metadata_hashmap: HashMap = serde_json::from_value(metadata)?; - let encodings = embeder.embed(&chunks, batch_size).await?; + let encodings = embedder.embed(&chunks, batch_size).await?; let embeddings = get_text_metadata(&Rc::new(encodings), &chunks, &Some(metadata_hashmap))?; embed_data.extend(embeddings); diff --git a/rust/src/file_processor/website_processor.rs b/rust/src/file_processor/website_processor.rs index 2a53f6a6..e704b7af 100644 --- a/rust/src/file_processor/website_processor.rs +++ b/rust/src/file_processor/website_processor.rs @@ -28,7 +28,7 @@ pub struct WebPage { impl WebPage { pub async fn embed_webpage( &self, - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -40,7 +40,7 @@ impl WebPage { self.embed_tag( "p", paragraphs, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -54,7 +54,7 @@ impl WebPage { self.embed_tag( "h1", headers, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -68,7 +68,7 @@ impl WebPage { self.embed_tag( "code", codes, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -84,7 +84,7 @@ impl WebPage { &self, tag: &str, tag_content: &[String], - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -120,7 +120,7 @@ impl WebPage { let metadata_hashmap: HashMap = serde_json::from_value(metadata)?; - let encodings = embeder.embed(&chunks, batch_size).await?; + let encodings = embedder.embed(&chunks, batch_size).await?; let embeddings = get_text_metadata(&Rc::new(encodings), &chunks, &Some(metadata_hashmap))?; embed_data.extend(embeddings); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 26d4106d..0b389c52 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -29,7 +29,7 @@ use tokio::sync::mpsc; // Add this at the top of your file /// # Arguments /// /// * `query` - A vector of strings representing the queries to embed. -/// * `embeder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". +/// * `embedder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". /// * `config` - An optional `EmbedConfig` object specifying the configuration for the embedding model. /// * 'adapter' - An optional `Adapter` object to send the embeddings to a vector database. /// @@ -47,16 +47,16 @@ use tokio::sync::mpsc; // Add this at the top of your file /// use embed_anything::embed_query; /// /// let query = vec!["Hello".to_string(), "World".to_string()]; -/// let embeder = "OpenAI"; +/// let embedder = "OpenAI"; /// let openai_config = OpenAIConfig{ model: Some("text-embedding-3-small".to_string()), api_key: None, chunk_size: Some(256) }; /// let config = EmbedConfig{ openai: Some(openai_config), ..Default::default() }; -/// let embeddings = embed_query(query, embeder).unwrap(); +/// let embeddings = embed_query(query, embedder).unwrap(); /// println!("{:?}", embeddings); /// ``` /// This will output the embeddings of the queries using the OpenAI embedding model. pub async fn embed_query( query: Vec, - embeder: &Embedder, + embedder: &Embedder, config: Option<&TextEmbedConfig>, ) -> Result> { let binding = TextEmbedConfig::default(); @@ -64,7 +64,7 @@ pub async fn embed_query( let _chunk_size = config.chunk_size.unwrap_or(256); let batch_size = config.batch_size; - let encodings = embeder.embed(&query, batch_size).await.unwrap(); + let encodings = embedder.embed(&query, batch_size).await.unwrap(); let embeddings = get_text_metadata(&Rc::new(encodings), &query, &None)?; Ok(embeddings) @@ -75,7 +75,7 @@ pub async fn embed_query( /// # Arguments /// /// * `file_name` - A string specifying the name of the file to embed. -/// * `embeder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". +/// * `embedder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". /// * `config` - An optional `EmbedConfig` object specifying the configuration for the embedding model. /// * 'adapter' - An optional `Adapter` object to send the embeddings to a vector database. /// @@ -93,14 +93,14 @@ pub async fn embed_query( /// use embed_anything::embed_file; /// /// let file_name = "test_files/test.pdf"; -/// let embeder = "Bert"; +/// let embedder = "Bert"; /// let bert_config = BertConfig{ model_id: Some("sentence-transformers/all-MiniLM-L12-v2".to_string()), revision: None, chunk_size: Some(256) }; -/// let embeddings = embed_file(file_name, embeder, config).unwrap(); +/// let embeddings = embed_file(file_name, embedder, config).unwrap(); /// ``` /// This will output the embeddings of the file using the OpenAI embedding model. pub async fn embed_file, F>( file_name: T, - embeder: &Embedder, + embedder: &Embedder, config: Option<&TextEmbedConfig>, adapter: Option, ) -> Result>> @@ -118,11 +118,11 @@ where let semantic_encoder = config.semantic_encoder.clone(); let use_ocr = config.use_ocr.unwrap_or(false); - match embeder { - Embedder::Text(embeder) => { + match embedder { + Embedder::Text(embedder) => { emb_text( file_name, - embeder, + embedder, Some(chunk_size), Some(overlap_ratio), batch_size, @@ -133,7 +133,7 @@ where ) .await } - Embedder::Vision(embeder) => Ok(Some(vec![emb_image(file_name, embeder).unwrap()])), + Embedder::Vision(embedder) => Ok(Some(vec![emb_image(file_name, embedder).unwrap()])), } } @@ -141,7 +141,7 @@ where /// /// # Arguments /// -/// * `embeder` - The embedding model to use. Supported options are "OpenAI", "Jina", and "Bert". +/// * `embedder` - The embedding model to use. Supported options are "OpenAI", "Jina", and "Bert". /// * `webpage` - The webpage to embed. /// /// # Returns @@ -155,15 +155,15 @@ where /// # Example /// /// ``` -/// let embeddings = match embeder { +/// let embeddings = match embedder { /// "OpenAI" => webpage -/// .embed_webpage(&embedding_model::openai::OpenAIEmbeder::default()) +/// .embed_webpage(&embedding_model::openai::OpenAIEmbedder::default()) /// .unwrap(), /// "Jina" => webpage -/// .embed_webpage(&embedding_model::jina::JinaEmbeder::default()) +/// .embed_webpage(&embedding_model::jina::JinaEmbedder::default()) /// .unwrap(), /// "Bert" => webpage -/// .embed_webpage(&embedding_model::bert::BertEmbeder::default()) +/// .embed_webpage(&embedding_model::bert::BertEmbedder::default()) /// .unwrap(), /// _ => { /// return Err(PyValueError::new_err( @@ -174,7 +174,7 @@ where /// ``` pub async fn embed_webpage( url: String, - embeder: &Embedder, + embedder: &Embedder, config: Option<&TextEmbedConfig>, // Callback function adapter: Option, @@ -185,7 +185,7 @@ where let website_processor = file_processor::website_processor::WebsiteProcessor::new(); let webpage = website_processor.process_website(url.as_ref())?; - // if let Embeder::Clip(_) = embeder { + // if let Embedder::Clip(_) = embedder { // return Err(anyhow!("Clip model does not support webpage embedding")); // } @@ -196,7 +196,7 @@ where let batch_size = config.batch_size; let embeddings = webpage - .embed_webpage(embeder, chunk_size, overlap_ratio, batch_size) + .embed_webpage(embedder, chunk_size, overlap_ratio, batch_size) .await?; // Send embeddings to vector database @@ -327,12 +327,12 @@ fn emb_image>( pub async fn emb_audio>( audio_file: T, audio_decoder: &mut AudioDecoderModel, - embeder: &Arc, + embedder: &Arc, text_embed_config: Option<&TextEmbedConfig>, ) -> Result>> { let segments: Vec = audio_decoder.process_audio(&audio_file).unwrap(); let embeddings = embed_audio( - embeder, + embedder, segments, audio_file, text_embed_config @@ -349,7 +349,7 @@ pub async fn emb_audio>( /// # Arguments /// /// * `directory` - A `PathBuf` representing the directory containing the images to embed. -/// * `embeder` - A reference to the embedding model to use. +/// * `embedder` - A reference to the embedding model to use. /// * `config` - An optional `ImageEmbedConfig` object specifying the configuration for the embedding model. Default buffer size is 100. /// * `adapter` - An optional callback function to handle the embeddings. /// @@ -367,8 +367,8 @@ pub async fn emb_audio>( /// use std::sync::Arc; /// /// let directory = PathBuf::from("/path/to/directory"); -/// let embeder = Arc::new(Embeder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); -/// let embeddings = embed_image_directory(directory, &embeder, None).await.unwrap(); +/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); +/// let embeddings = embed_image_directory(directory, &embedder, None).await.unwrap(); /// ``` /// This will output the embeddings of the images in the specified directory using the specified embedding model. /// @@ -392,7 +392,7 @@ where let (tx, mut rx) = mpsc::unbounded_channel(); let (collector_tx, mut collector_rx) = mpsc::unbounded_channel(); - let embeder = embedding_model.clone(); + let embedder = embedding_model.clone(); let pb = indicatif::ProgressBar::new(file_parser.files.len() as u64); pb.set_style( @@ -412,8 +412,8 @@ where image_buffer.push(image); if image_buffer.len() == buffer_size { - // Ensure embeder is mutable and not wrapped in Arc - match process_images(&image_buffer, embeder.clone()).await { + // Ensure embedder is mutable and not wrapped in Arc + match process_images(&image_buffer, embedder.clone()).await { Ok(embeddings) => { let files = embeddings .iter() @@ -441,7 +441,7 @@ where // Process any remaining images if !image_buffer.is_empty() { - match process_images(&image_buffer, embeder).await { + match process_images(&image_buffer, embedder).await { Ok(embeddings) => { let files = embeddings .iter() @@ -494,9 +494,9 @@ where async fn process_images( image_buffer: &[String], - embeder: Arc, + embedder: Arc, ) -> Result>> { - let embeddings = embeder.embed_image_batch(image_buffer)?; + let embeddings = embedder.embed_image_batch(image_buffer)?; Ok(Arc::new(embeddings)) } @@ -505,7 +505,7 @@ async fn process_images( /// # Arguments /// /// * `directory` - A `PathBuf` representing the directory containing the files to embed. -/// * `embeder` - A reference to the embedding model to use. +/// * `embedder` - A reference to the embedding model to use. /// * `extensions` - An optional vector of strings representing the file extensions to consider for embedding. If `None`, all files in the directory will be considered. /// * `config` - An optional `TextEmbedConfig` object specifying the configuration for the embedding model. /// * `adapter` - An optional callback function to handle the embeddings. @@ -524,15 +524,15 @@ async fn process_images( /// use std::sync::Arc; /// /// let directory = PathBuf::from("/path/to/directory"); -/// let embeder = Arc::new(Embeder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); +/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); /// let config = Some(TextEmbedConfig::default()); /// let extensions = Some(vec!["txt".to_string(), "pdf".to_string()]); -/// let embeddings = embed_directory_stream(directory, &embeder, extensions, config, None).await.unwrap(); +/// let embeddings = embed_directory_stream(directory, &embedder, extensions, config, None).await.unwrap(); /// ``` /// This will output the embeddings of the files in the specified directory using the specified embedding model. pub async fn embed_directory_stream( directory: PathBuf, - embeder: &Arc, + embedder: &Arc, extensions: Option>, config: Option<&TextEmbedConfig>, adapter: Option, @@ -555,7 +555,7 @@ where let (tx, mut rx) = mpsc::unbounded_channel(); let (collector_tx, mut collector_rx) = mpsc::unbounded_channel(); - let embeder = embeder.clone(); + let embedder = embedder.clone(); let pb = indicatif::ProgressBar::new(files.len() as u64); pb.set_style( indicatif::ProgressStyle::with_template( @@ -576,7 +576,7 @@ where metadata_buffer.push(metadata); if chunk_buffer.len() == buffer_size { - match process_chunks(&chunk_buffer, &metadata_buffer, &embeder, batch_size) + match process_chunks(&chunk_buffer, &metadata_buffer, &embedder, batch_size) .await { Ok(embeddings) => { @@ -607,7 +607,7 @@ where // Process any remaining chunks if !chunk_buffer.is_empty() { - match process_chunks(&chunk_buffer, &metadata_buffer, &embeder, batch_size).await { + match process_chunks(&chunk_buffer, &metadata_buffer, &embedder, batch_size).await { Ok(embeddings) => { let files = embeddings .iter() diff --git a/rust/src/text_loader.rs b/rust/src/text_loader.rs index 5e8eb2b9..412ba826 100644 --- a/rust/src/text_loader.rs +++ b/rust/src/text_loader.rs @@ -104,11 +104,11 @@ impl TextLoader { .map(|chunk| chunk.to_string()) .collect(), SplittingStrategy::Semantic => { - let embeder = semantic_encoder.unwrap_or(Arc::new(Embedder::Text( + let embedder = semantic_encoder.unwrap_or(Arc::new(Embedder::Text( TextEmbedder::Jina(Box::new(JinaEmbedder::default())), ))); let chunker = StatisticalChunker { - encoder: embeder, + encoder: embedder, ..Default::default() }; @@ -211,10 +211,10 @@ mod tests { } #[test] - fn test_image_embeder() { + fn test_image_embedder() { let file_path = PathBuf::from("test_files/clip/cat1.jpg"); - let embeder = ClipEmbedder::default(); - let emb_data = embeder.embed_image(file_path, None).unwrap(); + let embedder = ClipEmbedder::default(); + let emb_data = embedder.embed_image(file_path, None).unwrap(); assert_eq!(emb_data.embedding.to_dense().unwrap().len(), 512); } } diff --git a/tests/model_tests/test_adapter.py b/tests/model_tests/test_adapter.py index cf96b09e..6d7225dd 100644 --- a/tests/model_tests/test_adapter.py +++ b/tests/model_tests/test_adapter.py @@ -7,13 +7,13 @@ def test_adapter_upsert_call_file( ): assert ( embed_anything.embed_file( - test_pdf_file, embeder=bert_model, adapter=dummy_adapter + test_pdf_file, embedder=bert_model, adapter=dummy_adapter ) is None ) assert ( embed_anything.embed_file( - test_txt_file, embeder=bert_model, adapter=dummy_adapter + test_txt_file, embedder=bert_model, adapter=dummy_adapter ) is None ) @@ -22,7 +22,7 @@ def test_adapter_upsert_call_file( def test_adapter_upsert_call_directory(bert_model, dummy_adapter, test_files_directory): assert ( embed_anything.embed_directory( - test_files_directory, embeder=bert_model, adapter=dummy_adapter + test_files_directory, embedder=bert_model, adapter=dummy_adapter ) is None )