praekeltfoundation · TatendaMugadza · Jan 11, 2024 · Jan 15, 2024 · Jan 15, 2024
diff --git a/README.md b/README.md
@@ -128,5 +128,9 @@ Pull requests are welcome. For major changes, please open an issue first to disc
 
 Please make sure to update tests as appropriate.
 
+## Content Embeddings
+If you want to generate content embeddings, define `LOAD_TRANSFORMER_MODEL` in your environment and set it to `True`.
+NB: This causes the model to be loaded into memory and can significantly increase resource requirements.
+
 ## License
 [MIT](https://choosealicense.com/licenses/mit/)
diff --git a/home/tests/test_api.py b/home/tests/test_api.py
@@ -110,6 +110,32 @@ def test_tag_filtering(self, uclient):
         content = json.loads(response.content)
         assert content["count"] == 2
 
+    def test_search_aaq_filtering(self, uclient):
+        """
+        If an s filter is provided, only pages with a matching search term are
+        returned, and only when a platform filter is also provided.
+        """
+        page1 = ContentPage.objects.first()
+        page1.enable_whatsapp = True
+        page1.save_revision().publish()
+        # it should return 1 page for a correct search term
+        response = uclient.get("/api/v2/pages/?s=help&whatsapp=true")
+        content = json.loads(response.content)
+        assert content["count"] == 1
+        # it should return 0 pages for a meaningless search term ("%23" is an encoded "#")
+        response = uclient.get("/api/v2/pages/?s=%23&whatsapp=true")
+        content = json.loads(response.content)
+        assert content["count"] == 0
+        # it should return 0 pages for a correct search term if no platform is provided
+        response = uclient.get("/api/v2/pages/?s=help")
+        content = json.loads(response.content)
+        assert content["count"] == 0
+        # it should not return pages matching the search term once they are unpublished
+        page1.unpublish()
+        response = uclient.get("/api/v2/pages/?s=help&whatsapp=true")
+        content = json.loads(response.content)
+        assert content["count"] == 0
+
     def test_platform_filtering(self, uclient):
         """
         If a platform filter is provided, only pages with content for that

diff --git a/home/word_embedding.py b/home/word_embedding.py
@@ -31,7 +31,7 @@ def retrieve_top_n_content_pieces(
         )  # Replace with your cosine similarity calculation
         documents_retrieved.append((page.pk, page.title, page.body, similarity_score))
     documents_retrieved = sorted(documents_retrieved, key=lambda x: x[3], reverse=True)
-    content_retrieved = [doc[0] for doc in documents_retrieved[0:n]]
+    content_retrieved = [doc[0] for doc in documents_retrieved[0:n] if doc[3] >= 0.25]
     return content_retrieved
 
 
@@ -70,16 +70,15 @@ def preprocess_content_for_embedding(content):
         extract = " ".join(extract)
         content = content.replace(url, extract)
     content = (
-        "".join(content.split("*", 2)[2:])
+        content
         .replace("\n\n", " ")
         .replace("\n", " ")
         .replace("  ", " ")
         .replace("*", "")
     )  # Remove content piece title
     if len(content) < 2:
         return content
-    if content[0] == " ":  # Remove space trailing content title
-        content = content[1:]
+    content = content.lstrip().rstrip()  # Remove spaces leading/trailing content
     emoji_pattern = re.compile(
         "["
         "\U0001F600-\U0001F64F"  # emoticons