update notebook

aodn · Nov 5, 2024 · 1ff7260 · 1ff7260
1 parent 248932f
commit 1ff7260
Show file tree

Hide file tree

Showing 4 changed files with 133 additions and 39 deletions.
diff --git a/.gitignore b/.gitignore
@@ -84,5 +84,4 @@ esManager.ini
 .idea
 node_modules/
 
-**/temp
 **/output
diff --git a/data_discovery_ai/utils/preprocessor.py b/data_discovery_ai/utils/preprocessor.py
@@ -20,6 +20,7 @@
 from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
 from tqdm import tqdm
 from pathlib import Path
+from typing import Dict
 
 # Base directory where your Poetry project's pyproject.toml is located
 BASE_DIR = Path(__file__).resolve().parent.parent
@@ -29,6 +30,33 @@
 logger.setLevel(logging.INFO)
 
 
+class Concept:
+    """A vocabulary concept described by an id, a URL and a vocabulary type.
+
+    Two instances compare equal when all three fields are equal, and the hash
+    is computed from the same triple, so equal concepts collapse to a single
+    entry when placed in a ``set`` or used as ``dict`` keys.
+    """
+
+    def __init__(self, id, url, vocab_type) -> None:
+        # NOTE(review): all three values are presumably strings taken from a
+        # keyword record -- confirm against the callers.
+        self.id = id
+        self.url = url
+        self.vocab_type = vocab_type
+
+    def to_json(self) -> Dict[str, Any]:
+        """Return the concept as a dict with keys vocab_type, value and url."""
+        return {
+            # ``vocab_type`` is the attribute assigned in ``__init__``; the
+            # class has no ``title`` attribute, so reading one would raise
+            # AttributeError.
+            "vocab_type": self.vocab_type,
+            "value": self.id,
+            "url": self.url,
+        }
+
+    def __eq__(self, other: object) -> bool:
+        """Compare by (id, url, vocab_type); defer for non-Concept operands."""
+        if not isinstance(other, Concept):
+            return NotImplemented
+
+        return (
+            self.id == other.id
+            and self.url == other.url
+            and self.vocab_type == other.vocab_type
+        )
+
+    def __hash__(self) -> int:
+        """Hash on the same fields that ``__eq__`` compares."""
+        return hash((self.id, self.url, self.vocab_type))
+
+
 def save_to_file(obj: Any, file_name: str) -> None:
     """
     Saves an object to a file using pickle serialization in the input folder.
@@ -65,7 +93,7 @@ def identify_sample(raw_data: pd.DataFrame, vocabs: List[str]) -> pd.DataFrame:
         ["_id", "_source.title", "_source.description", "_source.themes"]
     ]
     raw_data_cleaned.columns = ["id", "title", "description", "keywords"]
-
+    raw_data_cleaned["keywords"] = raw_data_cleaned["keywords"].apply(lambda k: eval(k))
     sampleSet = raw_data_cleaned[
         raw_data_cleaned["keywords"].apply(
             lambda terms: any(
@@ -216,7 +244,7 @@ def keywords_formatter(text: Union[str, List[dict]], vocabs: List[str]) -> List[
         text: Union[str, List[dict]]. The input keywords, expected to be a list of dictionaries; may also be passed as a string representation of that list.
         vocabs: List[str]. A list of vocabulary names to match against keyword titles.
     Output:
-        A list of formatted keywords, with duplicates removed, in the form `title:id`.
+        A list of formatted keywords, with duplicates removed, in the form `title;id`.
     """
     if type(text) is list:
         keywords = text
@@ -226,7 +254,10 @@ def keywords_formatter(text: Union[str, List[dict]], vocabs: List[str]) -> List[
     for keyword in keywords:
         for concept in keyword["concepts"]:
             if keyword["title"] in vocabs and concept["id"] != "":
-                concept_str = keyword["title"] + ":" + concept["id"]
+                keyword = Concept(
+                    id=concept["id"], url=concept["url"], vocab_type=keyword["title"]
+                )
+                concept_str = keyword.to_json()
                 k_list.append(concept_str)
     return list(set(k_list))
 

diff --git a/notebooks/KeywordClassificationNonTechNotebook.ipynb b/notebooks/KeywordClassificationNonTechNotebook.ipynb
diff --git a/notebooks/KeywordClassificationTechNotebook.ipynb b/notebooks/KeywordClassificationTechNotebook.ipynb
-Original file line number
+Diff line change
@@ Expand Up / @@ -84,5 +84,4 @@ esManager.ini @@
     .idea
     node_modules/
-    **/temp
     **/output