Skip to content

Commit

Permalink
update notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
shaunahu committed Nov 5, 2024
1 parent 248932f commit 1ff7260
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 39 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,4 @@ esManager.ini
.idea
node_modules/

**/temp
**/output
37 changes: 34 additions & 3 deletions data_discovery_ai/utils/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from tqdm import tqdm
from pathlib import Path
from typing import Dict

# Base directory where your Poetry project's pyproject.toml is located
BASE_DIR = Path(__file__).resolve().parent.parent
Expand All @@ -29,6 +30,33 @@
logger.setLevel(logging.INFO)


class Concept:
    """
    A single vocabulary concept attached to a record keyword.

    Instances compare equal, and hash identically, when their `id`, `url`
    and `vocab_type` all match, so they can be de-duplicated with a set or
    used as dictionary keys.

    Attributes:
        id: the concept identifier (presumably a string label; confirm with callers).
        url: the URL associated with the concept.
        vocab_type: the title of the vocabulary the concept belongs to.
    """

    def __init__(self, id, url, vocab_type) -> None:
        self.id = id
        self.url = url
        self.vocab_type = vocab_type

    def to_json(self) -> Dict[str, Any]:
        """
        Build a plain-dict representation of the concept.

        Output:
            A dictionary with the keys `vocab_type`, `value` (the concept id) and `url`.
        """
        return {
            # The vocabulary title is stored as `vocab_type`; this class has
            # no `title` attribute, so reading `self.title` here would raise
            # AttributeError.
            "vocab_type": self.vocab_type,
            "value": self.id,
            "url": self.url,
        }

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Concept):
            # Let Python try the reflected comparison / fall back to identity.
            return NotImplemented

        return (
            self.id == other.id
            and self.url == other.url
            and self.vocab_type == other.vocab_type
        )

    def __hash__(self) -> int:
        # Must stay in sync with the fields compared in __eq__.
        return hash((self.id, self.url, self.vocab_type))

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(id={self.id!r}, url={self.url!r}, "
            f"vocab_type={self.vocab_type!r})"
        )


def save_to_file(obj: Any, file_name: str) -> None:
"""
Saves an object to a file using pickle serialization in the input folder.
Expand Down Expand Up @@ -65,7 +93,7 @@ def identify_sample(raw_data: pd.DataFrame, vocabs: List[str]) -> pd.DataFrame:
["_id", "_source.title", "_source.description", "_source.themes"]
]
raw_data_cleaned.columns = ["id", "title", "description", "keywords"]

raw_data_cleaned["keywords"] = raw_data_cleaned["keywords"].apply(lambda k: eval(k))
sampleSet = raw_data_cleaned[
raw_data_cleaned["keywords"].apply(
lambda terms: any(
Expand Down Expand Up @@ -216,7 +244,7 @@ def keywords_formatter(text: Union[str, List[dict]], vocabs: List[str]) -> List[
text: Union[str, List[dict]. The input keywords, expected to be a list of dictionaries, can be passed as a string representation of the list.
vocabs: List[str]. A list of vocabulary names to match against keyword titles.
Output:
A list of formatted keywords, with duplicates removed, in the form `title:id`.
A list of formatted keywords, with duplicates removed, in the form `title;id`.
"""
if type(text) is list:
keywords = text
Expand All @@ -226,7 +254,10 @@ def keywords_formatter(text: Union[str, List[dict]], vocabs: List[str]) -> List[
for keyword in keywords:
for concept in keyword["concepts"]:
if keyword["title"] in vocabs and concept["id"] != "":
concept_str = keyword["title"] + ":" + concept["id"]
keyword = Concept(
id=concept["id"], url=concept["url"], vocab_type=keyword["title"]
)
concept_str = keyword.to_json()
k_list.append(concept_str)
return list(set(k_list))

Expand Down
44 changes: 9 additions & 35 deletions notebooks/KeywordClassificationNonTechNotebook.ipynb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

90 changes: 90 additions & 0 deletions notebooks/KeywordClassificationTechNotebook.ipynb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1ff7260

Please sign in to comment.