Skip to content

Commit

Permalink
Merge pull request #8 from souzadevinicius/souzadevinicius/issue7
Browse files Browse the repository at this point in the history
uPheno mappings should not contain cases where the source and the target are from the same ontology
  • Loading branch information
souzadevinicius authored Dec 13, 2023
2 parents 4b73d2f + 685fe81 commit 81d79dc
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 16 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "phenio-toolkit"
version = "0.1.4"
version = "0.1.5"
description = "phenio-toolkit"
authors = ["Vinicius de Souza <[email protected]>"]
license = "MIT"
Expand Down
39 changes: 24 additions & 15 deletions src/phenio_toolkit/mapping/lexical_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,19 @@ def __init__(
self.upheno_species_lexical = upheno_species_lexical
self.upheno_mapping_logical = upheno_mapping_logical
self.stopwords = stopwords
obo_converter = curies.get_obo_converter()
custom_converter = curies.Converter(
[
curies.Record(
prefix="MGPO",
prefix_synonyms=[],
uri_prefix="http://purl.obolibrary.org/obo/MGPO_",
uri_prefix_synonyms=[],
)
]
)
self.converter = curies.chain([obo_converter, custom_converter])


def _apply_stopword(self, label):
for stopword in self.stopwords:
Expand All @@ -67,6 +80,11 @@ def _load_upheno_mappings(self):
df_label.columns = ["iri", "label"]
return df, df_label, dfl

def _are_terms_from_same_ontology(self, row):
subject_prefix = row["subject_id"].split(":", maxsplit=1)[0]
object_prefix = row["object_id"].split(":", maxsplit=1)[0]
return subject_prefix == object_prefix

def _preprocess_labels(self, df):
df["label"] = df["label"].astype(str)
df["label_pp"] = df["label"].apply(lambda x: re.sub(r"[(][A-Z]+[)]", "", x))
Expand Down Expand Up @@ -101,7 +119,7 @@ def _compute_mappings(self, dd, l):
iris.extend(dd.get(lab))
done.add(lab)
iris = list(set(iris))
if len(iris) > 1:
if len(iris) > 1 :
pairs = _pairwise(iris)
for pair in pairs:
data.append([pair[0], pair[1]])
Expand Down Expand Up @@ -166,24 +184,12 @@ def generate_mapping_files(self, output):
df_m = df_m.drop(["iri", "cat_x", "cat_y"], axis=1)
df_m["cat"] = df_m["cat"].str.replace(r"(^nan-)|(-nan$)", "", regex=True)

obo_converter = curies.get_obo_converter()
custom_converter = curies.Converter(
[
curies.Record(
prefix="MGPO",
prefix_synonyms=[],
uri_prefix="http://purl.obolibrary.org/obo/MGPO_",
uri_prefix_synonyms=[],
)
]
)
converter = curies.chain([obo_converter, custom_converter])

df_m["subject_id"] = df_m.apply(
lambda x: converter.compress_or_standardize(x["p1"]), axis=1
lambda x: self.converter.compress_or_standardize(x["p1"]), axis=1
)

df_m["object_id"] = df_m.apply(lambda x: converter.compress_or_standardize(x["p2"]), axis=1)
df_m["object_id"] = df_m.apply(lambda x: self.converter.compress_or_standardize(x["p2"]), axis=1)

df_m["subject_source"] = df_m.apply(
lambda x: f"obo:{str(x['subject_id']).split(':', maxsplit=1)[0].lower()}", axis=1
Expand Down Expand Up @@ -225,4 +231,7 @@ def generate_mapping_files(self, output):
"mapping_justification",
]
]
df_m = df_m[~df_m.apply(self._are_terms_from_same_ontology, axis=1)]


df_m.to_csv(mapping_all, sep="\t", index=False)

0 comments on commit 81d79dc

Please sign in to comment.