Skip to content

Commit

Permalink
Merge pull request #8 from souzadevinicius/souzadevinicius/issue7
Browse files Browse the repository at this point in the history
uPheno mappings should not contain cases where the source and the target are from the same ontology
  • Loading branch information
souzadevinicius authored Dec 13, 2023
2 parents 4b73d2f + 685fe81 commit 81d79dc
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 16 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "phenio-toolkit"
version = "0.1.4"
version = "0.1.5"
description = "phenio-toolkit"
authors = ["Vinicius de Souza <[email protected]>"]
license = "MIT"
Expand Down
39 changes: 24 additions & 15 deletions src/phenio_toolkit/mapping/lexical_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,19 @@ def __init__(
self.upheno_species_lexical = upheno_species_lexical
self.upheno_mapping_logical = upheno_mapping_logical
self.stopwords = stopwords
obo_converter = curies.get_obo_converter()
custom_converter = curies.Converter(
[
curies.Record(
prefix="MGPO",
prefix_synonyms=[],
uri_prefix="http://purl.obolibrary.org/obo/MGPO_",
uri_prefix_synonyms=[],
)
]
)
self.converter = curies.chain([obo_converter, custom_converter])


def _apply_stopword(self, label):
for stopword in self.stopwords:
Expand All @@ -67,6 +80,11 @@ def _load_upheno_mappings(self):
df_label.columns = ["iri", "label"]
return df, df_label, dfl

def _are_terms_from_same_ontology(self, row):
subject_prefix = row["subject_id"].split(":", maxsplit=1)[0]
object_prefix = row["object_id"].split(":", maxsplit=1)[0]
return subject_prefix == object_prefix

def _preprocess_labels(self, df):
df["label"] = df["label"].astype(str)
df["label_pp"] = df["label"].apply(lambda x: re.sub(r"[(][A-Z]+[)]", "", x))
Expand Down Expand Up @@ -101,7 +119,7 @@ def _compute_mappings(self, dd, l):
iris.extend(dd.get(lab))
done.add(lab)
iris = list(set(iris))
if len(iris) > 1:
if len(iris) > 1 :
pairs = _pairwise(iris)
for pair in pairs:
data.append([pair[0], pair[1]])
Expand Down Expand Up @@ -166,24 +184,12 @@ def generate_mapping_files(self, output):
df_m = df_m.drop(["iri", "cat_x", "cat_y"], axis=1)
df_m["cat"] = df_m["cat"].str.replace(r"(^nan-)|(-nan$)", "", regex=True)

obo_converter = curies.get_obo_converter()
custom_converter = curies.Converter(
[
curies.Record(
prefix="MGPO",
prefix_synonyms=[],
uri_prefix="http://purl.obolibrary.org/obo/MGPO_",
uri_prefix_synonyms=[],
)
]
)
converter = curies.chain([obo_converter, custom_converter])

df_m["subject_id"] = df_m.apply(
lambda x: converter.compress_or_standardize(x["p1"]), axis=1
lambda x: self.converter.compress_or_standardize(x["p1"]), axis=1
)

df_m["object_id"] = df_m.apply(lambda x: converter.compress_or_standardize(x["p2"]), axis=1)
df_m["object_id"] = df_m.apply(lambda x: self.converter.compress_or_standardize(x["p2"]), axis=1)

df_m["subject_source"] = df_m.apply(
lambda x: f"obo:{str(x['subject_id']).split(':', maxsplit=1)[0].lower()}", axis=1
Expand Down Expand Up @@ -225,4 +231,7 @@ def generate_mapping_files(self, output):
"mapping_justification",
]
]
df_m = df_m[~df_m.apply(self._are_terms_from_same_ontology, axis=1)]


df_m.to_csv(mapping_all, sep="\t", index=False)

0 comments on commit 81d79dc

Please sign in to comment.