🔧 Make paragraph regex configurable from lookup path (#117)

uclh-criu · Apr 22, 2024 · d89ff7f
1 parent 57728d7
commit d89ff7f
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 21 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -31,7 +31,8 @@ jobs:
           pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl
       - name: run pytest
         run: pytest ./tests/*
-      - name: install ruff
-        run: pip install ruff
-      - name: ruff format
-        run: ruff format --check .
+      - name: Lint with Ruff
+        run: |
+          pip install ruff
+          ruff --output-format=github .
+        continue-on-error: true
diff --git a/src/miade/data/regex_para_chunk.csv b/lookup_data/regex_para_chunk.csv
@@ -1,10 +1,10 @@
 paragraph,regex
-prob,^(prob|probs|problem|problems|diag|diagnosis|diagnoses|issue|issues)( list|)
-pmh,^(hx|pmhx|pmh|medical background|past medical history|past med hist)
+prob,^(patient |)(current |final |hospital |active inpatient |complete |active |acute |inpatient |clinical |ongoing |in-patient |)(prob|probs|problem|problems|diag|diagnosis|diagnoses|issue|issues|this admission)( list|)
+pmh,^(hx|pmhx|pmh|background|medical background|past medical history|past psychiatric history|past surgical history|past issues this admission|past med hist|bg)
 med,^(home |current |active |outpatient |gp |current outpatient |)(med|meds|medications|drug|drugs|rx)
 allergy,^(drug |med |medication |)(allerg|allergies|allergies|allergies and intolerances|intolerances|adverse effects|adverse reactions|adverse reaction risk)
 history,^(pc|hpc|presenting complaint|history of presenting complaint|history|hist|synopsis|summary|clinical summary)
 exam,^(exam|examination|o/e|o / e|oe)
 ddx,^(diff|differential|differential diagnosis|ddx)
-imp,^(imp|impression|diagnosis|formulation|diag|dx)
+imp,^(imp|impression|diagnosis|formulation|diag|dx|psychiatric formulation|clinical summary impression|clinical summary / impression)
 plan,^(plan|recommendations|recommendation|action|actions|goal|goals|advice|decision)
diff --git a/src/miade/annotators.py b/src/miade/annotators.py
@@ -44,6 +44,9 @@ class AllergenType(Enum):
 
 
 def load_lookup_data(filename: str, as_dict: bool = False, no_header: bool = False):
+    if not os.path.exists(filename):
+        log.error(f"Lookup data not configured, check {filename} exists!")
+
     if as_dict:
         return (
             pd.read_csv(
@@ -59,6 +62,22 @@ def load_lookup_data(filename: str, as_dict: bool = False, no_header: bool = Fal
         return pd.read_csv(filename).drop_duplicates()
 
 
+def load_regex_paragraph_mappings(data: pd.DataFrame) -> Dict:
+    """Build a lookup from ``ParagraphType`` members to their regex values.
+
+    Every key yielded by ``data.items()`` is converted with ``ParagraphType(key)``.
+    Keys that are not valid ``ParagraphType`` values raise ``ValueError``; that
+    error is logged as a warning and the entry is left out of the result.
+
+    NOTE(review): the parameter is annotated as ``pd.DataFrame``, but the caller
+    in ``_load_paragraph_regex`` passes the result of
+    ``load_lookup_data(..., as_dict=True)`` -- confirm the intended type.
+
+    :param data: object whose ``items()`` yields (paragraph name, regex) pairs
+    :return: dict keyed by ``ParagraphType`` holding the corresponding regex
+    """
+    mappings = {}
+
+    for name, pattern in data.items():
+        try:
+            paragraph_type = ParagraphType(name)
+        except ValueError as err:
+            # Unknown paragraph name: report it and move on to the next entry.
+            log.warning(err)
+            continue
+
+        mappings[paragraph_type] = pattern
+
+    return mappings
+
+
 def load_allergy_type_combinations(filename: str) -> Dict:
     df = pd.read_csv(filename)
 
@@ -145,6 +164,8 @@ def __init__(self, cat: CAT, config: AnnotatorConfig = None):
         if self.config.negation_detection == "negex":
             self._add_negex_pipeline()
 
+        self._load_paragraph_regex()
+
         # TODO make paragraph processing params configurable
         self.structured_prob_lists = {
             ParagraphType.prob: Relevance.PRESENT,
@@ -162,6 +183,10 @@ def _add_negex_pipeline(self) -> None:
         self.cat.pipe.spacy_nlp.enable_pipe("sentencizer")
         self.cat.pipe.spacy_nlp.add_pipe("negex")
 
+    def _load_paragraph_regex(self) -> None:
+        """Load the paragraph regex lookup and store it on ``self.paragraph_regex``.
+
+        Reads ``regex_para_chunk.csv`` from ``self.config.lookup_data_path`` with
+        ``load_lookup_data(..., as_dict=True)`` and converts the result with
+        ``load_regex_paragraph_mappings``.
+        """
+        # Join instead of concatenating so the file is found whether or not the
+        # configured directory ends with a path separator.
+        regex_path = os.path.join(self.config.lookup_data_path, "regex_para_chunk.csv")
+        data = load_lookup_data(regex_path, as_dict=True)
+        self.paragraph_regex = load_regex_paragraph_mappings(data)
+
     @property
     @abstractmethod
     def concept_types(self):
@@ -209,10 +234,9 @@ def get_concepts(self, note: Note) -> List[Concept]:
 
         return concepts
 
-    @staticmethod
-    def preprocess(note: Note) -> Note:
+    def preprocess(self, note: Note) -> Note:
         note.clean_text()
-        note.get_paragraphs()
+        note.get_paragraphs(self.paragraph_regex)
 
         return note
 

diff --git a/src/miade/note.py b/src/miade/note.py
@@ -40,10 +40,9 @@ def load_regex_config_mappings(filename: str) -> Dict:
 class Note(object):
     """docstring for Note."""
 
-    def __init__(self, text: str, regex_config_path: str = "./data/regex_para_chunk.csv"):
+    def __init__(self, text: str):
         self.text = text
         self.raw_text = text
-        self.regex_config = load_regex_config_mappings(regex_config_path)
         self.paragraphs: Optional[List[Paragraph]] = []
 
     def clean_text(self) -> None:
@@ -59,7 +58,7 @@ def clean_text(self) -> None:
         # Remove spaces if the entire line (between two line breaks) is just spaces
         self.text = re.sub(r"(?<=\n)\s+(?=\n)", "", self.text)
 
-    def get_paragraphs(self) -> None:
+    def get_paragraphs(self, paragraph_regex: Dict) -> None:
         paragraphs = re.split(r"\n\n+", self.text)
         start = 0
 
@@ -86,7 +85,7 @@ def get_paragraphs(self) -> None:
             if heading:
                 heading = heading.lower()
                 # Iterate through the dictionary items and patterns
-                for paragraph_type, pattern in self.regex_config.items():
+                for paragraph_type, pattern in paragraph_regex.items():
                     if re.search(pattern, heading):
                         paragraph.type = paragraph_type
                         break  # Exit the loop if a match is found

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1,6 +1,5 @@
 from miade.core import NoteProcessor
 from miade.concept import Concept, Category
-from miade.annotators import Annotator
 from miade.metaannotations import MetaAnnotations
 from miade.utils.metaannotationstypes import (
     Presence,

diff --git a/tests/test_note.py b/tests/test_note.py
@@ -1,5 +1,4 @@
-from miade.annotators import Annotator, MedsAllergiesAnnotator, ProblemsAnnotator
-from miade.core import NoteProcessor
+from miade.annotators import MedsAllergiesAnnotator, ProblemsAnnotator
 from miade.concept import Concept, Category
 from miade.paragraph import Paragraph, ParagraphType
 from miade.metaannotations import MetaAnnotations
@@ -11,9 +10,9 @@
 )
 
 
-def test_note_cleaning_and_paragraphing(test_clean_and_paragraphing_note):
-    test_clean_and_paragraphing_note.clean_text()
-    test_clean_and_paragraphing_note.get_paragraphs()
+def test_note_cleaning_and_paragraphing(test_problems_medcat_model, test_clean_and_paragraphing_note):
+    annotator = ProblemsAnnotator(test_problems_medcat_model)
+    annotator.preprocess(test_clean_and_paragraphing_note)
 
     assert test_clean_and_paragraphing_note.paragraphs == [
         Paragraph(heading="", body="", type=ParagraphType.prose, start=0, end=182),
@@ -66,7 +65,7 @@ def test_prob_paragraph_note(
     ]
 
 
-def test_prob_paragraph_note(
+def test_med_paragraph_note(
     test_meds_algy_medcat_model, test_clean_and_paragraphing_note, test_paragraph_chunking_med_concepts
 ):
     annotator = MedsAllergiesAnnotator(test_meds_algy_medcat_model)