Skip to content

Commit

Permalink
🔧 Make paragraph regex configurable from lookup path (#117)
Browse files Browse the repository at this point in the history
  • Loading branch information
jenniferjiangkells authored Apr 22, 2024
1 parent 57728d7 commit d89ff7f
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 21 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ jobs:
pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl
- name: run pytest
run: pytest ./tests/*
- name: install ruff
run: pip install ruff
- name: ruff format
run: ruff format --check .
- name: Lint with Ruff
run: |
pip install ruff
ruff --output-format=github .
continue-on-error: true
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
paragraph,regex
prob,^(prob|probs|problem|problems|diag|diagnosis|diagnoses|issue|issues)( list|)
pmh,^(hx|pmhx|pmh|medical background|past medical history|past med hist)
prob,^(patient |)(current |final |hospital |active inpatient |complete |active |acute |inpatient |clinical |ongoing |in-patient |)(prob|probs|problem|problems|diag|diagnosis|diagnoses|issue|issues|this admission)( list|)
pmh,^(hx|pmhx|pmh|background|medical background|past medical history|past psychiatric history|past surgical history|past issues this admission|past med hist|bg)
med,^(home |current |active |outpatient |gp |current outpatient |)(med|meds|medications|drug|drugs|rx)
allergy,^(drug |med |medication |)(allerg|allergies|allergies|allergies and intolerances|intolerances|adverse effects|adverse reactions|adverse reaction risk)
history,^(pc|hpc|presenting complaint|history of presenting complaint|history|hist|synopsis|summary|clinical summary)
exam,^(exam|examination|o/e|o / e|oe)
ddx,^(diff|differential|differential diagnosis|ddx)
imp,^(imp|impression|diagnosis|formulation|diag|dx)
imp,^(imp|impression|diagnosis|formulation|diag|dx|psychiatric formulation|clinical summary impression|clinical summary / impression)
plan,^(plan|recommendations|recommendation|action|actions|goal|goals|advice|decision)
30 changes: 27 additions & 3 deletions src/miade/annotators.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ class AllergenType(Enum):


def load_lookup_data(filename: str, as_dict: bool = False, no_header: bool = False):
if not os.path.exists(filename):
log.error(f"Lookup data not configured, check {filename} exists!")

if as_dict:
return (
pd.read_csv(
Expand All @@ -59,6 +62,22 @@ def load_lookup_data(filename: str, as_dict: bool = False, no_header: bool = Fal
return pd.read_csv(filename).drop_duplicates()


def load_regex_paragraph_mappings(data: pd.DataFrame) -> Dict:
    """Build a ``ParagraphType`` -> regex lookup from ``data``.

    Every ``(key, value)`` pair yielded by ``data.items()`` is considered.
    The key is converted with ``ParagraphType(key)``; when that raises
    ``ValueError`` the error is logged as a warning and the pair is skipped.

    NOTE(review): the annotation says ``pd.DataFrame``, but only ``.items()``
    is used here, so any mapping-like object works -- confirm what callers
    actually pass.

    Args:
        data: Object whose ``items()`` yields (paragraph name, regex) pairs.

    Returns:
        Dict mapping each recognised ``ParagraphType`` to its regex value.
    """
    mappings = {}

    for name, pattern in data.items():
        try:
            paragraph_type = ParagraphType(name)
        except ValueError as err:
            # Unknown paragraph name: report it and move on to the next entry.
            log.warning(err)
            continue

        mappings[paragraph_type] = pattern

    return mappings


def load_allergy_type_combinations(filename: str) -> Dict:
df = pd.read_csv(filename)

Expand Down Expand Up @@ -145,6 +164,8 @@ def __init__(self, cat: CAT, config: AnnotatorConfig = None):
if self.config.negation_detection == "negex":
self._add_negex_pipeline()

self._load_paragraph_regex()

# TODO make paragraph processing params configurable
self.structured_prob_lists = {
ParagraphType.prob: Relevance.PRESENT,
Expand All @@ -162,6 +183,10 @@ def _add_negex_pipeline(self) -> None:
self.cat.pipe.spacy_nlp.enable_pipe("sentencizer")
self.cat.pipe.spacy_nlp.add_pipe("negex")

def _load_paragraph_regex(self) -> None:
    """Load the paragraph-heading regexes and store them on the annotator.

    Reads ``regex_para_chunk.csv`` from ``self.config.lookup_data_path`` via
    ``load_lookup_data(..., as_dict=True)`` and converts the result with
    ``load_regex_paragraph_mappings``; the mapping is stored in
    ``self.paragraph_regex``.
    """
    # Join instead of concatenating so the path is correct whether or not
    # lookup_data_path ends with a path separator.
    regex_path = os.path.join(self.config.lookup_data_path, "regex_para_chunk.csv")
    data = load_lookup_data(regex_path, as_dict=True)
    self.paragraph_regex = load_regex_paragraph_mappings(data)

@property
@abstractmethod
def concept_types(self):
Expand Down Expand Up @@ -209,10 +234,9 @@ def get_concepts(self, note: Note) -> List[Concept]:

return concepts

@staticmethod
def preprocess(note: Note) -> Note:
def preprocess(self, note: Note) -> Note:
note.clean_text()
note.get_paragraphs()
note.get_paragraphs(self.paragraph_regex)

return note

Expand Down
7 changes: 3 additions & 4 deletions src/miade/note.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,9 @@ def load_regex_config_mappings(filename: str) -> Dict:
class Note(object):
"""docstring for Note."""

def __init__(self, text: str, regex_config_path: str = "./data/regex_para_chunk.csv"):
def __init__(self, text: str):
self.text = text
self.raw_text = text
self.regex_config = load_regex_config_mappings(regex_config_path)
self.paragraphs: Optional[List[Paragraph]] = []

def clean_text(self) -> None:
Expand All @@ -59,7 +58,7 @@ def clean_text(self) -> None:
# Remove spaces if the entire line (between two line breaks) is just spaces
self.text = re.sub(r"(?<=\n)\s+(?=\n)", "", self.text)

def get_paragraphs(self) -> None:
def get_paragraphs(self, paragraph_regex: Dict) -> None:
paragraphs = re.split(r"\n\n+", self.text)
start = 0

Expand All @@ -86,7 +85,7 @@ def get_paragraphs(self) -> None:
if heading:
heading = heading.lower()
# Iterate through the dictionary items and patterns
for paragraph_type, pattern in self.regex_config.items():
for paragraph_type, pattern in paragraph_regex.items():
if re.search(pattern, heading):
paragraph.type = paragraph_type
break # Exit the loop if a match is found
Expand Down
1 change: 0 additions & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from miade.core import NoteProcessor
from miade.concept import Concept, Category
from miade.annotators import Annotator
from miade.metaannotations import MetaAnnotations
from miade.utils.metaannotationstypes import (
Presence,
Expand Down
11 changes: 5 additions & 6 deletions tests/test_note.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from miade.annotators import Annotator, MedsAllergiesAnnotator, ProblemsAnnotator
from miade.core import NoteProcessor
from miade.annotators import MedsAllergiesAnnotator, ProblemsAnnotator
from miade.concept import Concept, Category
from miade.paragraph import Paragraph, ParagraphType
from miade.metaannotations import MetaAnnotations
Expand All @@ -11,9 +10,9 @@
)


def test_note_cleaning_and_paragraphing(test_clean_and_paragraphing_note):
test_clean_and_paragraphing_note.clean_text()
test_clean_and_paragraphing_note.get_paragraphs()
def test_note_cleaning_and_paragraphing(test_problems_medcat_model, test_clean_and_paragraphing_note):
annotator = ProblemsAnnotator(test_problems_medcat_model)
annotator.preprocess(test_clean_and_paragraphing_note)

assert test_clean_and_paragraphing_note.paragraphs == [
Paragraph(heading="", body="", type=ParagraphType.prose, start=0, end=182),
Expand Down Expand Up @@ -66,7 +65,7 @@ def test_prob_paragraph_note(
]


def test_prob_paragraph_note(
def test_med_paragraph_note(
test_meds_algy_medcat_model, test_clean_and_paragraphing_note, test_paragraph_chunking_med_concepts
):
annotator = MedsAllergiesAnnotator(test_meds_algy_medcat_model)
Expand Down

0 comments on commit d89ff7f

Please sign in to comment.