Commit 6e46f22

v0.10.2 (#103)
* 🐛 Streamlit dashboard preprocessing bug fix

* 🗃️ Update problems blacklist (removed some social concepts)
jenniferjiangkells authored Nov 15, 2023
1 parent b90ccf1 commit 6e46f22
Showing 4 changed files with 311 additions and 6 deletions.
5 changes: 0 additions & 5 deletions src/miade/data/problem_blacklist.csv
@@ -15697,7 +15697,6 @@
105523009
105524003
105525002
105529008
105530003
105531004
105532006
@@ -19507,7 +19506,6 @@
160720000
160721001
160724009
160725005
160729004
160731008
160739005
@@ -21186,7 +21184,6 @@
170720001
170721002
170722009
170727003
170730005
170731009
170732002
@@ -37547,7 +37544,6 @@
300976007
300977003
300978008
300994001
301029007
301037004
301038009
@@ -66398,7 +66394,6 @@
298191000000102
298201000000100
298641000000100
298651000000102
298711000000107
298941000000105
298971000000104
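
The blacklist itself is a flat, one-column CSV of SNOMED CT concept identifiers. As a rough sketch only (this is not MiADE's actual filtering code; the path and the shape of the detected-concept records are assumptions for illustration), such a file can be loaded into a set and used to drop unwanted problem concepts:

import csv

def load_blacklist(path):
    # Read a one-column CSV of SNOMED CT codes into a set for fast membership tests.
    with open(path, newline="") as f:
        return {row[0].strip() for row in csv.reader(f) if row}

blacklist = load_blacklist("src/miade/data/problem_blacklist.csv")

# Hypothetical NER output: keep only concepts whose code is not blacklisted.
detected = [{"cui": "38341003", "name": "hypertension"}]
problems = [c for c in detected if c["cui"] not in blacklist]
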
1 change: 1 addition & 0 deletions streamlit_app/.env
@@ -1,5 +1,6 @@
VIZ_DATA_PATH=./samples/problems_synthetic_train_example.csv

TRAIN_CSV_DIR=./samples/
SYNTH_CSV_DIR=./samples/
TRAIN_JSON_DIR=./samples/
TEST_JSON_DIR=./samples/
22 changes: 21 additions & 1 deletion streamlit_app/app.py
@@ -1,4 +1,5 @@
import os
import json
from time import sleep

import numpy as np
@@ -20,6 +21,7 @@

from medcat.cat import CAT
from miade.utils.miade_meta_cat import MiADE_MetaCAT
from utils import *

load_dotenv(find_dotenv())

@@ -227,7 +229,7 @@ def aggrid_interactive_table(df: pd.DataFrame):


# load data
train_data_df = load_csv_data(os.getenv("VIZ_DATA_PATH"))
# train_data_df = load_csv_data(os.getenv("VIZ_DATA_PATH"))

tab1, tab2, tab3, tab4 = st.tabs(["Train", "Test", "Data", "Try"])

@@ -238,8 +240,26 @@ def aggrid_interactive_table(df: pd.DataFrame):
st.markdown("**Adjust** the sliders to vary the amount of synthetic data "
            " you want to include in the training data in addition to your annotations:")
train_json_path = st.selectbox("Select annotated training data", TRAIN_JSON_OPTIONS)

train_csv = train_json_path.replace(".json", ".csv")
train_csv_path = os.path.join(os.getenv("TRAIN_CSV_DIR"), train_csv)

train_json_path = os.path.join(os.getenv("TRAIN_JSON_DIR"), train_json_path)

if not os.path.exists(train_csv_path):
    with open(train_json_path) as file:
        train_data = json.load(file)
    train_text = load_documents(train_data)
    train_annotations = load_annotations(train_data)
    valid_train_ann = get_valid_annotations(train_annotations)
    if "problems" in train_json_path:
        train_data_df = get_probs_meta_classes_data(train_text, valid_train_ann)
    else:
        train_data_df = get_meds_meta_classes_data(train_text, valid_train_ann)
    train_data_df.to_csv(train_csv_path, index=False)
else:
    train_data_df = load_csv_data(train_csv_path)

synth_csv_path = st.selectbox("Select synthetic data file:", SYNTH_DATA_OPTIONS)
synth_csv_path = os.path.join(os.getenv("SYNTH_CSV_DIR"), synth_csv_path)

289 changes: 289 additions & 0 deletions streamlit_app/utils.py
@@ -0,0 +1,289 @@
import pandas as pd

from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE

from typing import Optional


def load_documents(data):
    documents = {}
    for i in range(0,len(data['projects'][0]['documents'])):
        documents[data['projects'][0]['documents'][i]['id']] = data['projects'][0]['documents'][i]['text']
    return documents


def load_annotations(data):
    annotations = []
    for i in range(0,len(data['projects'][0]['documents'])):
        document_id = data['projects'][0]['documents'][i]['id']
        annotations.extend([Annotation.from_dict(ann, document_id) for ann in data['projects'][0]['documents'][i]['annotations']])
    return annotations


def get_valid_annotations(data):
    annotations = []
    for ann in data:
        if not ann.deleted and not ann.killed and not ann.irrelevant:
            annotations.append(ann)
    return annotations


def get_probs_meta_classes_data(documents, annotations, ):
    r_labels = []
    p_labels = []
    l_labels = []
    cuis = []
    names = []
    texts = []
    tokens = []
    for ann in annotations:
        r_labels.append(ann.meta_relevance)
        p_labels.append(ann.meta_presence)
        l_labels.append(ann.meta_laterality)

        cuis.append(ann.cui)
        names.append(ann.value.lower())

        document = documents[ann.document_id].lower()
        _start = max(0, ann.start - 70)
        _end = min(len(document), ann.end + 1 + 70)
        texts.append(document[_start:_end])

        # doc_text = tokenizer(document)
        # ind = 0
        # for ind, pair in enumerate(doc_text['offset_mapping']):
        #     if ann.start >= pair[0] and ann.start < pair[1]:
        #         break
        # t_start = max(0, ind - 15)
        # t_end = min(len(doc_text['input_ids']), ind + 1 + 10)
        # tkns = doc_text['tokens'][t_start:t_end]
        # tokens.append(tkns)

    df = pd.DataFrame({"text": texts,
                       "cui": cuis,
                       "name": names,
                       # "tokens": tokens,
                       "relevance": r_labels,
                       "presence": p_labels,
                       "laterality (generic)": l_labels, })
    return df


def get_meds_meta_classes_data(documents, annotations, ):
    substance_labels = []
    allergy_labels = []
    severity_labels = []
    reaction_labels = []
    cuis = []
    names = []
    texts = []
    tokens = []
    for ann in annotations:
        substance_labels.append(ann.meta_substance_cat)
        allergy_labels.append(ann.meta_allergy_type)
        severity_labels.append(ann.meta_severity)
        reaction_labels.append(ann.meta_reaction_pos)

        cuis.append(ann.cui)
        names.append(ann.value.lower())

        document = documents[ann.document_id].lower()
        _start = max(0, ann.start - 70)
        _end = min(len(document), ann.end + 1 + 70)
        texts.append(document[_start:_end])

        # doc_text = tokenizer(document)
        # ind = 0
        # for ind, pair in enumerate(doc_text['offset_mapping']):
        #     if ann.start >= pair[0] and ann.start < pair[1]:
        #         break
        # t_start = max(0, ind - 15)
        # t_end = min(len(doc_text['input_ids']), ind + 1 + 10)
        # tkns = doc_text['tokens'][t_start:t_end]
        # tokens.append(tkns)

    df = pd.DataFrame({"text": texts,
                       "cui": cuis,
                       "name": names,
                       # "tokens": tokens,
                       "substance_category": substance_labels,
                       "allergy_type": allergy_labels,
                       "severity": severity_labels,
                       "reaction_pos": reaction_labels})
    return df




class Annotation:
    def __init__(
        self,
        alternative,
        id,
        document_id,
        cui,
        value,
        deleted,
        start,
        end,
        irrelevant,
        killed,
        manually_created,
        meta_laterality,
        meta_presence,
        meta_relevance,
        meta_allergy_type,
        meta_substance_cat,
        meta_severity,
        meta_reaction_pos,
        dictionary
    ):
        self.alternative = alternative
        self.id = id
        self.value = value
        self.document_id = document_id
        self.cui = cui
        self.deleted = deleted
        self.start = start
        self.end = end
        self.irrelevant = irrelevant
        self.killed = killed
        self.manually_created = manually_created
        self.meta_laterality = meta_laterality
        self.meta_presence = meta_presence
        self.meta_relevance = meta_relevance
        self.meta_allergy_type = meta_allergy_type
        self.meta_substance_cat = meta_substance_cat
        self.meta_severity = meta_severity
        self.meta_reaction_pos = meta_reaction_pos
        self.dict: Optional[dict] = dictionary

    @classmethod
    def from_dict(cls, d, document_id):
        meta_laterality = None
        meta_presence = None
        meta_relevance = None

        meta_allergy_type = None
        meta_substance_cat = None
        meta_severity = None
        meta_reaction_pos = None

        meta_anns = d.get("meta_anns")
        if meta_anns is not None:
            meta_ann_l = meta_anns.get('laterality (generic)')
            if meta_ann_l is not None:
                meta_laterality = meta_ann_l['value']
            meta_ann_r = meta_anns.get('relevance')
            if meta_ann_r is not None:
                meta_relevance = meta_ann_r['value']
            meta_ann_p = meta_anns.get('presence')
            if meta_ann_p is not None:
                meta_presence = meta_ann_p['value']

            meta_ann_allergy = meta_anns.get('allergy_type')
            if meta_ann_allergy is not None:
                meta_allergy_type = meta_ann_allergy['value']
            meta_ann_substance = meta_anns.get('substance_category')
            if meta_ann_substance is not None:
                meta_substance_cat = meta_ann_substance['value']
            meta_ann_severity = meta_anns.get('severity')
            if meta_ann_severity is not None:
                meta_severity = meta_ann_severity['value']
            meta_ann_reaction = meta_anns.get('reaction_pos')
            if meta_ann_reaction is not None:
                meta_reaction_pos = meta_ann_reaction['value']
        return cls(
            alternative=d['alternative'],
            id=d['id'],
            document_id=document_id,
            cui=d['cui'],
            value=d['value'],
            deleted=d['deleted'],
            start=d['start'],
            end=d['end'],
            irrelevant=d['irrelevant'],
            killed=d['killed'],
            manually_created=d['manually_created'],
            meta_laterality=meta_laterality,
            meta_presence=meta_presence,
            meta_relevance=meta_relevance,
            meta_allergy_type=meta_allergy_type,
            meta_substance_cat=meta_substance_cat,
            meta_severity=meta_severity,
            meta_reaction_pos=meta_reaction_pos,
            dictionary=d,
        )

    def __str__(self):
        return f"""
        ---
        id: {self.id}
        document_id: {self.document_id}
        cui: {self.cui}
        value: {self.value}
        start: {self.start}
        end: {self.end}
        deleted: {self.deleted}
        irrelevant: {self.irrelevant}
        killed: {self.killed}
        manually created: {self.manually_created}
        laterality: {self.meta_laterality}
        presence: {self.meta_presence}
        relevance: {self.meta_relevance}
        substance category: {self.meta_substance_cat}
        allergy type: {self.meta_allergy_type}
        severity: {self.meta_severity}
        reaction pos: {self.meta_reaction_pos}
        ---
        """

    def __eq__(self, other):
        return (
            self.alternative == other.alternative
            and self.cui == other.cui
            and self.document_id == other.document_id
            and self.deleted == other.deleted
            and self.start == other.start
            and self.end == other.end
            and self.irrelevant == other.irrelevant
            and self.killed == other.killed
            and self.manually_created == other.manually_created
            and self.meta_laterality == other.meta_laterality
            and self.meta_presence == other.meta_presence
            and self.meta_relevance == other.meta_relevance
            and self.meta_substance_cat == other.meta_substance_cat
            and self.meta_allergy_type == other.meta_allergy_type
            and self.meta_severity == other.meta_severity
            and self.meta_reaction_pos == other.meta_reaction_pos
        )

    def is_same_model_annotation(self, other):
        return (
            self.cui == other.cui
            and self.start == other.start
            and self.end == other.end
        )
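
A minimal, self-contained sketch of how these helpers fit together, using hypothetical document text, offsets, and meta-annotation values; the nested dictionary mirrors the projects → documents → annotations layout that load_documents and load_annotations expect:

export = {
    "projects": [{
        "documents": [{
            "id": 1,
            "text": "Patient has a history of hypertension, well controlled.",
            "annotations": [{
                "alternative": False,
                "id": 10,
                "cui": "38341003",  # SNOMED CT code for hypertension
                "value": "hypertension",
                "deleted": False,
                "start": 25,
                "end": 37,
                "irrelevant": False,
                "killed": False,
                "manually_created": False,
                "meta_anns": {  # hypothetical meta-annotation values
                    "relevance": {"value": "historic"},
                    "presence": {"value": "confirmed"},
                    "laterality (generic)": {"value": "none"},
                },
            }],
        }],
    }],
}

documents = load_documents(export)          # {1: "Patient has a history of ..."}
annotations = load_annotations(export)      # [Annotation, ...]
valid = get_valid_annotations(annotations)  # drops deleted/killed/irrelevant ones
df = get_probs_meta_classes_data(documents, valid)
print(df[["cui", "name", "relevance", "presence", "laterality (generic)"]])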
