diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..4fba4c5 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,39 @@ +# Pull Request Template for MiADE + +## Description + +Please include a summary of the change and which issue is fixed. Also, include relevant motivation and context. List any dependencies that are required for this change. + +Fixes # (issue) + +## Type of change + +Please delete options that are not relevant. + +- [ ] Bug fix (non-breaking change which fixes an issue) +- [ ] New feature (non-breaking change which adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) +- [ ] This change requires a documentation update + +## How Has This Been Tested? + +Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. + +- [ ] Test A +- [ ] Test B + +## Checklist: + +Before submitting your pull request, please review the following checklist: + +- [ ] I have performed a self-review of my own code. +- [ ] I have commented my code, particularly in hard-to-understand areas. +- [ ] I have made corresponding changes to the documentation. +- [ ] My changes generate no new warnings. +- [ ] I have added tests that prove my fix is effective or that my feature works. +- [ ] New and existing unit tests pass locally with my changes. +- [ ] Any dependent changes have been merged and published in downstream modules. + +## Additional Information: + +Any additional information that you would like to provide about the pull request. 
\ No newline at end of file diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml new file mode 100644 index 0000000..c485139 --- /dev/null +++ b/.github/workflows/build_documentation.yml @@ -0,0 +1,21 @@ +name: Build documentation + +on: + pull_request: + branches: [master] + +jobs: + build: + name: Build + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.11 + + - name: Build the documentation + run: | + pip install -r docs-requirements.txt + pip install ./ + mkdocs build \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d807205..dc5121b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: python-version: 3.11 - name: install dependencies run: | - python -m pip install --upgrade pip==24.0 + python -m pip install pip==24.0 pip install pytest pip install torch --index-url https://download.pytorch.org/whl/cpu pip install ./ @@ -31,8 +31,10 @@ jobs: pip install -r requirements.txt - name: run pytest run: pytest ./tests/* - - name: Lint with Ruff + - name: Install ruff + run: pip install ruff + - name: Lint with ruff run: | - pip install ruff ruff --output-format=github . 
- continue-on-error: true + ruff check --fix + continue-on-error: true \ No newline at end of file diff --git a/.github/workflows/deploy_documentation.yml b/.github/workflows/deploy_documentation.yml new file mode 100644 index 0000000..de7eba4 --- /dev/null +++ b/.github/workflows/deploy_documentation.yml @@ -0,0 +1,30 @@ +name: Publish documentation + +on: + push: + branches: + - master + +permissions: + contents: write + +jobs: + deploy: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.11 + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v3 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: | + pip install -r docs-requirements.txt + pip install ./ + - name: Build documentation + run: mkdocs gh-deploy --force diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..581325c --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,53 @@ +name: Publish Python 🐍 distribution 📦 to PyPI + +on: + release: + types: + - created + +jobs: + build: + name: Build distribution 📦 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v3 + with: + name: python-package-distributions + path: dist/ + + publish-to-pypi: + name: >- + Publish Python 🐍 distribution 📦 to PyPI + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: release + url: https://pypi.org/p/miade # Replace with your PyPI project name + permissions: + id-token: write # 
IMPORTANT: mandatory for trusted publishing + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2c5f129..23888e7 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ venv.bak/ *.zip src/miade/model_builders/output/ !src/miade/data/* +.DS_Store #testing tests/data/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..84dade4 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +miade@uclh.net. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f0156c6..8f8c277 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,94 @@ -## Versioning +# Contributing to MiADE + +Thank you for considering contributing to MiADE! + +## Code of Conduct + +This project and everyone participating in it is governed by the MiADE Code of Conduct. By participating, you are expected to uphold this code. Please report unacceptable behavior to miade@uclh.net. + +## I don't want to read this whole thing I just have a question!!! + +INSERT LINK TO SOME SORT OF FORUM + +## What should I know before I get started? + +### Dependencies + +You can find a list of dependencies in our [pyproject.toml](https://github.com/uclh-criu/miade/blob/master/pyproject.toml) file. MiADE is compatible with Python 3.8 and above. + +To install the project with the dev dependencies, run: + +```bash +pip install -e .[dev] +``` +The `-e` flag sets the install to auto-update, useful when developing. 
+ +### Testing + +MiADE uses [pytest](https://docs.pytest.org/en/8.2.x/), which can be run with: + +```bash +pytest ./tests/* +``` +> Remember, if using a virtual environment, to install pytest within your environment itself, otherwise you will be using the system python install of pytest, which will use system python and will not find your modules. + +### Formatting and Linting + +We use [ruff](https://docs.astral.sh/ruff/) for linting and formatting. Run: + +```bash +ruff format +ruff check --fix +``` + +## How Can I Contribute? + +### Reporting Bugs + +#### Before Submitting A Bug Report + +* Check the documentation for tips on how to fix the issue on your own. +* Determine which repository the problem should be reported in - MiADE wraps around [MedCAT](https://github.com/CogStack/MedCAT/tree/master?tab=readme-ov-file), so if you encounter an issue related to MedCAT models, it is better to report it to these folks! +* Check if the issue has already been reported. If it has **and the issue is still open**, add a comment to the existing issue instead of opening a new one. + +#### How Do I Submit A (Good) Bug Report? + +Bugs are tracked as [GitHub issues](https://github.com/uclh-criu/miade/issues). Explain the problem and include additional details to help maintainers reproduce the problem: + +* **Use a clear and descriptive title** for the issue to identify the problem. +* **Describe the exact steps which reproduce the problem** in as many details as possible. +* **Provide specific examples to demonstrate the steps**. Include links to files or GitHub projects, or copy/pasteable snippets, which you use in those examples. + + +### Your First Code Contribution + +Unsure where to begin contributing to MiADE? 
You can start by looking through these `beginner` and `help-wanted` issues: + +* [Good first issues](https://github.com/uclh-criu/miade/issues?q=is:open+is:issue+label:%22good+first+issue%22) - issues which should only require a few lines of code, and a test or two. +* [Help wanted issues](https://github.com/uclh-criu/miade/issues?q=is:open+is:issue+label:%22help+wanted%22) - issues which should be a bit more involved than `beginner` issues. + +### Pull Requests + +The process described here has several goals: + +- Maintain MiADE's quality +- Fix problems that are important to users +- Engage the community in working toward the best possible version of MiADE +- Enable a sustainable system for MiADE's maintainers to review contributions + +Please follow these steps to have your contribution considered by the maintainers: + +1. Follow all instructions in [the template](https://github.com/uclh-criu/miade/blob/documentation/.github/PULL_REQUEST_TEMPLATE.md) +2. Follow the [styleguides](#styleguides) +3. After you submit your pull request, verify that all tests are passing + +## Styleguides + +We use [Google Python style docstring](https://google.github.io/styleguide/pyguide.html). + +### Versioning Versioning is performed through git tags, which should follow the [semantic versioning](https://semver.org/) approach prefixed with a "v". -E.g.: `v0.1.2-alpha` \ No newline at end of file +E.g.: `v0.1.2-alpha` + + +Thank you for reading through the contributing guide and for your interest in making MiADE better. We look forward to your contributions! 
\ No newline at end of file diff --git a/README.md b/README.md index 1d21654..4212c65 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,86 @@ -# +# [![Build Status](https://github.com/uclh-criu/miade/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/uclh-criu/miade/actions/workflows/ci.yml?query=Tests) +![License: Elastic License 2.0](https://img.shields.io/badge/License-Elastic%202.0-blue.svg) -A set of tools for extracting formattable data from clinical notes stored in electronic health record systems. -For the reference server implementation, see: [miade-server](https://github.com/uclh-criu/miade-server). +A set of tools for extracting formattable data from clinical notes stored in electronic health record systems. Powered by [MedCAT](https://github.com/CogStack/MedCAT) models. -Built with Cogstack's [MedCAT](https://github.com/CogStack/MedCAT) package. -## Contents - -1. [Contributors](#Contributors) -2. [Installing](#Installing) -3. [Testing](#Testing) -4. [Contributing](#Contributing) -5. [Licence](#Licence) +## Installing +### Install MiADE -## Contributors +To install the stable release: +```bash +pip install miade +``` -| Name | Email | -|-----------------|-----------------------------| -| James Brandreth | j.brandreth@ucl.ac.uk | -| Jennifer Jiang | jennifer.jiang.13@ucl.ac.uk | +To install the latest development version of MiADE, clone this repository and run: +```bash +pip install . 
+``` -## Installing +### Downloading models +You may also need to download these additional models to run MiADE: -As the drug dosage extraction module uses Med7, you will need to download the model: +[spaCy](https://spacy.io/models/en) - required for MedCAT ```bash -pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl +python -m spacy download en_core_web_md ``` -Then install MiADE: +[med7](https://huggingface.co/kormilitzin/en_core_med7_lg) - required for medication dosage extraction ```bash -pip install -e . +pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl ``` -The `-e` flag sets the install to auto-update, useful when developing. Remove for production. -## Testing +## Quickstart -This project uses pytest tests, which can be run with: -```bash -pytest ./tests/* +Initialise MiADE with the path that you have saved your trained MedCAT models: + +```python +miade = NoteProcessor(Path("path/to/model/dir")) +``` +Add annotators: + +```python +miade.add_annotator("problems") +miade.add_annotator("meds/allergies") +``` + +Create a note: + +```python +text = "Patient has penicillin allergy with rash" +note = Note(text) +``` + +Extract concepts: + +```python +concepts = miade.process(note) + +for concept in concepts: + print(concept) + +# {name: breaking out - eruption, id: 271807003, category: Category.REACTION, start: 204, end: 208, dosage: None, negex: False, meta: None} +# {name: penicillin, id: 764146007, category: Category.ALLERGY, start: 191, end: 201, dosage: None, negex: False, meta: None} ``` -> Remember, if using a virtual environment, to install pytest within your environment itself, otherwise you will be using the system python install of pytest, which will use system python and will not find your modules. 
## Contributing See [contributing](CONTRIBUTING.md) +### Maintainers + +| Name | Email | +|-----------------|-----------------------------| +| James Brandreth | j.brandreth@ucl.ac.uk | +| Jennifer Jiang | jennifer.jiang.13@ucl.ac.uk | + + +## Acknowledgement + +This project wouldn't be possible without the work at [Cogstack](https://cogstack.org/), [spaCy](https://spacy.io/), and [med7](https://huggingface.co/kormilitzin/en_core_med7_lg)! + + ## Licence + +This project is licensed under the Elastic License 2.0. See [LICENSE](https://github.com/uclh-criu/miade/blob/documentation/LICENCE.md) for the full license text. \ No newline at end of file diff --git a/configs/miade_config.yaml b/configs/miade_config.yaml index a797120..c1999c3 100644 --- a/configs/miade_config.yaml +++ b/configs/miade_config.yaml @@ -9,10 +9,4 @@ general: lookup_data_path: ./lookup_data/ negation_detection: None structured_list_limit: 0 # if more than this number of concepts in structure section, ignore concepts in prose - disable: [] - add_numbering: True - meds/allergies: - lookup_data_path: ./lookup_data/ - negation_detection: None - disable: [] - add_numbering: False \ No newline at end of file + add_numbering: True \ No newline at end of file diff --git a/docs-requirements.txt b/docs-requirements.txt new file mode 100644 index 0000000..2e68d2a --- /dev/null +++ b/docs-requirements.txt @@ -0,0 +1,4 @@ +mkdocs +mkdocs-material +mkdocstrings +mkdocstrings-python \ No newline at end of file diff --git a/docs/about/overview.md b/docs/about/overview.md new file mode 100644 index 0000000..d93f861 --- /dev/null +++ b/docs/about/overview.md @@ -0,0 +1,12 @@ +# Project Overview + +## Background +Data about people’s health stored in electronic health records (EHRs) can play an important role in improving the quality of patient care. Much of the information in EHRs is recorded in ordinary language without any restriction on format ('free text'), as this is the natural way in which people communicate. 
However, if this information were stored in a standardised, structured format, computers would also be able to process the information to help clinicians find and interpret information for better and safer decision making. This would enable EHR systems such as Epic, the system in place at UCLH since April 2019, to support clinical decision making. For instance, the system may be able to ensure that a patient is not prescribed medicine that would give them an allergic reaction. + +## The challenge +Free text may contain words and abbreviations which may be interpreted in more than one way, such as 'HR', which can mean 'Hour' or 'Heart Rate'. Free text may also contain negations; for example, a diagnosis may be mentioned in the text but the rest of the sentence might say that it was ruled out. Although computers can be used to interpret free text, they cannot always get it right, so clinicians will always have to check the results to ensure patient safety. Expressing information in a structured way can avoid this problem, but has a big disadvantage - it can be time-consuming for clinicians to enter the information. This can mean that information is incomplete, or clinicians are so busy on the computer that they do not have time to listen to their patients. + +## Meeting the need +The aim of MiADE is to develop a system to support automatic conversion of the clinician’s free text into a structured format. The clinician can check the structured data immediately, before making it a formal part of the patient’s record. The system will record a patient’s diagnoses, medications and allergies in a structured way, using NHS-endorsed clinical data standards (e.g. FHIR and SNOMED CT). It will use a technique called Natural Language Processing (NLP). NLP has been used by research teams to extract information from existing EHRs but has rarely been used to improve the way information is entered in the first place. 
Our NLP system will continuously learn and improve as more text is analysed and checked by clinicians. + +We will first test the system in University College London Hospitals, where a new EHR system called Epic is in place. We will study how effective it is, and how clinicians and patients find it when it is used in consultations. Based on feedback, we will make improvements and install it for testing at a second site (Great Ormond Street Hospital). Our aim is for the system to be eventually rolled out to more hospitals and doctors’ surgeries across the NHS. \ No newline at end of file diff --git a/docs/about/team.md b/docs/about/team.md new file mode 100644 index 0000000..b9ade4f --- /dev/null +++ b/docs/about/team.md @@ -0,0 +1,3 @@ +# Team + +The [MiADE](https://www.ucl.ac.uk/health-informatics/research/medical-information-ai-data-extractor-miade) project is developed by a team of clinicians, developers, AI researchers, and data standard experts at [University College London (UCL)](https://www.ucl.ac.uk/health-informatics/) and the [University College London Hospitals (UCLH)](https://www.uclhospitals.brc.nihr.ac.uk/clinical-research-informatics-unit), in collaboration with the [Cogstack](https://cogstack.org/) team at King's College London (KCL). 
\ No newline at end of file diff --git a/docs/api-reference/annotator.md b/docs/api-reference/annotator.md new file mode 100644 index 0000000..18b0243 --- /dev/null +++ b/docs/api-reference/annotator.md @@ -0,0 +1 @@ +::: miade.annotators.Annotator \ No newline at end of file diff --git a/docs/api-reference/concept.md b/docs/api-reference/concept.md new file mode 100644 index 0000000..79ad06f --- /dev/null +++ b/docs/api-reference/concept.md @@ -0,0 +1,3 @@ +::: miade.concept.Concept +## Category +::: miade.concept.Category \ No newline at end of file diff --git a/docs/api-reference/dosage.md b/docs/api-reference/dosage.md new file mode 100644 index 0000000..5cfab20 --- /dev/null +++ b/docs/api-reference/dosage.md @@ -0,0 +1 @@ +::: miade.dosage.Dosage \ No newline at end of file diff --git a/docs/api-reference/dosageextractor.md b/docs/api-reference/dosageextractor.md new file mode 100644 index 0000000..4da9149 --- /dev/null +++ b/docs/api-reference/dosageextractor.md @@ -0,0 +1 @@ +::: miade.dosageextractor.DosageExtractor \ No newline at end of file diff --git a/docs/api-reference/medsallergiesannotator.md b/docs/api-reference/medsallergiesannotator.md new file mode 100644 index 0000000..ea4af57 --- /dev/null +++ b/docs/api-reference/medsallergiesannotator.md @@ -0,0 +1 @@ +::: miade.annotators.MedsAllergiesAnnotator \ No newline at end of file diff --git a/docs/api-reference/metaannotations.md b/docs/api-reference/metaannotations.md new file mode 100644 index 0000000..08f74d8 --- /dev/null +++ b/docs/api-reference/metaannotations.md @@ -0,0 +1 @@ +::: miade.metaannotations.MetaAnnotations \ No newline at end of file diff --git a/docs/api-reference/note.md b/docs/api-reference/note.md new file mode 100644 index 0000000..a29487e --- /dev/null +++ b/docs/api-reference/note.md @@ -0,0 +1 @@ +::: miade.note.Note \ No newline at end of file diff --git a/docs/api-reference/noteprocessor.md b/docs/api-reference/noteprocessor.md new file mode 100644 index 
0000000..f145c9f --- /dev/null +++ b/docs/api-reference/noteprocessor.md @@ -0,0 +1 @@ +::: miade.core.NoteProcessor \ No newline at end of file diff --git a/docs/api-reference/problemsannotator.md b/docs/api-reference/problemsannotator.md new file mode 100644 index 0000000..14b7429 --- /dev/null +++ b/docs/api-reference/problemsannotator.md @@ -0,0 +1 @@ +::: miade.annotators.ProblemsAnnotator \ No newline at end of file diff --git a/docs/assets/miade-logo-small.png b/docs/assets/miade-logo-small.png new file mode 100644 index 0000000..996594a Binary files /dev/null and b/docs/assets/miade-logo-small.png differ diff --git a/docs/assets/miade-logo.png b/docs/assets/miade-logo.png new file mode 100644 index 0000000..8c5e5e5 Binary files /dev/null and b/docs/assets/miade-logo.png differ diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..ca1e575 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,3 @@ +# Contributing + +Contribute to MiADE! [Contribution guide](https://github.com/uclh-criu/miade/blob/master/CONTRIBUTING.md) \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..cb08e2a --- /dev/null +++ b/docs/index.md @@ -0,0 +1,28 @@ +# Welcome to the MiADE Documentation + +![](assets/miade-logo.png) + +MiADE (Medical information AI Data Extractor) is a set of tools for extracting formattable data from clinical notes stored in electronic health record systems (EHRs). Powered by Cogstack's [MedCAT](https://github.com/CogStack/MedCAT). 
+ +## Installing + +```bash +pip install miade +``` + +You may also need to download these additional models to run MiADE: + +[spaCy](https://spacy.io/models/en) +```bash +python -m spacy download en_core_web_md +``` +[med7](https://huggingface.co/kormilitzin/en_core_med7_lg) +```bash +pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl +``` + +## License + +MiADE is licensed under [Elastic License 2.0](https://www.elastic.co/licensing/elastic-license). + +The Elastic License 2.0 is a flexible license that allows you to use, copy, distribute, make available, and prepare derivative works of the software, as long as you do not provide the software to others as a managed service or include it in a free software directory. For the full license text, see our [license page](https://github.com/uclh-criu/miade/blob/master/LICENCE.md). \ No newline at end of file diff --git a/docs/user-guide/configurations.md b/docs/user-guide/configurations.md new file mode 100644 index 0000000..5d1e194 --- /dev/null +++ b/docs/user-guide/configurations.md @@ -0,0 +1,64 @@ +# Configurations + +## Annotator +The MiADE processor is configured by a `yaml` file that maps a human-readable key for each of your models to a MedCAT model ID and an MiADE `Annotator`. The config file must be in the same folder as the MedCAT models. + +**Required** + +- `models`: The models section maps human-readable key-value pairing to the MedCAT model ID to use in MiADE +- `annotators`: The annotators section maps human-readable key-value pairing to `Annotator` processing classes to use in MiADE + +**Optional** + + - `lookup_data_path`: Specifies the lookup data to use. If `None` a default MiADE set will be used. + - `negation_detection`: `negex` (default rule-based algorithm) or `None` (use MetaCAT models) + - `structured_list_limit`: Specifies the maximum number of concepts detected in a structured paragraph section. 
If there are more than the set number of concepts in a structured list, then concepts detected in prose are ignored (prioritises concepts detected in structured lists over free-form text to avoid returning too many irrelevant concepts). Default `100` so this feature is essentially disabled. + - `disable`: Disable any specific postprocessing pipeline components - the usage here is similar to [spaCy pipelines](https://spacy.io/usage/processing-pipelines#disabling). + - `add_numbering`: Option to add a number prefix to the concept display names e.g. "01 Diabetes" + + +```yaml title="config.yaml" +models: + problems: f25ec9423958e8d6 + meds/allergies: a146c741501cf1f7 +annotators: + problems: ProblemsAnnotator + meds/allergies: MedsAllergiesAnnotator +general: + problems: + lookup_data_path: ./custom_lookup_data/ + structured_list_limit: 0 # setting as 0 will ignore all concepts found in prose + add_numbering: True + meds/allergies: + disable: ["vtm_converter"] +``` +The default configurations for annotators are defined below: + +::: miade.utils.annotatorconfig.AnnotatorConfig + +## Lookup Table + +Lookup tables are used to convert and filter concepts in the MiADE postprocessing steps for `ProblemsAnnotator` and `MedsAllergiesAnnotator`. We have packaged default lookup data (curated and used at UCLH) with MiADE for sample use. + +For a more detailed explanation on the creation and format of the lookup data, check out [miade-dataset](https://github.com/uclh-criu/miade-datasets/tree/master). + +To customise your own lookup tables, you can pass in a directory which contains your lookup data in the `config.yaml` `lookup_data_path` field. Note you currently need to have **ALL** of the required lookup data in your directory (this will be improved in the future). 
+ +**Problems** +``` +negated.csv +historic.csv +suspected.csv +problem_blacklist.csv +``` + +**MedsAllergies** +``` +reactions_subset.csv +allergens_subset.csv +allergy_type.csv +valid_meds.csv +vtm_to_text.csv +vtm_to_vmp.csv +``` + diff --git a/docs/user-guide/cookbook.md b/docs/user-guide/cookbook.md new file mode 100644 index 0000000..adc8ea1 --- /dev/null +++ b/docs/user-guide/cookbook.md @@ -0,0 +1,3 @@ +# Cookbook + +Coming soon! \ No newline at end of file diff --git a/docs/user-guide/quickstart.md b/docs/user-guide/quickstart.md new file mode 100644 index 0000000..6b70560 --- /dev/null +++ b/docs/user-guide/quickstart.md @@ -0,0 +1,228 @@ +# Quickstart +## Extract concepts and dosages from a Note using MiADE + +### Configuring the MiADE Processor +`NoteProcessor` is the MiADE core. It is initialised with a model directory path that contains all the MedCAT model pack `.zip` files we would like to use in our pipeline, and a `config.yaml` file that maps an alias to the model IDs and annotators we would like to use (model IDs can be found in MedCAT `model_cards` or usually will be in the name). + +An example project structure may look like this: +``` +your_project/ +├── model_directory/ +│ ├── medcat_problems_modelpack_f25ec9423958e8d6.zip +│ ├── medcat_meds_modelpack_a146c741501cf1f7.zip +│ └── config.yaml +└── miade_driver_code.py +``` + +```yaml title="config.yaml" +models: + problems: f25ec9423958e8d6 + meds/allergies: a146c741501cf1f7 +annotators: + problems: ProblemsAnnotator + meds/allergies: MedsAllergiesAnnotator +``` +We can initialise the MiADE `NoteProcessor` by passing in the model directory which contains our MedCAT models and `config.yaml` file: + +```python +miade = NoteProcessor(Path("path/to/model/dir")) +``` +Once `NoteProcessor` is initialised, we can add annotators by the aliases we have specified in `config.yaml` to our processor. 
Each annotator wraps around a MedCAT model and performs additional postprocessing pipeline steps: + +```python +miade.add_annotator("problems") +miade.add_annotator("meds/allergies") +``` +By default annotators will add [negSpacy](https://spacy.io/universe/project/negspacy) to MedCAT, which implements the negEx algorithm ([Chapman et al. 2001](https://www.sciencedirect.com/science/article/pii/S1532046401910299)) for negation detection. This allows the models to perform simple rule-based negation detection in the absence of trained MetaCAT models. You can disable this in the [configurations](configurations.md#configurations) if you wish to use your own MetaCAT instead. + +### Creating a Note + +Create a `Note` object which contains the text we would like to extract concepts and dosages from: + +```python +text = """ +Suspected heart failure + +PMH: +prev history of Hypothyroidism +MI 10 years ago + + +Current meds: +Losartan 100mg daily +Atorvastatin 20mg daily +Paracetamol 500mg tablets 2 tabs qds prn + +Allergies: +Penicillin - rash + +Referred with swollen ankles and shortness of breath since 2 weeks. +""" + +note = Note(text) +``` + +### Extracting Concepts and Dosages + +MiADE can extract concepts in any code system you train your MedCAT models on. Each concept is code system-agnostic and contains: + +- `name`: name of concept +- `id`: concept ID +- `category`: type of concept e.g. problems, medications +- `start`: start index of concept span +- `end`: end index of concept span +- `dosage`: for medication concepts +- `negex`: Negex result if configured +- `meta`: Meta annotations if MetaCAT models are used + +The dosages associated with medication concepts are extracted by the built-in MiADE `DosageExtractor`, using a combination of the NER model [med7](https://github.com/kormilitzin/med7) and the [CALIBER rule-based drug dose lookup algorithm](https://rdrr.io/rforge/CALIBERdrugdose/).
It returns the dosage information in a format that can be easily translated to HL7 standards such as [CDA](https://www.hl7.org/implement/standards/product_brief.cfm?product_id=7) and [FHIR](https://www.hl7.org/fhir/overview.html): + +- `dose` +- `duration` +- `frequency` +- `route` + +Putting it all together, we can now extract concepts from our `Note` object: + +=== "as Concept object" + ```python + concepts = miade.process(note) + for concept in concepts: + print(concept) + + # SNOMED CT codes + # {name: breaking out - eruption, id: 271807003, category: Category.REACTION, start: 204, end: 208, dosage: None, negex: False, meta: None} + # {name: penicillin, id: 764146007, category: Category.ALLERGY, start: 191, end: 201, dosage: None, negex: False, meta: None} + ``` +=== "as Dict" + ```python + concepts = miade.get_concept_dicts(note) + print(concepts) + + # [{'name': 'hypothyroidism (historic)', + # 'id': '161443002', + # 'category': 'PROBLEM', + # 'start': 46, + # 'end': 60, + # 'dosage': None, + # 'negex': False, + # 'meta': [{'name': 'relevance', + # 'value': 'HISTORIC', + # 'confidence': 0.999841570854187}, + # ... + ``` + +#### Handling existing records: deduplication + +MiADE is built to handle existing medication records from EHR systems that can be sent alongside the note. It will perform basic deduplication matching on IDs for existing record concepts. +```python +# create list of concepts that already exist in the patient record +record_concepts = [ + Concept(id="161443002", name="hypothyroidism (historic)", category=Category.PROBLEM), + Concept(id="267039000", name="swollen ankle", category=Category.PROBLEM) +] +``` + +We can pass in a list of existing concepts from the EHR to MiADE at runtime: + +```python +miade.process(note=note, record_concepts=record_concepts) +``` + +## Customising MiADE +### Training Custom MedCAT Models +MiADE provides command line interface scripts for automatically building MedCAT model packs.
This includes the unsupervised training and supervised training steps of MedCAT models, and the training and packaging of MetaCAT models, which perform additional context detection using a Bi-LSTM model. For more information on MedCAT models, see MedCAT [documentation](https://github.com/CogStack/MedCAT) and [paper](https://arxiv.org/abs/2010.01165). + +The ```--synthetic-data-path``` option allows you to add synthetically generated training data in CSV format to the supervised and MetaCAT training steps. The CSV should have the following format: + +| text | cui | name | start | end | relevance | presence | laterality | +| ----------------------------- | ----------------- | -------------------------- | ----- | --- | --------- | --------- | -------------------- | +| no history of liver failure | 59927004 | hepatic failure | 14 | 26 | historic | negated | none + + +```bash +# Trains unsupervised training step of MedCAT model +miade train $MODEL_PACK_PATH $TEXT_DATA_PATH --tag "miade-example" +``` +```bash +# Trains supervised training step of MedCAT model +miade train-supervised $MODEL_PACK_PATH $MEDCAT_JSON_EXPORT --synthetic-data-path $SYNTHETIC_CSV_PATH +``` +```bash +# Creates BBPE tokenizer for MetaCAT +miade create-bbpe-tokenizer $TEXT_DATA_PATH +``` +```bash +# Initialises MetaCAT models to do training on +miade create-metacats $TOKENIZER_PATH $CATEGORY_NAMES +``` +```bash +# Trains the MetaCAT Bi-LSTM models +miade train-metacats $METACAT_MODEL_PATH $MEDCAT_JSON_EXPORT --synthetic-data-path $SYNTHETIC_CSV_PATH +``` +```bash +# Packages MetaCAT models with the main MedCAT model pack +miade add_metacat_models $MODEL_PACK_PATH $METACAT_MODEL_PATH +``` +### Creating Custom MiADE Annotators + +We can add custom annotators with more specialised postprocessing steps to MiADE by subclassing `Annotator` and initialising `NoteProcessor` with a list of custom annotators. 
+ +Built-in `Annotator` pipeline methods include: + +- `"preprocessor"`: performs basic text cleaning and extracts structural information from the note +- `"medcat"`: returns MedCAT output as MiADE `Concepts` +- `"dosage_extractor"`: uses the MiADE built-in `DosageExtractor` to add dosages associated with medication concepts +- `"deduplicator"`: filters duplicate concepts in the list + +You must specify the type of concepts your custom annotator returns (see [Category](../api-reference/concept.md#category)), a pipeline processing order, and implement a `postprocess()` function. An example custom `Annotator` class might look like this: + +```python +class CustomAnnotator(Annotator): + def __init__(self, cat: MiADE_CAT): + super().__init__(cat) + self.reactions = ["271807003"] + self.allergens = ["764146007"] + + @property + def concept_types(self) -> List[Category]: + return [Category.MEDICATION, Category.ALLERGY] + + @property + def pipeline(self) -> List[str]: + return ["preprocessor", "medcat", "postprocessor", "dosage_extractor", "deduplicator"] + + def postprocess(self, concepts: List[Concept]) -> List[Concept]: + # some example post-processing code + for concept in concepts: + if concept.id in self.reactions: + concept.category = Category.REACTION + elif concept.id in self.allergens: + concept.category = Category.ALLERGY + return concepts +``` + +Add the custom annotator to the config file: + + +```yaml title="config.yaml" +models: + problems: f25ec9423958e8d6 + meds/allergies: a146c741501cf1f7 + custom: a146c741501cf1f7 +annotators: + problems: ProblemsAnnotator + meds/allergies: MedsAllergiesAnnotator + custom: CustomAnnotator +``` + +Initialise MiADE with the custom annotator: + +```python +miade = NoteProcessor(Path("path/to/model/dir"), custom_annotators=[CustomAnnotator]) +miade.add_annotator("custom") +``` + +## Going further + +Check out our [cookbook](cookbook.md#cookbook)!
\ No newline at end of file diff --git a/lookup_data/regex_para_chunk.csv b/lookup_data/regex_para_chunk.csv deleted file mode 100644 index d21cef7..0000000 --- a/lookup_data/regex_para_chunk.csv +++ /dev/null @@ -1,10 +0,0 @@ -paragraph,regex -prob,^(patient |)(current |final |hospital |active inpatient |complete |active |acute |inpatient |clinical |ongoing |in-patient |)(prob|probs|problem|problems|diag|diagnosis|diagnoses|issue|issues|this admission)( list|) -pmh,^(hx|pmhx|pmh|background|medical background|past medical history|past psychiatric history|past surgical history|past issues this admission|past med hist|bg) -med,^(home |current |active |outpatient |gp |current outpatient |)(med|meds|medications|drug|drugs|rx) -allergy,^(drug |med |medication |)(allerg|allergies|allergies|allergies and intolerances|intolerances|adverse effects|adverse reactions|adverse reaction risk) -history,^(pc|hpc|presenting complaint|history of presenting complaint|history|hist|synopsis|summary|clinical summary) -exam,^(exam|examination|o/e|o / e|oe) -ddx,^(diff|differential|differential diagnosis|ddx) -imp,^(imp|impression|diagnosis|formulation|diag|dx|psychiatric formulation|clinical summary impression|clinical summary / impression) -plan,^(plan|recommendations|recommendation|action|actions|goal|goals|advice|decision) diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..ffc71da --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,74 @@ +site_name: MiADE +site_description: A set of tools for extracting formattable data from clinical notes stored in electronic health record systems. 
+ +# Repository +repo_name: uclh-criu/miade +repo_url: https://github.com/uclh-criu/miade + +# Configuration for the theme +theme: + name: material + logo: assets/miade-logo-small.png + favicon: assets/miade-logo-small.png + palette: + primary: teal + accent: red + features: + - navigation.tabs + - navigation.tabs.sticky + - navigation.instant + - navigation.sections + - navigation.expand + - navigation.path + - navigation.indexes + - toc.integrate + - content.code.copy + - content.code.select + +# Extensions +markdown_extensions: + - admonition + - toc: + permalink: true + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + +# Pages +nav: + - Home: index.md + - User Guide: + - Quickstart: user-guide/quickstart.md + - Configurations: user-guide/configurations.md + - Cookbook: user-guide/cookbook.md + - API Reference: + - Pipeline Components: + - NoteProcessor: api-reference/noteprocessor.md + - Annotator: api-reference/annotator.md + - ProblemsAnnotator: api-reference/problemsannotator.md + - MedsAllergiesAnnotator: api-reference/medsallergiesannotator.md + - DosageExtractor: api-reference/dosageextractor.md + - Objects: + - Note: api-reference/note.md + - Concept: api-reference/concept.md + - Dosage: api-reference/dosage.md + - MetaAnnotations: api-reference/metaannotations.md + - About: + - Overview: about/overview.md + - Team: about/team.md + - Community: contributing.md + +plugins: + - search + - mkdocstrings: + default_handler: python + handlers: + python: + rendering: + show_source: true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ffde221..35c61d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,8 +8,8 @@ build-backend = "setuptools.build_meta" [project] name = "miade" authors = [ - {name = "James Brandreth", email = "j.brandreth@ucl.ac.uk"}, - {name = 
"Jennifer Jiang", email= "jennifer.jiang.13@ucl.ac.uk"} + {name = "Jennifer Jiang-Kells", email= "jennifer.jiang.13@ucl.ac.uk"}, + {name = "James Brandreth", email = "j.brandreth@ucl.ac.uk"} ] description = "A set of tools for extracting formattable data from clinical notes stored in electronic health record systems." requires-python = ">=3.8" @@ -26,12 +26,28 @@ dependencies = [ "pydantic>=1.10.0", # compatibility with spacy "negspacy>=1.0.3", ] +readme = "README.md" +keywords = ["nlp", "natural-language-processing", "ml", "ehr", "electronic-health-records", "ai", "health", "healthcare"] +classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Healthcare Industry", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Operating System :: OS Independent", +] + +[project.urls] +Homepage = "https://github.com/uclh-criu/miade" +Documentation = "https://uclh-criu.github.io/miade/" [project.optional-dependencies] dev = [ "pytest>=7.1.3", - "jupyterlab>=3.4.7", - "black>=22.12.0", + "ruff>=0.5.2", ] dashboard = [ "streamlit>=1.20.0", diff --git a/src/miade/annotators.py b/src/miade/annotators.py index 7b78f95..90719c2 100644 --- a/src/miade/annotators.py +++ b/src/miade/annotators.py @@ -1,5 +1,7 @@ +import io import os import logging +import pkgutil import re from enum import Enum @@ -43,23 +45,41 @@ class AllergenType(Enum): ANIMAL = "animal" -def load_lookup_data(filename: str, as_dict: bool = False, no_header: bool = False): - if not os.path.exists(filename): - log.error(f"Lookup data not configured, check {filename} exists!") +def load_lookup_data(filename: str, is_package_data: bool = False, as_dict: bool = False, no_header: bool = False): + """ + Load lookup data from a CSV file. 
+ + Args: + filename (str): The path to the CSV file. + is_package_data (bool, optional): If True, indicates that the filename is a package data file. Defaults to False. + as_dict (bool, optional): If True, return the data as a dictionary. Defaults to False. + no_header (bool, optional): If True, assume the CSV file has no header. Defaults to False. + + Returns: + pandas.DataFrame or dict: The loaded data. + + Raises: + FileNotFoundError: If the specified file does not exist. + """ + if is_package_data: + lookup_data = pkgutil.get_data(__name__, filename) + lookup_data = io.BytesIO(lookup_data) + else: + lookup_data = filename if as_dict: return ( pd.read_csv( - filename, + lookup_data, index_col=0, ) .squeeze("columns") .T.to_dict() ) if no_header: - return pd.read_csv(filename, header=None) + return pd.read_csv(lookup_data, header=None) else: - return pd.read_csv(filename).drop_duplicates() + return pd.read_csv(lookup_data).drop_duplicates() def load_regex_paragraph_mappings(data: pd.DataFrame) -> Dict: @@ -78,8 +98,24 @@ def load_regex_paragraph_mappings(data: pd.DataFrame) -> Dict: return regex_lookup -def load_allergy_type_combinations(filename: str) -> Dict: - df = pd.read_csv(filename) +def load_allergy_type_combinations(filename: str, is_package_data: bool = False) -> Dict: + """ + Load allergy type combinations from a CSV file and return a dictionary. + + Args: + filename (str): The path to the CSV file containing the allergy type combinations. + + Returns: + dict: A dictionary where the keys are tuples of (allergenType, adverseReactionType) + and the values are tuples of (reaction_id, reaction_name).
+ """ + if is_package_data: + lookup_data = pkgutil.get_data(__name__, filename) + lookup_data = io.BytesIO(lookup_data) + else: + lookup_data = filename + + df = pd.read_csv(lookup_data) # Convert 'allergenType' and 'adverseReactionType' columns to lowercase df["allergenType"] = df["allergenType"].str.lower() @@ -96,11 +132,15 @@ def load_allergy_type_combinations(filename: str) -> Dict: def get_dosage_string(med: Concept, next_med: Optional[Concept], text: str) -> str: """ - Finds chunks of text that contain single dosage instructions to input into DosageProcessor - :param med: (Concept) medications concept - :param next_med: (Concept) next consecutive medication concept if there is one - :param text: (str) whole text - :return: (str) dosage text + Finds chunks of text that contain single dosage instructions to input into DosageProcessor. + + Args: + med (Concept): The medications concept. + next_med (Concept, optional): The next consecutive medication concept if there is one. + text (str): The whole text. + + Returns: + str: The dosage text. """ sents = sent_regex.findall(text[med.start : next_med.start] if next_med is not None else text[med.start :]) @@ -123,15 +163,18 @@ def get_dosage_string(med: Concept, next_med: Optional[Concept], text: str) -> s def calculate_word_distance(start1: int, end1: int, start2: int, end2: int, note: Note) -> int: """ - Calculates how many words are in between the given text spans based on character indices. - :param start1: Character index of the start of word 1. - :param end1: Character index of the end of word 1. - :param start2: Character index of the start of word 2. - :param end2: Character index of the end of word 2. - :param note: Note object that contains the whole text. - :return: Number of words between the two text spans. - """ + Calculates the number of words between two text spans based on character indices. + Args: + start1 (int): Character index of the start of word 1. 
+ end1 (int): Character index of the end of word 1. + start2 (int): Character index of the start of word 2. + end2 (int): Character index of the end of word 2. + note (Note): Note object that contains the whole text. + + Returns: + int: Number of words between the two text spans. + """ if start1 > end1 or start2 > end2: return -1 # Invalid input: start index should be less than or equal to the end index @@ -154,7 +197,13 @@ def calculate_word_distance(start1: int, end1: int, start2: int, end2: int, note class Annotator(ABC): """ - Docstring for Annotator + An abstract base class for annotators. + + Annotators are responsible for processing medical notes and extracting relevant concepts from them. + + Attributes: + cat (CAT): The MedCAT instance used for concept extraction. + config (AnnotatorConfig): The configuration for the annotator. """ def __init__(self, cat: CAT, config: AnnotatorConfig = None): @@ -179,6 +228,9 @@ def __init__(self, cat: CAT, config: AnnotatorConfig = None): self.irrelevant_paragraphs = [ParagraphType.ddx, ParagraphType.exam, ParagraphType.plan] def _add_negex_pipeline(self) -> None: + """ + Adds the negex pipeline to the MedCAT instance. + """ self.cat.pipe.spacy_nlp.add_pipe("sentencizer") self.cat.pipe.spacy_nlp.enable_pipe("sentencizer") self.cat.pipe.spacy_nlp.add_pipe("negex") @@ -190,22 +242,41 @@ def _load_paragraph_regex(self) -> None: @property @abstractmethod def concept_types(self): + """ + Abstract property that should return a list of concept types supported by the annotator. + """ pass @property @abstractmethod def pipeline(self): - pass - - @abstractmethod - def process_paragraphs(self): + """ + Abstract property that should return a list of pipeline steps for the annotator. + """ pass @abstractmethod def postprocess(self): + """ + Abstract method that should implement the logic for post-processing extracted concepts. 
+ """ pass - def run_pipeline(self, note: Note, record_concepts: List[Concept]) -> List[Concept]: + def run_pipeline( + self, note: Note, record_concepts: List[Concept], dosage_extractor: Optional[DosageExtractor] = None + ) -> List[Concept]: + """ + Runs the annotation pipeline on a given note and returns the extracted concepts. + + Args: + note (Note): The input note to process. + record_concepts (List[Concept]): The list of concepts from existing EHR records. + dosage_extractor (Optional[DosageExtractor]): An optional dosage extractor to add dosages to concepts. + + Returns: + List[Concept]: The extracted concepts from the note. + """ + # TODO: make this more extensible concepts: List[Concept] = [] for pipe in self.pipeline: @@ -220,10 +291,21 @@ def run_pipeline(self, note: Note, record_concepts: List[Concept]) -> List[Conce concepts = self.postprocess(concepts) elif pipe == "deduplicator": concepts = self.deduplicate(concepts, record_concepts) + elif pipe == "dosage_extractor" and dosage_extractor is not None: + concepts = self.add_dosages_to_concepts(dosage_extractor, concepts, note) return concepts def get_concepts(self, note: Note) -> List[Concept]: + """ + Extracts concepts from a note using the MedCAT instance. + + Args: + note (Note): The input note to extract concepts from. + + Returns: + The extracted concepts from the note. + """ concepts: List[Concept] = [] for entity in self.cat.get_entities(note)["entities"].values(): try: @@ -235,6 +317,15 @@ def get_concepts(self, note: Note) -> List[Concept]: return concepts def preprocess(self, note: Note) -> Note: + """ + Preprocesses a note by cleaning its text and splitting it into paragraphs. + + Args: + note (Note): The input note to preprocess. + + Returns: + The preprocessed note. 
+ """ note.clean_text() note.get_paragraphs(self.paragraph_regex) @@ -242,6 +333,16 @@ def preprocess(self, note: Note) -> Note: @staticmethod def deduplicate(concepts: List[Concept], record_concepts: Optional[List[Concept]]) -> List[Concept]: + """ + Removes duplicate concepts from the extracted concepts list by strict ID matching. + + Args: + concepts (List[Concept]): The list of extracted concepts. + record_concepts (Optional[List[Concept]]): The list of concepts from existing EHR records. + + Returns: + The deduplicated list of concepts. + """ if record_concepts is not None: record_ids = {record_concept.id for record_concept in record_concepts} record_names = {record_concept.name for record_concept in record_concepts} @@ -266,8 +367,47 @@ def deduplicate(concepts: List[Concept], record_concepts: Optional[List[Concept] return filtered_concepts + @staticmethod + def add_dosages_to_concepts( + dosage_extractor: DosageExtractor, concepts: List[Concept], note: Note + ) -> List[Concept]: + """ + Gets dosages for medication concepts + + Args: + dosage_extractor (DosageExtractor): The dosage extractor object + concepts (List[Concept]): List of concepts extracted + note (Note): The input note + + Returns: + List of concepts with dosages for medication concepts + """ + + for ind, concept in enumerate(concepts): + next_med_concept = concepts[ind + 1] if len(concepts) > ind + 1 else None + dosage_string = get_dosage_string(concept, next_med_concept, note.text) + if len(dosage_string.split()) > 2: + concept.dosage = dosage_extractor(dosage_string) + concept.category = Category.MEDICATION if concept.dosage is not None else None + if concept.dosage is not None: + log.debug( + f"Extracted dosage for medication concept " + f"({concept.id} | {concept.name}): {concept.dosage.text} {concept.dosage.dose}" + ) + + return concepts + @staticmethod def add_numbering_to_name(concepts: List[Concept]) -> List[Concept]: + """ + Adds numbering to the names of problem concepts to control 
output ordering. + + Args: + concepts (List[Concept]): The list of concepts to add numbering to. + + Returns: + The list of concepts with numbering added to their names. + """ # Prepend numbering to problem concepts e.g. 00 asthma, 01 stroke... for i, concept in enumerate(concepts): concept.name = f"{i:02} {concept.name}" @@ -278,8 +418,23 @@ def __call__( self, note: Note, record_concepts: Optional[List[Concept]] = None, - ): - concepts = self.run_pipeline(note, record_concepts) + dosage_extractor: Optional[DosageExtractor] = None, + ) -> List[Concept]: + """ + Runs the annotation pipeline on a given note and returns the extracted concepts. + + Args: + note (Note): The input note to process. + record_concepts (Optional[List[Concept]]): The list of concepts from existing EHR records. + dosage_extractor (Optional[DosageExtractor]): The dosage extractor to use for extracting dosage information. + + Returns: + List[Concept]: The extracted concepts from the note. + """ + if dosage_extractor is not None: + concepts = self.run_pipeline(note, record_concepts, dosage_extractor) + else: + concepts = self.run_pipeline(note, record_concepts) if self.config.add_numbering: concepts = self.add_numbering_to_name(concepts) @@ -288,30 +443,89 @@ def __call__( class ProblemsAnnotator(Annotator): + """ + Annotator class for identifying and processing problems in medical notes. + + This class extends the base `Annotator` class and provides specific functionality + for identifying and processing problems in medical notes. It implements methods + for loading problem lookup data, processing meta annotations, filtering concepts, + and post-processing the annotated concepts. + + Attributes: + cat (CAT): The CAT (Concept Annotation Tool) instance used for annotation. + config (AnnotatorConfig): The configuration object for the annotator. + + Properties: + concept_types (list): A list of concept types supported by this annotator. 
+ pipeline (list): The list of processing steps in the annotation pipeline. + """ + def __init__(self, cat: CAT, config: AnnotatorConfig = None): super().__init__(cat, config) self._load_problems_lookup_data() @property - def concept_types(self): + def concept_types(self) -> List[Category]: + """ + Get the list of concept types supported by this annotator. + + Returns: + [Category.PROBLEM] + """ return [Category.PROBLEM] @property - def pipeline(self): + def pipeline(self) -> List[str]: + """ + Get the list of processing steps in the annotation pipeline. + + Returns: + ["preprocessor", "medcat", "paragrapher", "postprocessor", "deduplicator"] + """ return ["preprocessor", "medcat", "paragrapher", "postprocessor", "deduplicator"] def _load_problems_lookup_data(self) -> None: - if not os.path.isdir(self.config.lookup_data_path): - raise RuntimeError(f"No lookup data configured: {self.config.lookup_data_path} does not exist!") + """ + Load the problem lookup data. Load prepackaged lookups if lookup_data_path is None. + + Raises: + RuntimeError: If the lookup data directory does not exist. 
+ """ + if self.config.lookup_data_path is None: + data_path = "./data/" + is_package_data = True + log.info("Loading preconfigured lookup data for ProblemsAnnotator") else: - self.negated_lookup = load_lookup_data(self.config.lookup_data_path + "negated.csv", as_dict=True) - self.historic_lookup = load_lookup_data(self.config.lookup_data_path + "historic.csv", as_dict=True) - self.suspected_lookup = load_lookup_data(self.config.lookup_data_path + "suspected.csv", as_dict=True) - self.filtering_blacklist = load_lookup_data( - self.config.lookup_data_path + "problem_blacklist.csv", no_header=True - ) + data_path = self.config.lookup_data_path + is_package_data = False + log.info(f"Loading lookup data from {data_path} for ProblemsAnnotator") + if not os.path.isdir(data_path): + raise RuntimeError(f"No lookup data configured: {data_path} does not exist!") + + self.negated_lookup = load_lookup_data(data_path + "negated.csv", is_package_data=is_package_data, as_dict=True) + self.historic_lookup = load_lookup_data( + data_path + "historic.csv", is_package_data=is_package_data, as_dict=True + ) + self.suspected_lookup = load_lookup_data( + data_path + "suspected.csv", is_package_data=is_package_data, as_dict=True + ) + self.filtering_blacklist = load_lookup_data( + data_path + "problem_blacklist.csv", is_package_data=is_package_data, no_header=True + ) def _process_meta_annotations(self, concept: Concept) -> Optional[Concept]: + """ + Process the meta annotations for a concept. + + Args: + concept (Concept): The concept to process. + + Returns: + The processed concept, or None if it should be removed. + + Raises: + ValueError: If the concept has an invalid negex value. 
+ """ # Add, convert, or ignore concepts meta_ann_values = [meta_ann.value for meta_ann in concept.meta] if concept.meta is not None else [] @@ -375,6 +589,15 @@ def _process_meta_annotations(self, concept: Concept) -> Optional[Concept]: return concept def _is_blacklist(self, concept): + """ + Check if a concept is in the filtering blacklist. + + Args: + concept: The concept to check. + + Returns: + True if the concept is in the blacklist, False otherwise. + """ # filtering blacklist if int(concept.id) in self.filtering_blacklist.values: log.debug(f"Removed concept ({concept.id} | {concept.name}): concept in problems blacklist") @@ -384,6 +607,14 @@ def _is_blacklist(self, concept): def _process_meta_ann_by_paragraph( self, concept: Concept, paragraph: Paragraph, prob_concepts_in_structured_sections: List[Concept] ): + """ + Process the meta annotations for a concept based on the paragraph type. + + Args: + concept (Concept): The concept to process. + paragraph (Paragraph): The paragraph containing the concept. + prob_concepts_in_structured_sections (List[Concept]): The list of problem concepts in structured sections. + """ # if paragraph is structured problems section, add to prob list and convert to corresponding relevance if paragraph.type in self.structured_prob_lists: prob_concepts_in_structured_sections.append(concept) @@ -408,6 +639,16 @@ def _process_meta_ann_by_paragraph( meta.value = Relevance.IRRELEVANT def process_paragraphs(self, note: Note, concepts: List[Concept]) -> List[Concept]: + """ + Process the paragraphs in a note and filter the concepts. + + Args: + note (Note): The note to process. + concepts (List[Concept]): The list of concepts to filter. + + Returns: + The filtered list of concepts. 
+ """ prob_concepts_in_structured_sections: List[Concept] = [] for paragraph in note.paragraphs: @@ -429,6 +670,15 @@ def process_paragraphs(self, note: Note, concepts: List[Concept]) -> List[Concep return concepts def postprocess(self, concepts: List[Concept]) -> List[Concept]: + """ + Post-process the concepts and filter out irrelevant concepts. + + Args: + concepts (List[Concept]): The list of concepts to post-process. + + Returns: + The filtered list of concepts. + """ # deepcopy so we still have reference to original list of concepts all_concepts = deepcopy(concepts) filtered_concepts = [] @@ -446,16 +696,46 @@ def postprocess(self, concepts: List[Concept]) -> List[Concept]: class MedsAllergiesAnnotator(Annotator): + """ + Annotator class for medication and allergy concepts. + + This class extends the `Annotator` base class and provides methods for running a pipeline of + annotation tasks on a given note, as well as validating and converting concepts related to + medications and allergies. + + Attributes: + valid_meds (List[int]): A list of valid medication IDs. + reactions_subset_lookup (Dict[int, str]): A dictionary mapping reaction IDs to their corresponding subset IDs. + allergens_subset_lookup (Dict[int, str]): A dictionary mapping allergen IDs to their corresponding subset IDs. + allergy_type_lookup (Dict[str, List[str]]): A dictionary mapping allergen types to their corresponding codes. + vtm_to_vmp_lookup (Dict[str, str]): A dictionary mapping VTM (Virtual Therapeutic Moiety) IDs to VMP (Virtual Medicinal Product) IDs. + vtm_to_text_lookup (Dict[str, str]): A dictionary mapping VTM IDs to their corresponding text. + """ + def __init__(self, cat: CAT, config: AnnotatorConfig = None): super().__init__(cat, config) self._load_med_allergy_lookup_data() @property - def concept_types(self): + def concept_types(self) -> List[Category]: + """ + Returns a list of concept types. 
+ + Returns: + [Category.MEDICATION, Category.ALLERGY, Category.REACTION] + """ return [Category.MEDICATION, Category.ALLERGY, Category.REACTION] @property - def pipeline(self): + def pipeline(self) -> List[str]: + """ + Returns a list of annotators in the pipeline. + + The annotators are executed in the order they appear in the list. + + Returns: + ["preprocessor", "medcat", "paragrapher", "postprocessor", "dosage_extractor", "vtm_converter", "deduplicator"] + """ return [ "preprocessor", "medcat", @@ -469,6 +749,17 @@ def pipeline(self): def run_pipeline( self, note: Note, record_concepts: List[Concept], dosage_extractor: Optional[DosageExtractor] ) -> List[Concept]: + """ + Runs the annotation pipeline on the given note. + + Args: + note (Note): The input note to run the pipeline on. + record_concepts (List[Concept]): The list of previously recorded concepts. + dosage_extractor (Optional[DosageExtractor]): The dosage extractor function. + + Returns: + The list of annotated concepts. + """ concepts: List[Concept] = [] for pipe in self.pipeline: @@ -483,9 +774,7 @@ def run_pipeline( concepts = self.postprocess(concepts, note) elif pipe == "deduplicator": concepts = self.deduplicate(concepts, record_concepts) - elif pipe == "add_numbering": - concepts = self.add_numbering_to_name(concepts) - elif pipe == "VTM_converter": + elif pipe == "vtm_converter": concepts = self.convert_VTM_to_VMP_or_text(concepts) elif pipe == "dosage_extractor" and dosage_extractor is not None: concepts = self.add_dosages_to_concepts(dosage_extractor, concepts, note) @@ -493,27 +782,62 @@ def run_pipeline( return concepts def _load_med_allergy_lookup_data(self) -> None: - if not os.path.isdir(self.config.lookup_data_path): - raise RuntimeError(f"No lookup data configured: {self.config.lookup_data_path} does not exist!") + """ + Loads the medication and allergy lookup data. 
+ """ + if self.config.lookup_data_path is None: + data_path = "./data/" + is_package_data = True + log.info("Loading preconfigured lookup data for MedsAllergiesAnnotator") else: - self.valid_meds = load_lookup_data(self.config.lookup_data_path + "valid_meds.csv", no_header=True) - self.reactions_subset_lookup = load_lookup_data( - self.config.lookup_data_path + "reactions_subset.csv", as_dict=True - ) - self.allergens_subset_lookup = load_lookup_data( - self.config.lookup_data_path + "allergens_subset.csv", as_dict=True - ) - self.allergy_type_lookup = load_allergy_type_combinations(self.config.lookup_data_path + "allergy_type.csv") - self.vtm_to_vmp_lookup = load_lookup_data(self.config.lookup_data_path + "vtm_to_vmp.csv") - self.vtm_to_text_lookup = load_lookup_data(self.config.lookup_data_path + "vtm_to_text.csv", as_dict=True) + data_path = self.config.lookup_data_path + is_package_data = False + log.info(f"Loading lookup data from {data_path} for MedsAllergiesAnnotator") + if not os.path.isdir(data_path): + raise RuntimeError(f"No lookup data configured: {data_path} does not exist!") + + self.valid_meds = load_lookup_data( + data_path + "valid_meds.csv", is_package_data=is_package_data, no_header=True + ) + self.reactions_subset_lookup = load_lookup_data( + data_path + "reactions_subset.csv", is_package_data=is_package_data, as_dict=True + ) + self.allergens_subset_lookup = load_lookup_data( + data_path + "allergens_subset.csv", is_package_data=is_package_data, as_dict=True + ) + self.allergy_type_lookup = load_allergy_type_combinations( + data_path + "allergy_type.csv", is_package_data=is_package_data + ) + self.vtm_to_vmp_lookup = load_lookup_data(data_path + "vtm_to_vmp.csv", is_package_data=is_package_data) + self.vtm_to_text_lookup = load_lookup_data( + data_path + "vtm_to_text.csv", is_package_data=is_package_data, as_dict=True + ) def _validate_meds(self, concept) -> bool: + """ + Validates if the concept is a valid medication. 
+ + Args: + concept: The concept to validate. + + Returns: + True if the concept is a valid medication, False otherwise. + """ # check if substance is valid med if int(concept.id) in self.valid_meds.values: return True return False def _validate_and_convert_substance(self, concept) -> bool: + """ + Validates and converts a substance concept for allergy. + + Args: + concept: The substance concept to be validated and converted. + + Returns: + True if the substance is valid and converted successfully, False otherwise. + """ # check if substance is valid substance for allergy - if it is, convert it to Epic subset and return that concept lookup_result = self.allergens_subset_lookup.get(int(concept.id)) if lookup_result is not None: @@ -539,6 +863,16 @@ def _validate_and_convert_substance(self, concept) -> bool: return False def _validate_and_convert_reaction(self, concept) -> bool: + """ + Validates and converts a reaction concept to the Epic subset. + + Args: + concept: The concept to be validated and converted. + + Returns: + True if the concept is a valid reaction and successfully converted to the Epic subset, + False otherwise. + """ # check if substance is valid reaction - if it is, convert it to Epic subset and return that concept lookup_result = self.reactions_subset_lookup.get(int(concept.id), None) if lookup_result is not None: @@ -553,6 +887,16 @@ def _validate_and_convert_reaction(self, concept) -> bool: return False def _validate_and_convert_concepts(self, concept: Concept) -> Concept: + """ + Validates and converts the given concept based on its metadata annotations. + + Args: + concept (Concept): The concept to be validated and converted. + + Returns: + The validated and converted concept. 
+ + """ meta_ann_values = [meta_ann.value for meta_ann in concept.meta] if concept.meta is not None else [] # assign categories @@ -583,33 +927,19 @@ def _validate_and_convert_concepts(self, concept: Concept) -> Concept: return concept @staticmethod - def add_dosages_to_concepts( - dosage_extractor: DosageExtractor, concepts: List[Concept], note: Note - ) -> List[Concept]: - """ - Gets dosages for medication concepts - :param dosage_extractor: - :param concepts: (List) list of concepts extracted - :param note: (Note) input note - :return: (List) list of concepts with dosages for medication concepts + def _link_reactions_to_allergens(concept_list: List[Concept], note: Note, link_distance: int = 5) -> List[Concept]: """ + Links reaction concepts to allergen concepts based on their proximity in the given concept list. - for ind, concept in enumerate(concepts): - next_med_concept = concepts[ind + 1] if len(concepts) > ind + 1 else None - dosage_string = get_dosage_string(concept, next_med_concept, note.text) - if len(dosage_string.split()) > 2: - concept.dosage = dosage_extractor(dosage_string) - concept.category = Category.MEDICATION if concept.dosage is not None else None - if concept.dosage is not None: - log.debug( - f"Extracted dosage for medication concept " - f"({concept.id} | {concept.name}): {concept.dosage.text} {concept.dosage.dose}" - ) + Args: + concept_list (List[Concept]): The list of concepts to search for reaction and allergen concepts. + note (Note): The note object containing the text. + link_distance (int, optional): The maximum distance between a reaction and an allergen to be considered linked. + Defaults to 5. - return concepts - - @staticmethod - def _link_reactions_to_allergens(concept_list: List[Concept], note: Note, link_distance: int = 5) -> List[Concept]: + Returns: + The updated concept list with reaction concepts removed and linked to their corresponding allergen concepts. 
+ """ allergy_concepts = [concept for concept in concept_list if concept.category == Category.ALLERGY] reaction_concepts = [concept for concept in concept_list if concept.category == Category.REACTION] @@ -661,6 +991,15 @@ def _link_reactions_to_allergens(concept_list: List[Concept], note: Note, link_d @staticmethod def _convert_allergy_severity_to_code(concept: Concept) -> bool: + """ + Converts allergy severity to corresponding codes and links them to the concept. + + Args: + concept (Concept): The concept to convert severity for. + + Returns: + True if the conversion is successful, False otherwise. + """ meta_ann_values = [meta_ann.value for meta_ann in concept.meta] if concept.meta is not None else [] if Severity.MILD in meta_ann_values: concept.linked_concepts.append(Concept(id="L", name="Low", category=Category.SEVERITY)) @@ -682,6 +1021,15 @@ def _convert_allergy_severity_to_code(concept: Concept) -> bool: return True def _convert_allergy_type_to_code(self, concept: Concept) -> bool: + """ + Converts the allergy type of a concept to a code and adds it as a linked concept. + + Args: + concept (Concept): The concept whose allergy type needs to be converted. + + Returns: + True if the conversion and linking were successful, False otherwise. + """ # get the ALLERGYTYPE meta-annotation allergy_type = [meta_ann for meta_ann in concept.meta if meta_ann.name == "allergy_type"] if len(allergy_type) != 1: @@ -716,6 +1064,16 @@ def _convert_allergy_type_to_code(self, concept: Concept) -> bool: return True def _process_meta_ann_by_paragraph(self, concept: Concept, paragraph: Paragraph): + """ + Process the meta annotations for a given concept and paragraph. + + Args: + concept (Concept): The concept object. + paragraph (Paragraph): The paragraph object. 
+ + Returns: + None + """ # if paragraph is structured meds to convert to corresponding relevance if paragraph.type in self.structured_med_lists: for meta in concept.meta: @@ -743,6 +1101,16 @@ def _process_meta_ann_by_paragraph(self, concept: Concept, paragraph: Paragraph) meta.value = SubstanceCategory.IRRELEVANT def process_paragraphs(self, note: Note, concepts: List[Concept]) -> List[Concept]: + """ + Process the paragraphs in a note and update the list of concepts. + + Args: + note (Note): The note object containing the paragraphs. + concepts (List[Concept]): The list of concepts to be updated. + + Returns: + The updated list of concepts. + """ for paragraph in note.paragraphs: for concept in concepts: if concept.start >= paragraph.start and concept.end <= paragraph.end: @@ -753,6 +1121,16 @@ def process_paragraphs(self, note: Note, concepts: List[Concept]) -> List[Concep return concepts def postprocess(self, concepts: List[Concept], note: Note) -> List[Concept]: + """ + Postprocesses a list of concepts and links reactions to allergens. + + Args: + concepts (List[Concept]): The list of concepts to be postprocessed. + note (Note): The note object associated with the concepts. + + Returns: + The postprocessed list of concepts. + """ # deepcopy so we still have reference to original list of concepts all_concepts = deepcopy(concepts) processed_concepts = [] @@ -766,6 +1144,16 @@ def postprocess(self, concepts: List[Concept], note: Note) -> List[Concept]: return processed_concepts def convert_VTM_to_VMP_or_text(self, concepts: List[Concept]) -> List[Concept]: + """ + Converts medication concepts from VTM (Virtual Therapeutic Moiety) to VMP (Virtual Medicinal Product) or text. + + Args: + concepts (List[Concept]): A list of medication concepts. + + Returns: + A list of medication concepts with updated IDs, names, and dosages. 
+ + """ # Get medication concepts med_concepts = [concept for concept in concepts if concept.category == Category.MEDICATION] self.vtm_to_vmp_lookup["dose"] = self.vtm_to_vmp_lookup["dose"].astype(float) @@ -826,16 +1214,3 @@ def convert_VTM_to_VMP_or_text(self, concepts: List[Concept]) -> List[Concept]: concept.name = lookup_result return concepts - - def __call__( - self, - note: Note, - record_concepts: Optional[List[Concept]] = None, - dosage_extractor: Optional[DosageExtractor] = None, - ): - concepts = self.run_pipeline(note, record_concepts, dosage_extractor) - - if self.config.add_numbering: - concepts = self.add_numbering_to_name(concepts) - - return concepts diff --git a/src/miade/concept.py b/src/miade/concept.py index 1e22036..74e134c 100644 --- a/src/miade/concept.py +++ b/src/miade/concept.py @@ -16,7 +16,20 @@ class Category(Enum): class Concept(object): - """docstring for Concept.""" + """Represents a concept in the system. + + Attributes: + id (str): The unique identifier of the concept. + name (str): The name of the concept. + category (Optional[Enum]): The category of the concept (optional). + start (Optional[int]): The start position of the concept (optional). + end (Optional[int]): The end position of the concept (optional). + dosage (Optional[Dosage]): The dosage of the concept (optional). + linked_concepts (Optional[List[Concept]]): The linked concepts of the concept (optional). + negex (Optional[bool]): The negex value of the concept (optional). + meta_anns (Optional[List[MetaAnnotations]]): The meta annotations of the concept (optional). + debug_dict (Optional[Dict]): The debug dictionary of the concept (optional). + """ def __init__( self, @@ -46,7 +59,16 @@ def __init__( self.linked_concepts = [] @classmethod - def from_entity(cls, entity: [Dict]): + def from_entity(cls, entity: Dict) -> Concept: + """ + Converts an entity dictionary into a Concept object. 
+ + Args: + entity (Dict): The entity dictionary containing the necessary information. + + Returns: + The Concept object created from the entity dictionary. + """ meta_anns = None if entity["meta_anns"]: meta_anns = [MetaAnnotations(**value) for value in entity["meta_anns"].values()] diff --git a/src/miade/core.py b/src/miade/core.py index 91d109f..6f4103c 100644 --- a/src/miade/core.py +++ b/src/miade/core.py @@ -19,13 +19,22 @@ log = logging.getLogger(__name__) -def create_annotator(name: str, model_factory: ModelFactory): +def create_annotator(name: str, model_factory: ModelFactory) -> Annotator: """ - Returns Annotator created from ModelFactory configs - :param name: (str) alias of model - :param model_factory: (ModelFactory) model factory loaded from config.yaml containing mapping of alias/name - to MedCAT model id and MiADE annotator - :return: Annotator + Returns Annotator created from ModelFactory configs. + + Args: + name (str): Alias of the model. + model_factory (ModelFactory): Model factory loaded from config.yaml containing mapping of alias/name + to MedCAT model id and MiADE annotator. + + Returns: + Annotator object created from the ModelFactory configs. + + Raises: + ValueError: If the MedCAT model for the given name does not exist, either because it is not configured + in config.yaml or missing from the models directory. 
+ """ name = name.lower() if name not in model_factory.models: @@ -47,10 +56,14 @@ class NoteProcessor: """ Main processor of MiADE which extract, postprocesses, and deduplicates concepts given annotators (MedCAT models), Note, and existing concepts - :param model_directory (Path) path to directory that contains medcat models and a config.yaml file - :param log_level (int) log level - Default - INFO - :param device (str) whether inference should be run on cpu or gpu - default "cpu" - :param custom_annotators (List[Annotators]) List of custom annotators + + Args: + model_directory (Path): Path to directory that contains medcat models and a config.yaml file + model_config_path (Path, optional): Path to the model config file. Defaults to None. + log_level (int, optional): Log level. Defaults to logging.INFO. + dosage_extractor_log_level (int, optional): Log level for dosage extractor. Defaults to logging.INFO. + device (str, optional): Device to run inference on (cpu or gpu). Defaults to "cpu". + custom_annotators (List[Annotator], optional): List of custom annotators. Defaults to None. """ def __init__( @@ -76,9 +89,11 @@ def __init__( def _load_config(self) -> Dict: """ - Loads configuration file (config.yaml) in configured model path, default to model directory if not - passed explicitly - :return: (Dict) config file + Loads the configuration file (config.yaml) in the configured model path. + If the model path is not explicitly passed, it defaults to the model directory. + + Returns: + A dictionary containing the loaded config file. 
""" if self.model_config_path is None: config_path = os.path.join(self.model_directory, "config.yaml") @@ -97,12 +112,18 @@ def _load_config(self) -> Dict: def _load_model_factory(self, custom_annotators: Optional[List[Annotator]] = None) -> ModelFactory: """ - Loads model factory which maps model alias to medcat model id and miade annotator - There could be a less redundant way to structure the model configs - for now, if it ain't broke... - :param custom_annotators (List[Annotators]) List of custom annotators to initialise - :return: ModelFactory object - """ + Loads the model factory which maps model aliases to MedCAT model IDs and MiADE annotators. + Args: + custom_annotators (List[Annotators], optional): List of custom annotators to initialize. Defaults to None. + + Returns: + The initialized ModelFactory object. + + Raises: + Exception: If there is an error loading MedCAT models. + + """ meta_cat_config_dict = {"general": {"device": self.device}} config_dict = self._load_config() loaded_models = {} @@ -168,9 +189,16 @@ def _load_model_factory(self, custom_annotators: Optional[List[Annotator]] = Non def add_annotator(self, name: str) -> None: """ - Adds annotators to processor - :param name: (str) alias of annotator to add - :return: None + Adds an annotator to the processor. + + Args: + name (str): The alias of the annotator to add. + + Returns: + None + + Raises: + Exception: If there is an error creating the annotator. """ try: annotator = create_annotator(name, self.model_factory) @@ -184,9 +212,13 @@ def add_annotator(self, name: str) -> None: def remove_annotator(self, name: str) -> None: """ - Removes annotators from processor - :param name: (str) alias of annotator to remove - :return: None + Removes an annotator from the processor. + + Args: + name (str): The alias of the annotator to remove. 
+ + Returns: + None """ annotator_found = False annotator_name = self.model_factory.annotators[name] @@ -201,11 +233,27 @@ def remove_annotator(self, name: str) -> None: if not annotator_found: log.warning(f"Annotator {type(name).__name__} not found in processor") - def print_model_cards(self): + def print_model_cards(self) -> None: + """ + Prints the model cards for each annotator in the `annotators` list. + + Each line shows the annotator's class name followed by its MedCAT model (`annotator.cat`). + """ for annotator in self.annotators: print(f"{type(annotator).__name__}: {annotator.cat}") def process(self, note: Note, record_concepts: Optional[List[Concept]] = None) -> List[Concept]: + """ + Process the given note and extract concepts using the loaded annotators. + + Args: + note (Note): The note to be processed. + record_concepts (Optional[List[Concept]]): A list of existing concepts in the EHR record. + + Returns: + A list of extracted concepts. + + """ if not self.annotators: log.warning("No annotators loaded, use .add_annotator() to load annotators") return [] @@ -227,11 +275,15 @@ def get_concept_dicts( self, note: Note, filter_uncategorized: bool = True, record_concepts: Optional[List[Concept]] = None ) -> List[Dict]: """ - Returns concepts in dictionary format - :param note: (Note) note containing text to extract concepts from - :param filter_uncategorized (bool) if True, does not return concepts where category=None, default TRUE - :param record_concepts: (List[Concepts] list of concepts in existing record - :return: List[Dict] extracted concepts in json compatible dict format + Returns concepts in dictionary format. + + Args: + note (Note): Note containing text to extract concepts from. + filter_uncategorized (bool): If True, does not return concepts where category=None. Default is True. + record_concepts (Optional[List[Concept]]): List of concepts in existing record. + + Returns: + Extracted concepts in JSON-compatible dictionary format.
""" concepts = self.process(note, record_concepts) concept_list = [] diff --git a/lookup_data/allergens_subset.csv b/src/miade/data/allergens_subset.csv similarity index 100% rename from lookup_data/allergens_subset.csv rename to src/miade/data/allergens_subset.csv diff --git a/lookup_data/allergy_type.csv b/src/miade/data/allergy_type.csv similarity index 100% rename from lookup_data/allergy_type.csv rename to src/miade/data/allergy_type.csv diff --git a/lookup_data/historic.csv b/src/miade/data/historic.csv similarity index 100% rename from lookup_data/historic.csv rename to src/miade/data/historic.csv diff --git a/lookup_data/negated.csv b/src/miade/data/negated.csv similarity index 100% rename from lookup_data/negated.csv rename to src/miade/data/negated.csv diff --git a/lookup_data/problem_blacklist.csv b/src/miade/data/problem_blacklist.csv similarity index 100% rename from lookup_data/problem_blacklist.csv rename to src/miade/data/problem_blacklist.csv diff --git a/lookup_data/reactions_subset.csv b/src/miade/data/reactions_subset.csv similarity index 100% rename from lookup_data/reactions_subset.csv rename to src/miade/data/reactions_subset.csv diff --git a/lookup_data/suspected.csv b/src/miade/data/suspected.csv similarity index 100% rename from lookup_data/suspected.csv rename to src/miade/data/suspected.csv diff --git a/lookup_data/valid_meds.csv b/src/miade/data/valid_meds.csv similarity index 100% rename from lookup_data/valid_meds.csv rename to src/miade/data/valid_meds.csv diff --git a/lookup_data/vtm_to_text.csv b/src/miade/data/vtm_to_text.csv similarity index 100% rename from lookup_data/vtm_to_text.csv rename to src/miade/data/vtm_to_text.csv diff --git a/lookup_data/vtm_to_vmp.csv b/src/miade/data/vtm_to_vmp.csv similarity index 100% rename from lookup_data/vtm_to_vmp.csv rename to src/miade/data/vtm_to_vmp.csv diff --git a/src/miade/dosage.py b/src/miade/dosage.py index 24b48f8..bc1fe98 100644 --- a/src/miade/dosage.py +++ b/src/miade/dosage.py 
@@ -64,11 +64,16 @@ class Route(BaseModel): def parse_dose(text: str, quantities: List[str], units: List[str], results: Dict) -> Optional[Dose]: """ - :param text: (str) string containing dose - :param quantities: (list) list of quantity entities NER - :param units: (list) list of unit entities from NER - :param results: (dict) dosage lookup results - :return: dose: (Dose) pydantic model containing dose in CDA format; returns None if inconclusive + Parses the dose information from the given text, quantities, units, and results. + + Args: + text (str): String containing the dose. + quantities (List[str]): List of quantity entities from NER. + units (List[str]): List of unit entities from NER. + results (Dict): Dosage lookup results. + + Returns: + Pydantic model containing the dose in CDA format. Returns None if inconclusive. """ quantity_dosage = Dose(source=text) @@ -148,9 +153,14 @@ def parse_dose(text: str, quantities: List[str], units: List[str], results: Dict def parse_frequency(text: str, results: Dict) -> Optional[Frequency]: """ - :param text: (str) processed text which the lookup is performed on - :param results: (dict) dosage lookup results - :return: dose: (Frequency) pydantic model containing frequency in CDA format; returns None if inconclusive + Parses the frequency of a dosage from processed text. + + Args: + text (str): The processed text on which the lookup is performed. + results (Dict): The dosage lookup results. + + Returns: + A Frequency object containing the frequency in CDA format. Returns None if inconclusive. 
""" # TODO: extract frequency range @@ -186,11 +196,16 @@ def parse_duration( text: str, results: Dict, total_dose: Optional[float], daily_dose: Optional[float] ) -> Optional[Duration]: """ - :param text: (str) string containing duration - :param results: (dict) dosage lookup results - :param total_dose: (float) total dose of the medication if extracted - :param daily_dose: (float) total dose of the medication in a day if extracted - :return: dose: (Duration) pydantic model containing duration in CDA format; returns None if inconclusive + Parses the duration of a medication dosage. + + Args: + text (str): String containing the duration. + results (dict): Dosage lookup results. + total_dose (float): Total dose of the medication if extracted. + daily_dose (float): Total dose of the medication in a day if extracted. + + Returns: + Pydantic model containing duration in CDA format; returns None if inconclusive. """ duration_dosage = Duration(source=text) @@ -216,9 +231,14 @@ def parse_duration( def parse_route(text: str, dose: Optional[Dose]) -> Optional[Route]: """ - :param text: (str) string containing route - :param dose: (Dose) dose object - :return: (Route) pydantic model containing route in CDA format; returns None if inconclusive + Parses the route from the given text and dose. + + Args: + text (str): String containing the route. + dose (Optional[Dose]): Dose object. + + Returns: + Pydantic model containing the route in CDA format. Returns None if inconclusive. """ # prioritise oral and inhalation route_dosage = Route(source=text) @@ -267,12 +287,16 @@ def __init__( @classmethod def from_doc(cls, doc: Doc, calculate: bool = True): """ - Parses dosage from a spacy doc object - :param doc: (Doc) spacy doc object with processed dosage text - :param calculate: (bool) whether to calculate duration if total and daily dose is given - :return: - """ + Parses dosage from a spacy doc object. + Args: + doc (Doc): Spacy doc object with processed dosage text. 
+ calculate (bool, optional): Whether to calculate duration if total and daily dose is given. Defaults to True. + + Returns: + An instance of the class with the parsed dosage information. + + """ quantities = [] units = [] dose_start = 1000 diff --git a/src/miade/dosageextractor.py b/src/miade/dosageextractor.py index d3269c6..592df44 100644 --- a/src/miade/dosageextractor.py +++ b/src/miade/dosageextractor.py @@ -15,6 +15,10 @@ class DosageExtractor: """ Parses and extracts drug dosage + + Attributes: + model (str): The name of the model to be used for dosage extraction. + dosage_extractor (Language): The Spacy pipeline for dosage extraction. """ def __init__(self, model: str = "en_core_med7_lg"): @@ -25,7 +29,9 @@ def _create_drugdoseade_pipeline(self) -> Language: """ Creates a spacy pipeline with given model (default med7) and customised pipeline components for dosage extraction - :return: nlp (spacy.Language) + + Returns: + nlp (spacy.Language): The Spacy pipeline for dosage extraction. """ nlp = spacy.load(self.model) nlp.add_pipe("preprocessor", first=True) @@ -39,9 +45,13 @@ def _create_drugdoseade_pipeline(self) -> Language: def extract(self, text: str, calculate: bool = True) -> Optional[Dosage]: """ Processes a string that contains dosage instructions (excluding drug concept as this is handled by core) - :param text: (str) string containing dosage - :param calculate: (bool) whether to calculate duration from total and daily dose, if given - :return: dosage: (Dosage) dosage object with parsed dosages in CDA format + + Args: + text (str): The string containing dosage instructions. + calculate (bool): Whether to calculate duration from total and daily dose, if given. + + Returns: + The dosage object with parsed dosages in CDA format. 
""" doc = self.dosage_extractor(text) diff --git a/src/miade/drugdoseade/entities_refiner.py b/src/miade/drugdoseade/entities_refiner.py index 48a67c3..224f7ed 100644 --- a/src/miade/drugdoseade/entities_refiner.py +++ b/src/miade/drugdoseade/entities_refiner.py @@ -1,5 +1,6 @@ import logging +from spacy.tokens import Doc from spacy.language import Language from spacy.tokens import Span @@ -8,8 +9,17 @@ @Language.component("entities_refiner") -def EntitiesRefiner(doc): - """Refines NER results""" +def EntitiesRefiner(doc) -> Doc: + """ + Refines NER results by merging consecutive labels with the same tag, + removing strength labels, and merging drug labels with dosage labels. + + Args: + doc (spacy.tokens.Doc): The input document containing named entities. + + Returns: + spacy.tokens.Doc: The refined document with updated named entities. + """ new_ents = [] for ind, ent in enumerate(doc.ents): diff --git a/src/miade/drugdoseade/pattern_matcher.py b/src/miade/drugdoseade/pattern_matcher.py index 316dee1..1e9a818 100644 --- a/src/miade/drugdoseade/pattern_matcher.py +++ b/src/miade/drugdoseade/pattern_matcher.py @@ -57,6 +57,16 @@ def __init__(self, nlp: Language, patterns: Dict): ) def __call__(self, doc: Doc) -> Doc: + """ + Process the given document and extract dosage information. + + Args: + doc (Doc): The input document to process. + + Returns: + The processed document with extracted dosage information. + + """ new_entities = [] dose_string = doc.text diff --git a/src/miade/drugdoseade/preprocessor.py b/src/miade/drugdoseade/preprocessor.py index 0168687..111b4b2 100644 --- a/src/miade/drugdoseade/preprocessor.py +++ b/src/miade/drugdoseade/preprocessor.py @@ -68,6 +68,16 @@ def __init__(self, nlp: Language, singleword: Dict, multiword: Dict): Doc.set_extension("original_text", default="") def __call__(self, doc: Doc) -> Doc: + """ + Preprocesses a spaCy `Doc` object by performing various text transformations. 
+ + Args: + doc (Doc): The input spaCy `Doc` object to be preprocessed. + + Returns: + The preprocessed spaCy `Doc` object. + + """ processed_text = [] # singleword replacement diff --git a/src/miade/drugdoseade/utils.py b/src/miade/drugdoseade/utils.py index 8f0ac7c..072ef55 100644 --- a/src/miade/drugdoseade/utils.py +++ b/src/miade/drugdoseade/utils.py @@ -7,9 +7,19 @@ log = logging.getLogger(__name__) -def word_replace(word: str, dictionary: Dict[str, str], processed_text: List[str]): - """Replaces words with entries from CALIBERdrugdose singleword dict""" +def word_replace(word: str, dictionary: Dict[str, str], processed_text: List[str]) -> List[str]: + """ + Replaces words with entries from CALIBERdrugdose singleword dict + Args: + word (str): The word to be replaced. + dictionary (Dict[str, str]): A dictionary containing word replacements. + processed_text (List[str]): A list to store the processed text. + + Returns: + The processed text with word replacements. + + """ replacement = dictionary.get(word, None) if isinstance(replacement, str): # replace with dict entry @@ -35,7 +45,17 @@ def word_replace(word: str, dictionary: Dict[str, str], processed_text: List[str return processed_text -def numbers_replace(text): +def numbers_replace(text) -> str: + """ + Replaces numbers and units in the given text according to specific patterns. + + Args: + text (str): The input text to be processed. + + Returns: + The processed text with numbers and units replaced. + + """ # 10 ml etc text = re.sub( r" (\d+) o (ml|microgram|mcg|gram|mg) ", diff --git a/src/miade/metaannotations.py b/src/miade/metaannotations.py index 9bdb16a..ed05f58 100644 --- a/src/miade/metaannotations.py +++ b/src/miade/metaannotations.py @@ -24,6 +24,15 @@ class MetaAnnotations(BaseModel): + """ + Represents a meta annotation with a name, value, and optional confidence. + + Attributes: + name (str): The name of the meta annotation. + value (Enum): The value of the meta annotation. 
+ confidence (float, optional): The confidence level of the meta annotation. + """ + name: str value: Enum confidence: Optional[float] diff --git a/src/miade/model_builders/preprocess_snomeduk.py b/src/miade/model_builders/preprocess_snomeduk.py index 11242e8..89f338a 100644 --- a/src/miade/model_builders/preprocess_snomeduk.py +++ b/src/miade/model_builders/preprocess_snomeduk.py @@ -1,5 +1,5 @@ """This module is essentially the same as the MedCAT util preprocess_snomed.py - with a few minor changes adapted to reading snomed UK folder paths""" +with a few minor changes adapted to reading snomed UK folder paths""" import os import re diff --git a/src/miade/note.py b/src/miade/note.py index 555b6f2..3264601 100644 --- a/src/miade/note.py +++ b/src/miade/note.py @@ -10,7 +10,15 @@ class Note(object): - """docstring for Note.""" + """ + Represents a note object. + + Attributes: + text (str): The text content of the note. + raw_text (str): The raw text content of the note. + regex_config (str): The path to the regex configuration file. + paragraphs (Optional[List[Paragraph]]): A list of paragraphs in the note. + """ # TODO: refactor paragraph methods to a separate class. It's too much. @@ -22,6 +30,13 @@ def __init__(self, text: str): self.numbered_list: List[tuple] = [] def clean_text(self) -> None: + """ + Cleans the text content of the note. + + This method performs various cleaning operations on the text content of the note, + such as replacing spaces, removing punctuation, and removing empty lines. + """ + # Replace all types of spaces with a single normal space, preserving "\n" self.text = re.sub(r"(?:(?!\n)\s)+", " ", self.text) diff --git a/src/miade/paragraph.py b/src/miade/paragraph.py index e58b950..2788d9b 100644 --- a/src/miade/paragraph.py +++ b/src/miade/paragraph.py @@ -15,6 +15,17 @@ class ParagraphType(Enum): class Paragraph(object): + """ + Represents a paragraph in a document. + + Attributes: + heading (str): The heading of the paragraph. 
+ body (str): The body text of the paragraph. + type (ParagraphType): The type of the paragraph. + start (int): The starting position of the paragraph. + end (int): The ending position of the paragraph. + """ + def __init__(self, heading: str, body: str, type: ParagraphType, start: int, end: int): self.heading: str = heading self.body: str = body diff --git a/src/miade/utils/annotatorconfig.py b/src/miade/utils/annotatorconfig.py index e545334..2e1a8b1 100644 --- a/src/miade/utils/annotatorconfig.py +++ b/src/miade/utils/annotatorconfig.py @@ -3,7 +3,7 @@ class AnnotatorConfig(BaseModel): - lookup_data_path: Optional[str] = "./lookup_data/" + lookup_data_path: Optional[str] = None negation_detection: Optional[str] = "negex" structured_list_limit: Optional[int] = 100 disable: List[str] = [] diff --git a/src/miade/utils/logger.py b/src/miade/utils/logger.py index a1cad54..787cc52 100644 --- a/src/miade/utils/logger.py +++ b/src/miade/utils/logger.py @@ -1,5 +1,5 @@ -"""Loggers -""" +"""Loggers""" + import logging diff --git a/tests/conftest.py b/tests/conftest.py index a1d17a9..ef6f4f5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -178,7 +178,7 @@ def test_clean_and_paragraphing_note() -> Note: return Note( """ This is an example of text with various types of spaces: -\tTabs, \u00A0Non-breaking spaces, \u2003Em spaces, \u2002En spaces. +\tTabs, \u00a0Non-breaking spaces, \u2003Em spaces, \u2002En spaces. Some lines may contain only punctuation and spaces, like this: !? ... - -- ??? 
\n diff --git a/tests/data/models/config.yaml b/tests/data/models/config.yaml index 502301c..6083240 100644 --- a/tests/data/models/config.yaml +++ b/tests/data/models/config.yaml @@ -8,12 +8,6 @@ annotators: custom: CustomAnnotator general: problems: - lookup_data_path: "./lookup_data/" - negation_detection: negex # negex or metacat or none - disable: [] add_numbering: True meds/allergies: - lookup_data_path: "./lookup_data/" - negation_detection: None - disable: [] - add_numbering: False + lookup_data_path: "./src/miade/data/" diff --git a/tests/test_annotator.py b/tests/test_annotator.py index 4d61b3a..4cd4ea2 100644 --- a/tests/test_annotator.py +++ b/tests/test_annotator.py @@ -215,3 +215,19 @@ def test_vtm_med_conversions(test_meds_algy_medcat_model, test_vtm_concepts): duration=None, route=None, ) + + +def test_annotator_config(test_meds_algy_medcat_model, test_problems_medcat_model, test_config): + # check that all loads ok if pass in explicit path + test_config.lookup_data_path = "./src/miade/data/" + + meds_annotator = MedsAllergiesAnnotator(test_meds_algy_medcat_model, test_config) + assert meds_annotator.allergens_subset_lookup + assert meds_annotator.reactions_subset_lookup + assert meds_annotator.allergy_type_lookup + + probs_annotator = ProblemsAnnotator(test_problems_medcat_model, test_config) + assert probs_annotator.historic_lookup + assert probs_annotator.negated_lookup + assert probs_annotator.suspected_lookup + assert probs_annotator.filtering_blacklist is not None