diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml
new file mode 100644
index 0000000..59a7355
--- /dev/null
+++ b/.github/workflows/pypi_publish.yml
@@ -0,0 +1,31 @@
+name: Publish Python package to PyPI
+
+on:
+  push:
+    branches:
+      - master  # Set this to your default branch
+
+jobs:
+  build-and-publish:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.9'  # Use the Python version compatible with your project
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install build
+
+    - name: Build package
+      run: python -m build
+
+    - name: Publish to PyPI
+      if: github.event_name == 'push' && startsWith(github.ref, 'refs/heads/master')
+      uses: pypa/gh-action-pypi-publish@v1.4.2
+      with:
+        password: ${{ secrets.PYPI_TOKEN }}
+        user: __token__  # PyPI API tokens authenticate with the literal username __token__ (assumes PYPI_TOKEN is an API token)
diff --git a/.github/workflows/sysrev.yml.old b/.github/workflows/sysrev.yml.old
new file mode 100644
index 0000000..444838b
--- /dev/null
+++ b/.github/workflows/sysrev.yml.old
@@ -0,0 +1,23 @@
+name: Biobricks codecov
+
+on: [push, pull_request]
+
+env:
+  BIOBRICKS_TEST_TOKEN: ${{ secrets.BIOBRICKS_TEST_TOKEN }}
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    name: Test biobricks
+    steps:
+      - uses: actions/checkout@v2 # Updated to use v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.10'
+      - name: Install requirements
+        run: pip install -r requirements.txt # Removed the working-directory override for this step
+      - name: Run tests and collect coverage
+        run: pytest --cov=./ --cov-report=xml
+        working-directory: ./ # Set the working directory only for the test step
+      - name: Upload coverage reports to Codecov with GitHub Action
+        uses: codecov/codecov-action@v3
diff --git a/PySysrev/__init__.py b/PySysrev/__init__.py
deleted file mode 100644
index 87ffb0d..0000000
--- a/PySysrev/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .funcs import getAnnotations
-from .funcs import processAnnotations
diff --git a/PySysrev/funcs.py b/PySysrev/funcs.py
deleted file mode 100644
index 3cda01c..0000000
--- a/PySysrev/funcs.py
+++ /dev/null
@@ -1,70 +0,0 @@
-from __future__ import unicode_literals, print_function
-import pandas
-import requests
-from pathlib import Path
-
-def getAnnotations(project_id):
-    url = 'https://sysrev.com/web-api/project-annotations?project-id=' + str(project_id)
-    response = requests.get(url)
-    result = response.json()["result"]
-
-    data = {}
-    data['selection'] = [x['selection'] for x in result]
-    data['annotation'] = [x['annotation'] for x in result]
-    data['semantic_class'] = [x['semantic-class'] for x in result]
-    data['external_id'] = [x['pmid'] for x in result]
-    data['sysrev_id'] = [x['article-id'] for x in result]
-    data['text'] = [x['context']['text-context'] for x in result]
-    data['start'] = [x['context']['start-offset'] if 'start-offset' in list(x['context'].keys()) else None for x in result]
-    data['end'] = [x['context']['end-offset'] if 'end-offset' in list(x['context'].keys()) else None for x in result]
-    data['datasource'] = ['pubmed'] * len(result)
-
-    df = pandas.DataFrame.from_dict(data)
-    return df
-
-def processAnnotations(project_id, label):
-
-    def remove_overlapping_entities(df):
-        idx_to_remove = []
-        for text_id in df.text.unique():
-            all_ranges = []
-            sub_df = df[df['text'] == text_id][['sysrev_id', 'text', 'start', 'end']]
-            for index, row in sub_df.iterrows():
-                r_start = int(row['start'])
-                r_end = int(row['end'])
-                if all([True if x not in all_ranges else False for x in range(r_start, r_end)]):
-                    all_ranges.extend(list(range(r_start, r_end)))
-                else:
-                    idx_to_remove.append(index)
-        return idx_to_remove
-
-    df = getAnnotations(project_id)
-    df = df.drop_duplicates(subset=['text', 'start', 'end'])
-    df = df[(df['start'].notnull()) & (df['end'].notnull())]
-    df = df[df['end'] - df['start'] < 50]
-    df = df.reset_index(drop=True)
-    overlapping_idx = remove_overlapping_entities(df)
-    df = df.drop(df.index[overlapping_idx])
-    annotations = df.to_dict('records')
-
-    def process_annotation(annotation):
-        return [annotation["text"], {"entities": [(int(annotation["start"]), int(annotation["end"]),label)]}]
-
-    processed_annotations = list(map(process_annotation, annotations))
-
-    def combine_annotations(processed_annotations):
-        combined_annotations = {}
-        for text,entities in processed_annotations:
-            if combined_annotations.get(text) is None:
-                combined_annotations[text] = []
-            combined_annotations[text].append(entities["entities"][0])
-        for key in combined_annotations:
-            combined_annotations[key] = list(set(combined_annotations[key]))
-        return combined_annotations
-
-    combined_processed_annotations = combine_annotations(processed_annotations)
-    final_json = []
-    for k in combined_processed_annotations:
-        final_json.append([k,{"entities":combined_processed_annotations[k]}])
-
-    return final_json
diff --git a/README b/README
index 68d4b89..909924d 100644
--- a/README
+++ b/README
@@ -1,9 +1,4 @@
-A Python client for sysrev.com
-
-See Demo.ipynb for implementation of client on example project.
-
-To install PySysrev, simply run `pip install PySysrev`
-
-Then inside Python, run the following commands:
->>> import PySysrev
->>> df = PySysrev.getAnnotations(3144)
+# Sysrev Python Client
+The Sysrev package provides:
+1. `SysrevClient` object for using the sysrev.com API
+2. (soon) a method for synchronizing local sysrev projects with remote sysrev projects.
\ No newline at end of file
diff --git a/setup.py b/setup.py
index b40384a..77b1c29 100644
--- a/setup.py
+++ b/setup.py
@@ -1,12 +1,12 @@
 from setuptools import setup
 
-setup(name='PySysrev',
-      version='1.2.1',
+setup(name='sysrev',
+      version='1.2.2',
       description='Gets annotations from Sysrev API',
       url='https://github.com/sysrev/PySysrev',
-      author='nole-lin',
-      author_email='nole@insilica.co',
-      packages=['PySysrev'],
+      author='Thomas Luechtefeld',
+      author_email='tom@insilica.co',
+      packages=['sysrev'],
       install_requires=[
           'pandas',
           'requests',
diff --git a/sysrev/__init__.py b/sysrev/__init__.py
new file mode 100644
index 0000000..3cd14b9
--- /dev/null
+++ b/sysrev/__init__.py
@@ -0,0 +1 @@
+from .funcs import SysrevClient
diff --git a/sysrev/funcs.py b/sysrev/funcs.py
new file mode 100644
index 0000000..72c2ed8
--- /dev/null
+++ b/sysrev/funcs.py
@@ -0,0 +1,122 @@
+import os
+
+import requests
+
+class LabelTransformer:
+    """Normalize raw label values before they are sent by `SysrevClient.set_labels`."""
+
+    def handle_boolean(self, label_value):
+        """Return a bool for a bool or a case-insensitive 'yes'/'no'; raise ValueError otherwise."""
+        if isinstance(label_value, bool):
+            return label_value
+        elif str(label_value).lower() in ['yes', 'no']:
+            return str(label_value).lower() == 'yes'
+        else:
+            raise ValueError("Invalid boolean value")
+
+    def handle_categorical_or_string(self, label_value):
+        """Return a list of strings: wrap a single str, pass through a list of str; raise ValueError otherwise."""
+        if isinstance(label_value, str):
+            return [label_value]
+        elif isinstance(label_value, list) and all(isinstance(item, str) for item in label_value):
+            return label_value
+        else:
+            raise ValueError("Invalid value for categorical or string type")
+
+    def transform_label(self, label_type, label_value):
+        """Dispatch on label_type ('boolean', 'categorical' or 'string'); raise ValueError for any other type."""
+        if label_type == 'boolean':
+            return self.handle_boolean(label_value)
+        elif label_type in ['categorical', 'string']:
+            return self.handle_categorical_or_string(label_value)
+        else:
+            raise ValueError("Invalid label type")
+
+class SysrevClient():
+    """Thin HTTP client for the Sysrev JSON API; every request sends api_key as a Bearer token."""
+
+    def __init__(self, api_key, base_url="https://www.sysrev.com"):
+        self.api_key = api_key
+        self.base_url = base_url
+
+    def get_project_info(self, project_id):
+        """GET api-json/project-info for project_id and return the decoded JSON body."""
+        endpoint = f"{self.base_url}/api-json/project-info"
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        response = requests.get(endpoint, headers=headers, params={"project-id": project_id})
+        return response.json()
+
+    def set_labels(self, project_id, article_id, label_ids, label_values, label_types, confirm=False, change=False, resolve=False):
+        """POST label values for one article and return the decoded JSON body.
+
+        label_ids, label_values and label_types are parallel sequences; each value is
+        normalized by LabelTransformer according to its type before being sent.
+        """
+        endpoint = f"{self.base_url}/api-json/set-labels"
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+
+        assert len(label_ids) == len(label_values) == len(label_types), "Length of label_ids, label_values, and label_types should be the same."
+
+        # construct label_values_dict
+        tf = LabelTransformer()
+        label_values_dict = {label_ids[i]: tf.transform_label(label_types[i], label_values[i]) for i in range(len(label_ids))}
+
+        # Constructing the data payload as per the server's expectation
+        data = {"project-id": project_id, "article-id": article_id, "label-values": label_values_dict}
+        data.update({ "confirm?": confirm, "change?": change, "resolve?": resolve })
+
+        # Sending a POST request to the server
+        response = requests.post(endpoint, json=data, headers=headers)
+        return response.json()
+
+    def get_project_articles(self, project_id, offset=0, limit=10, sort_by=None, sort_dir=None):
+        """POST api-json/project-articles for one page (n-offset/n-count) and return the decoded JSON body."""
+        endpoint = f"{self.base_url}/api-json/project-articles"
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        body = {"project-id": project_id, "n-offset": offset, "n-count": limit}
+
+        # Add optional sorting keys if provided
+        if sort_by: body["sort-by"] = sort_by
+        if sort_dir: body["sort-dir"] = sort_dir
+
+        # Make the POST request with the simplified body
+        response = requests.post(endpoint, headers=headers, json=body)
+        return response.json()
+
+    def fetch_all_articles(self, project_id, limit=10, sort_by=None, sort_dir=None):
+        """Yield every article of the project, paging with get_project_articles until a page has no 'result'."""
+        offset = 0
+        while True:
+            result = self.get_project_articles(project_id, offset=offset, limit=limit, sort_by=sort_by, sort_dir=sort_dir)
+            articles = result.get('result', [])
+            if not articles:
+                break  # Stop iteration if no articles are left
+            yield from articles  # Yield each article in the current batch
+            offset += len(articles)
+
+    def get_article_info(self, project_id, article_id):
+        """GET api-json/article-info/{article_id}, sending project-id as a JSON body; return the raw requests.Response."""
+        endpoint = f"{self.base_url}/api-json/article-info/{article_id}"
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        body = {"project-id": project_id,}
+        return requests.get(endpoint, headers=headers, json=body)
+
+    def upload_jsonlines(self, file_path, project_id):
+        """POST the file at file_path as a multipart upload to api-json/import-files/{project_id}; return the raw Response."""
+        url = f"{self.base_url}/api-json/import-files/{project_id}"
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+
+        # Prepare the file for upload
+        with open(file_path, 'rb') as f:
+            # basename honours the platform's path separator and accepts os.PathLike, unlike split('/')
+            files = {'file': (os.path.basename(file_path), f, 'application/octet-stream')}
+            # Let requests handle "Content-Type"
+            response = requests.post(url, headers=headers, files=files)
+
+        return response
+
+    def get_article_file(self, project_id, article_id, hash):
+        """GET the article file identified by hash and return the raw requests.Response."""
+        url = f"{self.base_url}/api-json/files/{project_id}/article/{article_id}/download/{hash}"
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        return requests.get(url, headers=headers)
diff --git a/test.py b/test.py
deleted file mode 100644
index dda65db..0000000
--- a/test.py
+++ /dev/null
@@ -1 +0,0 @@
-import PySysrev