From 494e9386f54a3fd9b432f9347271ffea98a7c7f5 Mon Sep 17 00:00:00 2001 From: Thomas Luechtefeld Date: Tue, 16 Apr 2024 08:06:10 -0400 Subject: [PATCH] add tables to sync, inc version --- setup.py | 2 +- sysrev/client.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b49dfc3..b1de944 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='sysrev', - version='1.3.1', + version='1.3.2', description='get sysrev project data and use the sysrev api', long_description=long_description, long_description_content_type='text/markdown', # Specify the content type here diff --git a/sysrev/client.py b/sysrev/client.py index 46c230a..22a5f40 100644 --- a/sysrev/client.py +++ b/sysrev/client.py @@ -111,6 +111,23 @@ def sync(self, client, project_id): article_data_df = pd.DataFrame(article_data) article_data_df['resolve'] = article_data_df['resolve'].apply(json.dumps) + article_info = [] + for article_id in tqdm.tqdm(article_data_df['article-id'], total=n_articles): + article_info.append(client.get_article_info(project_id, article_id)) + + full_texts = pd.DataFrame([{**ft} for a in article_info for ft in a['article'].get('full-texts', []) ]) + full_texts.columns = [col.split('/')[-1] for col in full_texts.columns] + + auto_labels = pd.DataFrame([ + {**{'article-id': a['article'].get('article-id'), 'label-id': label_id}, **details} for a in article_info + for label_id, details in a['article'].get('auto-labels', {}).items() ]) + auto_labels['answer'] = auto_labels['answer'].apply(json.dumps) + + csl_citations = pd.DataFrame([ + {**{k: json.dumps(v) if isinstance(v, (dict, list)) else v for k, v in item['itemData'].items()}, + 'article-id': a['article'].get('article-id')} + for a in article_info for item in a['article'].get('csl-citation', {}).get('citationItems', [])]) + # write everything to .sr/sr.sqlite conn = sqlite3.connect('.sr/sr.sqlite') @@ -118,6 +135,9 @@ def sync(self, client, project_id): labels_df.to_sql('labels', conn, if_exists='replace', index=False) article_label_df.to_sql('article_label', conn, if_exists='replace', index=False) article_data_df.to_sql('article_data', conn, if_exists='replace', index=False) + full_texts.to_sql('full_texts', conn, if_exists='replace', index=False) + auto_labels.to_sql('auto_labels', conn, if_exists='replace', index=False) + csl_citations.to_sql('csl_citations', conn, if_exists='replace', index=False) conn.close() class Client(): @@ -185,7 +205,8 @@ def get_article_info(self, project_id, article_id): endpoint = f"{self.base_url}/api-json/article-info/{article_id}" headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"} body = {"project-id": project_id,} - return requests.get(endpoint, headers=headers, json=body) + response = requests.get(endpoint, headers=headers, json=body) + return response.json()['result'] def upload_jsonlines(self, file_path, project_id): url = f"{self.base_url}/api-json/import-files/{project_id}"