Skip to content

Commit

Permalink
add a sync function for building a sqlite db
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlue committed Apr 15, 2024
1 parent 0cf5ad4 commit 841e65e
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,7 @@ __pycache__/
/dist/
/build/
/sysrev.egg-info/

scratch.py
.env
.sr
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name='sysrev',
version='1.2.4',
version='1.3.1',
description='get sysrev project data and use the sysrev api',
long_description=long_description,
long_description_content_type='text/markdown', # Specify the content type here
Expand Down
106 changes: 104 additions & 2 deletions sysrev/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import requests
import requests, sqlite3, pathlib, pandas as pd, json, tqdm

class LabelTransformer:

Expand All @@ -25,19 +25,121 @@ def transform_label(self, label_type, label_value):
return self.handle_categorical_or_string(label_value)
else:
raise ValueError("Invalid label type")

class Synchronizer:
    """Mirror a Sysrev project into a local SQLite file at ``.sr/sr.sqlite``.

    ``create_sqlite_db`` lays down an empty schema; ``sync`` downloads the
    project's labels and articles through a client object and writes them
    into the same file.
    """

    @staticmethod
    def create_sqlite_db():
        """Create ``.sr/sr.sqlite`` with empty tables and indexes if missing.

        Safe to call repeatedly: the directory is created with
        ``exist_ok=True`` and every statement uses ``IF NOT EXISTS``.
        Declared as a static method so it can be called on the class or on
        an instance.
        """
        pathlib.Path(".sr").mkdir(exist_ok=True)
        conn = sqlite3.connect('.sr/sr.sqlite')
        try:
            c = conn.cursor()

            # article_data is created first because article_label references it.
            c.execute('''
                CREATE TABLE IF NOT EXISTS article_data (
                    primary_title TEXT,
                    consensus INTEGER,
                    article_id TEXT PRIMARY KEY,
                    updated_time TEXT,
                    notes TEXT,
                    resolve INTEGER
                );
            ''')

            # NOTE(review): label_id is declared INTEGER here, while
            # Client.get_labels takes label ids from JSON object keys
            # (strings) -- confirm the intended column type.
            c.execute('''
                CREATE TABLE IF NOT EXISTS labels (
                    label_id INTEGER PRIMARY KEY,
                    label_id_local TEXT,
                    category TEXT,
                    definition TEXT,
                    name TEXT,
                    consensus INTEGER,
                    question TEXT,
                    project_ordering INTEGER,
                    short_label TEXT,
                    label_id_global TEXT,
                    root_label_id_local TEXT,
                    global_label_id TEXT,
                    project_id INTEGER,
                    enabled INTEGER,
                    value_type TEXT,
                    required INTEGER,
                    owner_project_id INTEGER
                );
            ''')

            # article_label links an article to a label answer; it holds
            # foreign keys to both tables created above.
            c.execute('''
                CREATE TABLE IF NOT EXISTS article_label (
                    article_id TEXT,
                    label_id INTEGER,
                    user_id INTEGER,
                    answer TEXT,
                    inclusion INTEGER,
                    updated_time TEXT,
                    confirm_time TEXT,
                    resolve INTEGER,
                    PRIMARY KEY (article_id, label_id),
                    FOREIGN KEY (label_id) REFERENCES labels (label_id),
                    FOREIGN KEY (article_id) REFERENCES article_data (article_id)
                );
            ''')

            # Indexes for improved query performance.
            c.execute('CREATE INDEX IF NOT EXISTS idx_labels_project_id ON labels (project_id);')
            c.execute('CREATE INDEX IF NOT EXISTS idx_article_label_user_id ON article_label (user_id);')

            conn.commit()
        finally:
            # Release the file handle even when a statement above fails.
            conn.close()

    def sync(self, client, project_id):
        """Download a project and write it to ``.sr/sr.sqlite``.

        Args:
            client: object providing ``get_project_info(project_id)``,
                ``get_labels(project_id)`` and
                ``fetch_all_articles(project_id)``.
            project_id: identifier of the project to download.

        The three tables ``labels``, ``article_label`` and ``article_data``
        are written with ``if_exists='replace'``, so any previous contents
        (and any schema created by ``create_sqlite_db``) are dropped and the
        column types are inferred by pandas.
        """
        project_info = client.get_project_info(project_id)

        labels_df = pd.DataFrame(client.get_labels(project_id))
        # Serialised to JSON text before being written to SQLite.
        labels_df['definition'] = labels_df['definition'].apply(json.dumps)

        # The article count is only used to size the progress bar.
        n_articles = project_info['result']['project']['stats']['articles']
        articles = list(tqdm.tqdm(client.fetch_all_articles(project_id), total=n_articles))

        # Flatten the per-article label lists, skipping articles whose
        # 'labels' entry is None.
        article_labels = [
            lbl
            for a in articles
            if a['labels'] is not None
            for lbl in a['labels']
        ]
        article_label_df = pd.DataFrame(article_labels)

        # Everything except the nested labels becomes the article row.
        article_data = [{k: v for k, v in a.items() if k != 'labels'} for a in articles]
        article_data_df = pd.DataFrame(article_data)
        article_data_df['resolve'] = article_data_df['resolve'].apply(json.dumps)

        # sqlite3.connect does not create missing parent directories, so make
        # sure .sr/ exists before opening the database.
        pathlib.Path(".sr").mkdir(exist_ok=True)
        conn = sqlite3.connect('.sr/sr.sqlite')
        try:
            labels_df.to_sql('labels', conn, if_exists='replace', index=False)
            article_label_df.to_sql('article_label', conn, if_exists='replace', index=False)
            article_data_df.to_sql('article_data', conn, if_exists='replace', index=False)
            conn.commit()
        finally:
            conn.close()
class Client():

def __init__(self, api_key, base_url="https://www.sysrev.com"):
self.api_key = api_key
self.base_url = base_url

def sync(self, project_id):
    """Hand this client and ``project_id`` to a new ``Synchronizer``.

    Returns ``None``; the result of ``Synchronizer.sync`` is not passed on.
    """
    synchronizer = Synchronizer()
    synchronizer.sync(self, project_id)

def get_project_info(self, project_id):
    """Fetch the ``project-info`` document for ``project_id``.

    Sends a GET to ``{base_url}/api-json/project-info`` with the API key as
    a bearer token and ``project-id`` as a query parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    endpoint = f"{self.base_url}/api-json/project-info"
    headers = {"Authorization": f"Bearer {self.api_key}"}
    response = requests.get(endpoint, headers=headers, params={"project-id": project_id})
    # Fail here with the HTTP status rather than later with a KeyError or a
    # JSON decode error on an error page.
    response.raise_for_status()
    return response.json()


def get_labels(self, project_id):
    """Return the project's labels as a list of dicts.

    The ``labels`` mapping from the project-info response is keyed by label
    id; each entry is returned as a new dict with that id under
    ``"label_id"`` followed by the entry's own fields (which take
    precedence on a key clash).
    """
    info = self.get_project_info(project_id)
    labels_by_id = info['result']['project']['labels']
    return [{"label_id": key} | fields for key, fields in labels_by_id.items()]

def set_labels(self, project_id, article_id, label_ids, label_values, label_types, confirm=False, change=False, resolve=False):
endpoint = f"{self.base_url}/api-json/set-labels"
headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
Expand Down

0 comments on commit 841e65e

Please sign in to comment.