Skip to content

Commit

Permalink
Refactored synchronization scripts to enhance modularity and synchronize the Google Sheet and database daily, removing SEC data gathering, and handling conflicts as specified. It handles conflicts by:
Browse files Browse the repository at this point in the history

- Updating empty cells in one source with data from the other source.
- Leaving cells unchanged when both sources have conflicting non-empty data.
  • Loading branch information
JamesAlfonse authored Dec 4, 2024
1 parent b293797 commit 5fbccc1
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 58 deletions.
46 changes: 0 additions & 46 deletions .github/workflows/SQL_To_Sheets.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Update Database with Google Sheets Data
name: Synchronize Google Sheet and Database

on:
schedule:
Expand All @@ -9,7 +9,7 @@ concurrency:
group: database-update

jobs:
update_database:
sync_data:
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
Expand All @@ -22,24 +22,21 @@ jobs:

- name: Install Python Dependencies
run: |
pip install gspread oauth2client
pip install pandas gspread oauth2client
- name: Read Google Sheet and Write to SQL Database
- name: Synchronize Google Sheet and Database
env:
SHEET_ID: ${{ secrets.SHEET_ID }}
GOOGLE_API_KEYS: ${{ secrets.GOOGLE_API_KEYS }}
run: |
python 'src/scripts/update_sql.py'
- name: Change Primary Key in Database
run: python src/scripts/change_primary_key_to_cik.py
python 'src/scripts/synchronize_google_sheet_and_db.py'
- name: Commit and Push Database Files
run: |
git config --global user.name 'JamesAlfonse'
git config --global user.email 'jamesalfonse@gmail.com'
git add 'data/Full_Database_Backend.db' 'data/Full_Database_Backend.json' # Add both the database and the JSON file
git commit -m "Updated database and JSON files" -a || echo "No changes to commit."
git config --global user.name 'YourName'
git config --global user.email 'YourEmail@example.com'
git add 'data/Full_Database_Backend.db' 'data/Full_Database_Backend.json'
git commit -m "Synchronized database with Google Sheet" -a || echo "No changes to commit."
git push
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
37 changes: 37 additions & 0 deletions src/scripts/data_merger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd

class DataMerger:
    """Compute the cell-level updates needed to keep the Google Sheet and
    the SQLite database in sync.

    Records are keyed by the (CIK, Ticker) pair. Conflict policy:
      * a cell that is empty on one side but filled on the other is
        filled from the non-empty side;
      * cells that are non-empty on both sides are left untouched, even
        when the values disagree.
    """

    # Canonical column layout shared by both data sources.
    COLUMNS = ['CIK', 'Ticker', 'Exchange', 'CompanyNameIssuer', 'TransferAgent', 'OnlinePurchase', 'DTCMemberNum', 'TAURL',
               'TransferAgentPct', 'IREmails', 'IRPhoneNum', 'IRCompanyAddress', 'IRURL', 'IRContactInfo',
               'SharesOutstanding', 'CUSIP', 'CompanyInfoURL', 'CompanyInfo', 'FullProgressPct', 'DRS',
               'PercentSharesDRSd', 'SubmissionReceived', 'TimestampsUTC', 'LearnMoreAboutDRS', 'CertificatesOffered',
               'SandP500', 'IncorporatedIn']

    # Columns that uniquely identify a record on both sides.
    KEY_COLUMNS = ['CIK', 'Ticker']

    def merge_dataframes(self, df_sheet, df_db):
        """Diff the two sources and return the updates each one needs.

        Args:
            df_sheet: Current Google Sheet contents (string cells; '' = empty).
            df_db: Current database contents.

        Returns:
            Tuple ``(df_db_updates, df_sheet_updates)`` — each DataFrame has
            the key columns plus only the cells to write (other cells NA),
            and contains only records with at least one cell to update.
        """
        data_columns = [c for c in self.COLUMNS if c not in self.KEY_COLUMNS]

        # Normalize both sides to the canonical layout and treat empty
        # strings as missing so the fill logic below sees them as gaps.
        df_sheet = df_sheet.reindex(columns=self.COLUMNS).replace('', pd.NA)
        df_db = df_db.reindex(columns=self.COLUMNS).replace('', pd.NA)

        merged = pd.merge(df_sheet, df_db, on=self.KEY_COLUMNS,
                          how='outer', suffixes=('_sheet', '_db'))

        # Build both update frames on merged's index. (Assigning through a
        # boolean mask into an initially-empty DataFrame does not work:
        # pandas aligns the mask to the empty index and silently writes
        # nothing, so updates would be lost.)
        df_db_updates = pd.DataFrame(index=merged.index, columns=data_columns)
        df_sheet_updates = pd.DataFrame(index=merged.index, columns=data_columns)

        for col in data_columns:
            sheet_vals = merged[col + '_sheet']
            db_vals = merged[col + '_db']
            # Fill a side only where it is missing and the other side has data.
            df_db_updates[col] = sheet_vals.where(db_vals.isna() & sheet_vals.notna())
            df_sheet_updates[col] = db_vals.where(sheet_vals.isna() & db_vals.notna())

        # Keep only records that actually have something to update, and
        # attach the key columns so consumers can locate each record.
        db_mask = df_db_updates.notna().any(axis=1)
        sheet_mask = df_sheet_updates.notna().any(axis=1)
        df_db_updates = df_db_updates.loc[db_mask].copy()
        df_sheet_updates = df_sheet_updates.loc[sheet_mask].copy()
        df_db_updates[self.KEY_COLUMNS] = merged.loc[db_mask, self.KEY_COLUMNS]
        df_sheet_updates[self.KEY_COLUMNS] = merged.loc[sheet_mask, self.KEY_COLUMNS]

        return df_db_updates, df_sheet_updates
96 changes: 96 additions & 0 deletions src/scripts/database_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import sqlite3
import pandas as pd
import json

class DatabaseHandler:
    """Owns all access to the SQLite copy of the shareholder database.

    The table ``full_database_backend`` stores every column as TEXT and is
    keyed by the (CIK, Ticker) pair.
    """

    def __init__(self, db_file_path):
        """Remember the database path and make sure the table exists.

        Args:
            db_file_path: Path of the SQLite file; created on first use.
        """
        self.db_file_path = db_file_path
        self.ensure_database_schema()

    def _connect(self):
        """Open a fresh connection to the configured database file."""
        return sqlite3.connect(self.db_file_path)

    def ensure_database_schema(self):
        """Create the ``full_database_backend`` table if it is missing."""
        conn = self._connect()
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS full_database_backend (
                    CIK TEXT,
                    Ticker TEXT,
                    Exchange TEXT,
                    CompanyNameIssuer TEXT,
                    TransferAgent TEXT,
                    OnlinePurchase TEXT,
                    DTCMemberNum TEXT,
                    TAURL TEXT,
                    TransferAgentPct TEXT,
                    IREmails TEXT,
                    IRPhoneNum TEXT,
                    IRCompanyAddress TEXT,
                    IRURL TEXT,
                    IRContactInfo TEXT,
                    SharesOutstanding TEXT,
                    CUSIP TEXT,
                    CompanyInfoURL TEXT,
                    CompanyInfo TEXT,
                    FullProgressPct TEXT,
                    DRS TEXT,
                    PercentSharesDRSd TEXT,
                    SubmissionReceived TEXT,
                    TimestampsUTC TEXT,
                    LearnMoreAboutDRS TEXT,
                    CertificatesOffered TEXT,
                    SandP500 TEXT,
                    IncorporatedIn TEXT,
                    PRIMARY KEY (CIK, Ticker)
                )
            ''')
            conn.commit()
        finally:
            # Always release the connection, even if table creation fails.
            conn.close()

    def read_database_to_dataframe(self):
        """Return the entire ``full_database_backend`` table as a DataFrame."""
        conn = self._connect()
        try:
            return pd.read_sql_query("SELECT * FROM full_database_backend", conn)
        finally:
            conn.close()

    def update_database(self, df_updates):
        """Apply cell-level updates, inserting records that do not exist yet.

        Args:
            df_updates: DataFrame with 'CIK' and 'Ticker' key columns plus any
                subset of data columns. NA cells are skipped, so only the
                cells explicitly present in a row are written.
        """
        if df_updates.empty:
            print("No updates to apply to the database.")
            return

        conn = self._connect()
        try:
            cursor = conn.cursor()
            for _, row in df_updates.iterrows():
                cik = row['CIK']
                ticker = row['Ticker']
                cols = [c for c in df_updates.columns
                        if c not in ('CIK', 'Ticker') and pd.notna(row[c])]
                if not cols:
                    print(f"No updates for record with CIK={cik} and Ticker={ticker}")
                    continue

                # Identifiers are double-quoted so column names can never be
                # read as SQL; all values travel as bound parameters.
                set_clause = ', '.join(f'"{c}" = ?' for c in cols)
                update_sql = (f"UPDATE full_database_backend SET {set_clause} "
                              "WHERE CIK = ? AND Ticker = ?")
                cursor.execute(update_sql, [row[c] for c in cols] + [cik, ticker])

                if cursor.rowcount == 0:
                    # No existing record matched the key: fall back to INSERT.
                    all_cols = ['CIK', 'Ticker'] + cols
                    quoted = ', '.join(f'"{c}"' for c in all_cols)
                    placeholders = ', '.join('?' for _ in all_cols)
                    insert_sql = (f"INSERT INTO full_database_backend ({quoted}) "
                                  f"VALUES ({placeholders})")
                    cursor.execute(insert_sql, [row[c] for c in all_cols])

            conn.commit()
        finally:
            conn.close()
        print("Database updated successfully.")

    def export_database_to_json(self, json_file_path):
        """Dump the whole table to *json_file_path* as a list of row dicts."""
        conn = self._connect()
        try:
            cursor = conn.cursor()
            cursor.execute('SELECT * FROM full_database_backend')
            rows = cursor.fetchall()
            column_names = [description[0] for description in cursor.description]
        finally:
            conn.close()
        data_json = [dict(zip(column_names, row)) for row in rows]
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_json, f, ensure_ascii=False, indent=4)
        print(f"Exported database to {json_file_path}")
55 changes: 55 additions & 0 deletions src/scripts/google_sheet_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import gspread
import pandas as pd

class GoogleSheetHandler:
    """Thin wrapper around gspread for reading and patching one spreadsheet."""

    def __init__(self, sheet_id, creds_json):
        # creds_json: service-account credentials as an already-parsed dict.
        self.gc = gspread.service_account_from_dict(creds_json)
        self.sheet = self.gc.open_by_key(sheet_id)

    def read_sheet_to_dataframe(self, worksheet_name):
        """Return the named worksheet as a DataFrame of strings.

        The first sheet row is used as the column headers.
        NOTE(review): assumes the worksheet is non-empty — verify behavior
        on a blank sheet before relying on this.
        """
        worksheet = self.sheet.worksheet(worksheet_name)
        data = worksheet.get_all_values()
        headers = data[0]
        records = data[1:]
        df_sheet = pd.DataFrame(records, columns=headers)
        return df_sheet

    def update_google_sheet(self, worksheet_name, df_updates):
        """Write the non-NA cells of *df_updates* into the worksheet.

        Rows are matched by the (CIK, Ticker) pair; records whose key is not
        found in the sheet are appended as new rows at the bottom.

        Args:
            worksheet_name: Tab to update.
            df_updates: DataFrame with 'CIK'/'Ticker' keys plus the cells to
                write (NA cells are skipped).
        """
        if df_updates.empty:
            print("No updates to apply to the Google Sheet.")
            return

        worksheet = self.sheet.worksheet(worksheet_name)
        headers = worksheet.row_values(1)
        data = worksheet.get_all_values()
        records = data[1:]  # drop the header row
        df_sheet_all = pd.DataFrame(records, columns=headers)
        # Sheet rows are 1-based and row 1 is the header, so data starts at 2.
        df_sheet_all['Row_Number'] = range(2, len(df_sheet_all) + 2)
        df_sheet_all['CIK'] = df_sheet_all['CIK'].fillna('')
        df_sheet_all['Ticker'] = df_sheet_all['Ticker'].fillna('')
        # Map each (CIK, Ticker) key to its physical row number in the sheet.
        key_to_row = df_sheet_all.set_index(['CIK', 'Ticker'])['Row_Number'].to_dict()

        updates = []   # individual cells to patch in existing rows
        new_rows = []  # full rows to append for keys not present in the sheet
        for index, row in df_updates.iterrows():
            CIK = row['CIK']
            Ticker = row['Ticker']
            key = (CIK, Ticker)
            if key in key_to_row:
                row_number = key_to_row[key]
                for col in df_updates.columns:
                    if col not in ['CIK', 'Ticker'] and pd.notna(row[col]):
                        # gspread cell coordinates are 1-based.
                        col_index = headers.index(col) + 1
                        cell = gspread.Cell(row_number, col_index, row[col])
                        updates.append(cell)
            else:
                # Build the new row in sheet-header order; missing or NA
                # values become empty strings.
                new_row = [row.get(col, '') if pd.notna(row.get(col, '')) else '' for col in headers]
                new_rows.append(new_row)

        if updates:
            # One batched write for all patched cells.
            worksheet.update_cells(updates, value_input_option='USER_ENTERED')
            print(f"Updated {len(updates)} cells in Google Sheet.")

        if new_rows:
            worksheet.append_rows(new_rows, value_input_option='USER_ENTERED')
            print(f"Added {len(new_rows)} new rows to Google Sheet.")
36 changes: 36 additions & 0 deletions src/scripts/synchronize_google_sheet_and_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import json
from google_sheet_utils import GoogleSheetHandler
from database_utils import DatabaseHandler
from data_merger import DataMerger

def main():
    """Run one full Google Sheet <-> database synchronization pass."""
    # Configuration comes from the environment (set by the CI workflow).
    spreadsheet_key = os.environ['SHEET_ID']
    service_account_info = json.loads(os.environ['GOOGLE_API_KEYS'])
    database_path = 'data/Full_Database_Backend.db'
    json_export_path = 'data/Full_Database_Backend.json'

    # One handler per data source.
    sheets = GoogleSheetHandler(spreadsheet_key, service_account_info)
    database = DatabaseHandler(database_path)

    # Snapshot the current state of both sources.
    sheet_frame = sheets.read_sheet_to_dataframe('Full_Database_Backend')
    database_frame = database.read_database_to_dataframe()

    # Work out which cells each side is missing.
    updates_for_db, updates_for_sheet = DataMerger().merge_dataframes(
        sheet_frame, database_frame)

    # Push the missing cells back to each side.
    database.update_database(updates_for_db)
    sheets.update_google_sheet('Full_Database_Backend', updates_for_sheet)

    # Keep the JSON mirror of the database current.
    database.export_database_to_json(json_export_path)

    print("Synchronization between Google Sheet and database completed successfully.")


if __name__ == "__main__":
    main()

0 comments on commit 5fbccc1

Please sign in to comment.