Refactored synchronization scripts to enhance modularity and synchronize the Google Sheet and database daily, removing SEC data gathering. Conflicts between the two sources are handled as follows (see the sketch below the commit summary):
- Empty cells in one source are filled with data from the other source.
- Cells are left unchanged when both sources hold conflicting non-empty data.
Commit 5fbccc1 (1 parent: b293797)
6 changed files with 233 additions and 58 deletions.
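
The conflict policy in the commit message maps onto pandas' combine_first semantics: each side keeps its own non-empty values and fills its gaps from the other side. A minimal sketch with hypothetical toy values:

import pandas as pd

# Hypothetical cells for one record; pd.NA marks an empty cell in that source.
sheet = pd.Series({'Exchange': 'NYSE', 'CUSIP': pd.NA})
db = pd.Series({'Exchange': pd.NA, 'CUSIP': '000000000'})

# Rule 1: a gap on one side is filled from the other side.
# Rule 2: the caller's non-empty value always wins, so conflicts leave it unchanged.
print(db.combine_first(sheet))     # Exchange: NYSE (filled), CUSIP: 000000000 (kept)
print(sheet.combine_first(db))     # Exchange: NYSE (kept),   CUSIP: 000000000 (filled)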
(One file in this commit was deleted; its diff is not shown.)
data_merger.py (filename inferred from the sync script's imports):
@@ -0,0 +1,37 @@
import pandas as pd


class DataMerger:
    def merge_dataframes(self, df_sheet, df_db):
        columns = ['CIK', 'Ticker', 'Exchange', 'CompanyNameIssuer', 'TransferAgent', 'OnlinePurchase', 'DTCMemberNum', 'TAURL',
                   'TransferAgentPct', 'IREmails', 'IRPhoneNum', 'IRCompanyAddress', 'IRURL', 'IRContactInfo',
                   'SharesOutstanding', 'CUSIP', 'CompanyInfoURL', 'CompanyInfo', 'FullProgressPct', 'DRS',
                   'PercentSharesDRSd', 'SubmissionReceived', 'TimestampsUTC', 'LearnMoreAboutDRS', 'CertificatesOffered',
                   'SandP500', 'IncorporatedIn']

        # Align both frames to the same column set and treat empty strings as missing.
        df_sheet = df_sheet.reindex(columns=columns)
        df_db = df_db.reindex(columns=columns)
        df_sheet.replace('', pd.NA, inplace=True)
        df_db.replace('', pd.NA, inplace=True)

        # Outer merge on the (CIK, Ticker) key so rows present in only one source survive.
        df_merged = pd.merge(df_sheet, df_db, on=['CIK', 'Ticker'], how='outer', suffixes=('_sheet', '_db'))

        # Pre-build the update frames on df_merged's index so the boolean .loc masks
        # below align; assigning through a mask into an empty frame would raise an
        # unalignable-indexer error.
        value_columns = [col for col in columns if col not in ('CIK', 'Ticker')]
        df_db_updates = pd.DataFrame(index=df_merged.index, columns=value_columns)
        df_sheet_updates = pd.DataFrame(index=df_merged.index, columns=value_columns)

        for col in value_columns:
            col_sheet = col + '_sheet'
            col_db = col + '_db'

            # Empty database cell, non-empty sheet cell: fill the database from the sheet.
            condition_db_update = df_merged[col_db].isna() & df_merged[col_sheet].notna()
            df_db_updates.loc[condition_db_update, col] = df_merged.loc[condition_db_update, col_sheet]

            # Empty sheet cell, non-empty database cell: fill the sheet from the database.
            # When both sides hold conflicting non-empty data, neither side is touched.
            condition_sheet_update = df_merged[col_sheet].isna() & df_merged[col_db].notna()
            df_sheet_updates.loc[condition_sheet_update, col] = df_merged.loc[condition_sheet_update, col_db]

        # Keep only rows that received at least one update, then attach the keys.
        df_db_updates = df_db_updates.dropna(how='all')
        df_sheet_updates = df_sheet_updates.dropna(how='all')
        df_db_updates[['CIK', 'Ticker']] = df_merged.loc[df_db_updates.index, ['CIK', 'Ticker']]
        df_sheet_updates[['CIK', 'Ticker']] = df_merged.loc[df_sheet_updates.index, ['CIK', 'Ticker']]

        return df_db_updates, df_sheet_updates
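
A minimal usage sketch of the class above, with two hypothetical one-row frames (all values are placeholders):

import pandas as pd

df_sheet = pd.DataFrame([{'CIK': '0001234567', 'Ticker': 'ABC', 'Exchange': 'NYSE', 'CUSIP': ''}])
df_db = pd.DataFrame([{'CIK': '0001234567', 'Ticker': 'ABC', 'Exchange': '', 'CUSIP': '000000000'}])

db_updates, sheet_updates = DataMerger().merge_dataframes(df_sheet, df_db)
print(db_updates[['CIK', 'Ticker', 'Exchange']])    # Exchange flows sheet -> database
print(sheet_updates[['CIK', 'Ticker', 'CUSIP']])    # CUSIP flows database -> sheet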
database_utils.py (filename inferred from the sync script's imports):
@@ -0,0 +1,96 @@
import sqlite3
import pandas as pd
import json


class DatabaseHandler:
    def __init__(self, db_file_path):
        self.db_file_path = db_file_path
        self.ensure_database_schema()

    def ensure_database_schema(self):
        # Create the backing table on first use; (CIK, Ticker) is the natural key.
        conn = sqlite3.connect(self.db_file_path)
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS full_database_backend (
                CIK TEXT,
                Ticker TEXT,
                Exchange TEXT,
                CompanyNameIssuer TEXT,
                TransferAgent TEXT,
                OnlinePurchase TEXT,
                DTCMemberNum TEXT,
                TAURL TEXT,
                TransferAgentPct TEXT,
                IREmails TEXT,
                IRPhoneNum TEXT,
                IRCompanyAddress TEXT,
                IRURL TEXT,
                IRContactInfo TEXT,
                SharesOutstanding TEXT,
                CUSIP TEXT,
                CompanyInfoURL TEXT,
                CompanyInfo TEXT,
                FullProgressPct TEXT,
                DRS TEXT,
                PercentSharesDRSd TEXT,
                SubmissionReceived TEXT,
                TimestampsUTC TEXT,
                LearnMoreAboutDRS TEXT,
                CertificatesOffered TEXT,
                SandP500 TEXT,
                IncorporatedIn TEXT,
                PRIMARY KEY (CIK, Ticker)
            )
        ''')
        conn.commit()
        conn.close()

    def read_database_to_dataframe(self):
        conn = sqlite3.connect(self.db_file_path)
        query = "SELECT * FROM full_database_backend"
        df_db = pd.read_sql_query(query, conn)
        conn.close()
        return df_db

    def update_database(self, df_updates):
        if df_updates.empty:
            print("No updates to apply to the database.")
            return

        conn = sqlite3.connect(self.db_file_path)
        cursor = conn.cursor()
        for index, row in df_updates.iterrows():
            CIK = row['CIK']
            Ticker = row['Ticker']
            # Only write cells that carry a value; NaN means "no change" for that cell.
            columns_to_update = [col for col in df_updates.columns if col not in ['CIK', 'Ticker'] and pd.notna(row[col])]
            set_clause = ', '.join([f"{col} = ?" for col in columns_to_update])
            values = [row[col] for col in columns_to_update]
            values.extend([CIK, Ticker])

            if set_clause:
                # Manual upsert: try an UPDATE first and fall back to INSERT
                # when no existing row matches the (CIK, Ticker) key.
                sql = f"UPDATE full_database_backend SET {set_clause} WHERE CIK = ? AND Ticker = ?"
                cursor.execute(sql, values)
                if cursor.rowcount == 0:
                    columns = ['CIK', 'Ticker'] + columns_to_update
                    placeholders = ', '.join(['?'] * len(columns))
                    insert_values = [row[col] for col in columns]
                    sql_insert = f"INSERT INTO full_database_backend ({', '.join(columns)}) VALUES ({placeholders})"
                    cursor.execute(sql_insert, insert_values)
            else:
                print(f"No updates for record with CIK={CIK} and Ticker={Ticker}")

        conn.commit()
        conn.close()
        print("Database updated successfully.")

    def export_database_to_json(self, json_file_path):
        # Dump the whole table as a list of {column: value} objects.
        conn = sqlite3.connect(self.db_file_path)
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM full_database_backend')
        rows = cursor.fetchall()
        column_names = [description[0] for description in cursor.description]
        data_json = [dict(zip(column_names, row)) for row in rows]
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_json, f, ensure_ascii=False, indent=4)
        conn.close()
        print(f"Exported database to {json_file_path}")
google_sheet_utils.py (filename inferred from the sync script's imports):
@@ -0,0 +1,55 @@
import gspread
import pandas as pd


class GoogleSheetHandler:
    def __init__(self, sheet_id, creds_json):
        self.gc = gspread.service_account_from_dict(creds_json)
        self.sheet = self.gc.open_by_key(sheet_id)

    def read_sheet_to_dataframe(self, worksheet_name):
        # First row is treated as the header; everything below is data.
        worksheet = self.sheet.worksheet(worksheet_name)
        data = worksheet.get_all_values()
        headers = data[0]
        records = data[1:]
        df_sheet = pd.DataFrame(records, columns=headers)
        return df_sheet

    def update_google_sheet(self, worksheet_name, df_updates):
        if df_updates.empty:
            print("No updates to apply to the Google Sheet.")
            return

        # Re-read the sheet and map each (CIK, Ticker) key to its 1-based row number
        # (data starts on row 2, below the header row).
        worksheet = self.sheet.worksheet(worksheet_name)
        headers = worksheet.row_values(1)
        data = worksheet.get_all_values()
        records = data[1:]
        df_sheet_all = pd.DataFrame(records, columns=headers)
        df_sheet_all['Row_Number'] = range(2, len(df_sheet_all) + 2)
        df_sheet_all['CIK'] = df_sheet_all['CIK'].fillna('')
        df_sheet_all['Ticker'] = df_sheet_all['Ticker'].fillna('')
        key_to_row = df_sheet_all.set_index(['CIK', 'Ticker'])['Row_Number'].to_dict()

        updates = []
        new_rows = []
        for index, row in df_updates.iterrows():
            CIK = row['CIK']
            Ticker = row['Ticker']
            key = (CIK, Ticker)
            if key in key_to_row:
                # Existing row: queue one Cell write per non-empty update value.
                row_number = key_to_row[key]
                for col in df_updates.columns:
                    if col not in ['CIK', 'Ticker'] and pd.notna(row[col]):
                        col_index = headers.index(col) + 1
                        cell = gspread.Cell(row_number, col_index, row[col])
                        updates.append(cell)
            else:
                # Unknown key: append a full new row in header order.
                new_row = [row.get(col, '') if pd.notna(row.get(col, '')) else '' for col in headers]
                new_rows.append(new_row)

        # Batch the writes so the sync stays within API rate limits.
        if updates:
            worksheet.update_cells(updates, value_input_option='USER_ENTERED')
            print(f"Updated {len(updates)} cells in Google Sheet.")

        if new_rows:
            worksheet.append_rows(new_rows, value_input_option='USER_ENTERED')
            print(f"Added {len(new_rows)} new rows to Google Sheet.")
Main synchronization script (filename not shown in this view):
@@ -0,0 +1,36 @@
import os
import json
from google_sheet_utils import GoogleSheetHandler
from database_utils import DatabaseHandler
from data_merger import DataMerger


def main():
    # Load credentials and environment variables
    sheet_id = os.environ['SHEET_ID']
    creds_json = json.loads(os.environ['GOOGLE_API_KEYS'])
    db_file_path = 'data/Full_Database_Backend.db'
    json_file_path = 'data/Full_Database_Backend.json'

    # Initialize handlers
    sheet_handler = GoogleSheetHandler(sheet_id, creds_json)
    db_handler = DatabaseHandler(db_file_path)
    data_merger = DataMerger()

    # Read data from sources
    df_sheet = sheet_handler.read_sheet_to_dataframe('Full_Database_Backend')
    df_db = db_handler.read_database_to_dataframe()

    # Merge data
    df_db_updates, df_sheet_updates = data_merger.merge_dataframes(df_sheet, df_db)

    # Apply updates
    db_handler.update_database(df_db_updates)
    sheet_handler.update_google_sheet('Full_Database_Backend', df_sheet_updates)

    # Export database to JSON
    db_handler.export_database_to_json(json_file_path)

    print("Synchronization between Google Sheet and database completed successfully.")


if __name__ == "__main__":
    main()
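
For a local dry run, the two environment variables read in main() must be set first; a minimal sketch, assuming the service-account key sits in a local file (the path is an assumption):

import os

os.environ['SHEET_ID'] = '<your-sheet-id>'        # placeholder
with open('service_account.json') as f:           # hypothetical key-file path
    os.environ['GOOGLE_API_KEYS'] = f.read()

main()  # full sheet <-> database synchronization, then JSON export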