Skip to content

Commit

Permalink
Refactored synchronization scripts to enhance modularity and synchronize the Google Sheet and database daily, removing SEC data gathering, and handling conflicts as specified. It handles conflicts by:
Browse files Browse the repository at this point in the history

- Updating empty cells in one source with data from the other source.
- Leaving cells unchanged when both sources have conflicting non-empty data.
  • Loading branch information
JamesAlfonse authored Dec 4, 2024
1 parent b293797 commit 5fbccc1
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 58 deletions.
46 changes: 0 additions & 46 deletions .github/workflows/SQL_To_Sheets.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Update Database with Google Sheets Data
name: Synchronize Google Sheet and Database

on:
schedule:
Expand All @@ -9,7 +9,7 @@ concurrency:
group: database-update

jobs:
update_database:
sync_data:
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
Expand All @@ -22,24 +22,21 @@ jobs:

- name: Install Python Dependencies
run: |
pip install gspread oauth2client
pip install pandas gspread oauth2client
- name: Read Google Sheet and Write to SQL Database
- name: Synchronize Google Sheet and Database
env:
SHEET_ID: ${{ secrets.SHEET_ID }}
GOOGLE_API_KEYS: ${{ secrets.GOOGLE_API_KEYS }}
run: |
python 'src/scripts/update_sql.py'
- name: Change Primary Key in Database
run: python src/scripts/change_primary_key_to_cik.py
python 'src/scripts/synchronize_google_sheet_and_db.py'
- name: Commit and Push Database Files
run: |
git config --global user.name 'JamesAlfonse'
git config --global user.email 'jamesalfonse@gmail.com'
git add 'data/Full_Database_Backend.db' 'data/Full_Database_Backend.json' # Add both the database and the JSON file
git commit -m "Updated database and JSON files" -a || echo "No changes to commit."
git config --global user.name 'YourName'
git config --global user.email 'YourEmail@example.com'
git add 'data/Full_Database_Backend.db' 'data/Full_Database_Backend.json'
git commit -m "Synchronized database with Google Sheet" -a || echo "No changes to commit."
git push
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
37 changes: 37 additions & 0 deletions src/scripts/data_merger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd

class DataMerger:
    """Compute the cell-level updates needed to keep the Google Sheet and
    the SQLite database in sync.

    Records are keyed by the (CIK, Ticker) pair. Conflict policy:
      * a cell that is empty on one side but filled on the other is
        filled from the non-empty side;
      * cells that are non-empty on both sides are left untouched, even
        when the values disagree.
    """

    # Canonical column layout shared by both data sources.
    COLUMNS = ['CIK', 'Ticker', 'Exchange', 'CompanyNameIssuer', 'TransferAgent', 'OnlinePurchase', 'DTCMemberNum', 'TAURL',
               'TransferAgentPct', 'IREmails', 'IRPhoneNum', 'IRCompanyAddress', 'IRURL', 'IRContactInfo',
               'SharesOutstanding', 'CUSIP', 'CompanyInfoURL', 'CompanyInfo', 'FullProgressPct', 'DRS',
               'PercentSharesDRSd', 'SubmissionReceived', 'TimestampsUTC', 'LearnMoreAboutDRS', 'CertificatesOffered',
               'SandP500', 'IncorporatedIn']

    # Columns that uniquely identify a record on both sides.
    KEY_COLUMNS = ['CIK', 'Ticker']

    def merge_dataframes(self, df_sheet, df_db):
        """Diff the two sources and return the updates each one needs.

        Args:
            df_sheet: Current Google Sheet contents (string cells; '' = empty).
            df_db: Current database contents.

        Returns:
            Tuple ``(df_db_updates, df_sheet_updates)`` — each DataFrame has
            the key columns plus only the cells to write (other cells NA),
            and contains only records with at least one cell to update.
        """
        data_columns = [c for c in self.COLUMNS if c not in self.KEY_COLUMNS]

        # Normalize both sides to the canonical layout and treat empty
        # strings as missing so the fill logic below sees them as gaps.
        df_sheet = df_sheet.reindex(columns=self.COLUMNS).replace('', pd.NA)
        df_db = df_db.reindex(columns=self.COLUMNS).replace('', pd.NA)

        merged = pd.merge(df_sheet, df_db, on=self.KEY_COLUMNS,
                          how='outer', suffixes=('_sheet', '_db'))

        # Build both update frames on merged's index. (Assigning through a
        # boolean mask into an initially-empty DataFrame does not work:
        # pandas aligns the mask to the empty index and silently writes
        # nothing, so updates would be lost.)
        df_db_updates = pd.DataFrame(index=merged.index, columns=data_columns)
        df_sheet_updates = pd.DataFrame(index=merged.index, columns=data_columns)

        for col in data_columns:
            sheet_vals = merged[col + '_sheet']
            db_vals = merged[col + '_db']
            # Fill a side only where it is missing and the other side has data.
            df_db_updates[col] = sheet_vals.where(db_vals.isna() & sheet_vals.notna())
            df_sheet_updates[col] = db_vals.where(sheet_vals.isna() & db_vals.notna())

        # Keep only records that actually have something to update, and
        # attach the key columns so consumers can locate each record.
        db_mask = df_db_updates.notna().any(axis=1)
        sheet_mask = df_sheet_updates.notna().any(axis=1)
        df_db_updates = df_db_updates.loc[db_mask].copy()
        df_sheet_updates = df_sheet_updates.loc[sheet_mask].copy()
        df_db_updates[self.KEY_COLUMNS] = merged.loc[db_mask, self.KEY_COLUMNS]
        df_sheet_updates[self.KEY_COLUMNS] = merged.loc[sheet_mask, self.KEY_COLUMNS]

        return df_db_updates, df_sheet_updates
96 changes: 96 additions & 0 deletions src/scripts/database_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import sqlite3
import pandas as pd
import json

class DatabaseHandler:
    """Owns all access to the SQLite copy of the shareholder database.

    The table ``full_database_backend`` stores every column as TEXT and is
    keyed by the (CIK, Ticker) pair.
    """

    def __init__(self, db_file_path):
        """Remember the database path and make sure the table exists.

        Args:
            db_file_path: Path of the SQLite file; created on first use.
        """
        self.db_file_path = db_file_path
        self.ensure_database_schema()

    def _connect(self):
        """Open a fresh connection to the configured database file."""
        return sqlite3.connect(self.db_file_path)

    def ensure_database_schema(self):
        """Create the ``full_database_backend`` table if it is missing."""
        conn = self._connect()
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS full_database_backend (
                    CIK TEXT,
                    Ticker TEXT,
                    Exchange TEXT,
                    CompanyNameIssuer TEXT,
                    TransferAgent TEXT,
                    OnlinePurchase TEXT,
                    DTCMemberNum TEXT,
                    TAURL TEXT,
                    TransferAgentPct TEXT,
                    IREmails TEXT,
                    IRPhoneNum TEXT,
                    IRCompanyAddress TEXT,
                    IRURL TEXT,
                    IRContactInfo TEXT,
                    SharesOutstanding TEXT,
                    CUSIP TEXT,
                    CompanyInfoURL TEXT,
                    CompanyInfo TEXT,
                    FullProgressPct TEXT,
                    DRS TEXT,
                    PercentSharesDRSd TEXT,
                    SubmissionReceived TEXT,
                    TimestampsUTC TEXT,
                    LearnMoreAboutDRS TEXT,
                    CertificatesOffered TEXT,
                    SandP500 TEXT,
                    IncorporatedIn TEXT,
                    PRIMARY KEY (CIK, Ticker)
                )
            ''')
            conn.commit()
        finally:
            # Always release the connection, even if table creation fails.
            conn.close()

    def read_database_to_dataframe(self):
        """Return the entire ``full_database_backend`` table as a DataFrame."""
        conn = self._connect()
        try:
            return pd.read_sql_query("SELECT * FROM full_database_backend", conn)
        finally:
            conn.close()

    def update_database(self, df_updates):
        """Apply cell-level updates, inserting records that do not exist yet.

        Args:
            df_updates: DataFrame with 'CIK' and 'Ticker' key columns plus any
                subset of data columns. NA cells are skipped, so only the
                cells explicitly present in a row are written.
        """
        if df_updates.empty:
            print("No updates to apply to the database.")
            return

        conn = self._connect()
        try:
            cursor = conn.cursor()
            for _, row in df_updates.iterrows():
                cik = row['CIK']
                ticker = row['Ticker']
                cols = [c for c in df_updates.columns
                        if c not in ('CIK', 'Ticker') and pd.notna(row[c])]
                if not cols:
                    print(f"No updates for record with CIK={cik} and Ticker={ticker}")
                    continue

                # Identifiers are double-quoted so column names can never be
                # read as SQL; all values travel as bound parameters.
                set_clause = ', '.join(f'"{c}" = ?' for c in cols)
                update_sql = (f"UPDATE full_database_backend SET {set_clause} "
                              "WHERE CIK = ? AND Ticker = ?")
                cursor.execute(update_sql, [row[c] for c in cols] + [cik, ticker])

                if cursor.rowcount == 0:
                    # No existing record matched the key: fall back to INSERT.
                    all_cols = ['CIK', 'Ticker'] + cols
                    quoted = ', '.join(f'"{c}"' for c in all_cols)
                    placeholders = ', '.join('?' for _ in all_cols)
                    insert_sql = (f"INSERT INTO full_database_backend ({quoted}) "
                                  f"VALUES ({placeholders})")
                    cursor.execute(insert_sql, [row[c] for c in all_cols])

            conn.commit()
        finally:
            conn.close()
        print("Database updated successfully.")

    def export_database_to_json(self, json_file_path):
        """Dump the whole table to *json_file_path* as a list of row dicts."""
        conn = self._connect()
        try:
            cursor = conn.cursor()
            cursor.execute('SELECT * FROM full_database_backend')
            rows = cursor.fetchall()
            column_names = [description[0] for description in cursor.description]
        finally:
            conn.close()
        data_json = [dict(zip(column_names, row)) for row in rows]
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_json, f, ensure_ascii=False, indent=4)
        print(f"Exported database to {json_file_path}")
55 changes: 55 additions & 0 deletions src/scripts/google_sheet_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import gspread
import pandas as pd

class GoogleSheetHandler:
    """Thin wrapper around gspread for reading and patching one spreadsheet."""

    def __init__(self, sheet_id, creds_json):
        # creds_json: service-account credentials as an already-parsed dict.
        self.gc = gspread.service_account_from_dict(creds_json)
        self.sheet = self.gc.open_by_key(sheet_id)

    def read_sheet_to_dataframe(self, worksheet_name):
        """Return the named worksheet as a DataFrame of strings.

        The first sheet row is used as the column headers.
        NOTE(review): assumes the worksheet is non-empty — verify behavior
        on a blank sheet before relying on this.
        """
        worksheet = self.sheet.worksheet(worksheet_name)
        data = worksheet.get_all_values()
        headers = data[0]
        records = data[1:]
        df_sheet = pd.DataFrame(records, columns=headers)
        return df_sheet

    def update_google_sheet(self, worksheet_name, df_updates):
        """Write the non-NA cells of *df_updates* into the worksheet.

        Rows are matched by the (CIK, Ticker) pair; records whose key is not
        found in the sheet are appended as new rows at the bottom.

        Args:
            worksheet_name: Tab to update.
            df_updates: DataFrame with 'CIK'/'Ticker' keys plus the cells to
                write (NA cells are skipped).
        """
        if df_updates.empty:
            print("No updates to apply to the Google Sheet.")
            return

        worksheet = self.sheet.worksheet(worksheet_name)
        headers = worksheet.row_values(1)
        data = worksheet.get_all_values()
        records = data[1:]  # drop the header row
        df_sheet_all = pd.DataFrame(records, columns=headers)
        # Sheet rows are 1-based and row 1 is the header, so data starts at 2.
        df_sheet_all['Row_Number'] = range(2, len(df_sheet_all) + 2)
        df_sheet_all['CIK'] = df_sheet_all['CIK'].fillna('')
        df_sheet_all['Ticker'] = df_sheet_all['Ticker'].fillna('')
        # Map each (CIK, Ticker) key to its physical row number in the sheet.
        key_to_row = df_sheet_all.set_index(['CIK', 'Ticker'])['Row_Number'].to_dict()

        updates = []   # individual cells to patch in existing rows
        new_rows = []  # full rows to append for keys not present in the sheet
        for index, row in df_updates.iterrows():
            CIK = row['CIK']
            Ticker = row['Ticker']
            key = (CIK, Ticker)
            if key in key_to_row:
                row_number = key_to_row[key]
                for col in df_updates.columns:
                    if col not in ['CIK', 'Ticker'] and pd.notna(row[col]):
                        # gspread cell coordinates are 1-based.
                        col_index = headers.index(col) + 1
                        cell = gspread.Cell(row_number, col_index, row[col])
                        updates.append(cell)
            else:
                # Build the new row in sheet-header order; missing or NA
                # values become empty strings.
                new_row = [row.get(col, '') if pd.notna(row.get(col, '')) else '' for col in headers]
                new_rows.append(new_row)

        if updates:
            # One batched write for all patched cells.
            worksheet.update_cells(updates, value_input_option='USER_ENTERED')
            print(f"Updated {len(updates)} cells in Google Sheet.")

        if new_rows:
            worksheet.append_rows(new_rows, value_input_option='USER_ENTERED')
            print(f"Added {len(new_rows)} new rows to Google Sheet.")
36 changes: 36 additions & 0 deletions src/scripts/synchronize_google_sheet_and_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import json
from google_sheet_utils import GoogleSheetHandler
from database_utils import DatabaseHandler
from data_merger import DataMerger

def main():
    """Run one full Google Sheet <-> database synchronization pass."""
    # Configuration comes from the environment (set by the CI workflow).
    spreadsheet_key = os.environ['SHEET_ID']
    service_account_info = json.loads(os.environ['GOOGLE_API_KEYS'])
    database_path = 'data/Full_Database_Backend.db'
    json_export_path = 'data/Full_Database_Backend.json'

    # One handler per data source.
    sheets = GoogleSheetHandler(spreadsheet_key, service_account_info)
    database = DatabaseHandler(database_path)

    # Snapshot the current state of both sources.
    sheet_frame = sheets.read_sheet_to_dataframe('Full_Database_Backend')
    database_frame = database.read_database_to_dataframe()

    # Work out which cells each side is missing.
    updates_for_db, updates_for_sheet = DataMerger().merge_dataframes(
        sheet_frame, database_frame)

    # Push the missing cells back to each side.
    database.update_database(updates_for_db)
    sheets.update_google_sheet('Full_Database_Backend', updates_for_sheet)

    # Keep the JSON mirror of the database current.
    database.export_database_to_json(json_export_path)

    print("Synchronization between Google Sheet and database completed successfully.")


if __name__ == "__main__":
    main()

0 comments on commit 5fbccc1

Please sign in to comment.