
Commit

Updated synchronization scripts to handle differences in Google Sheet schema

The scripts now process only the first 27 columns using column indexes, accommodating sheets with more columns and differing header names. Ensured that headers are preserved and only intended columns are affected during synchronization. Adjusted data merging logic to use column positions, and updated database interactions accordingly.
JamesAlfonse authored Dec 4, 2024
1 parent ce42e52 commit ac418dc
Showing 3 changed files with 65 additions and 129 deletions.
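At its core, the change swaps header-name lookups for positional ones. The positional-key merge that the new data_merger.py relies on looks roughly like this in isolation (a minimal sketch with invented three-column frames; the real scripts key on positions 0, 2, and 18 of 27 columns):

import pandas as pd

# Headers differ between the two sources, but the positions line up:
# position 0 holds the ticker and position 2 the company name.
df_sheet = pd.DataFrame({"Ticker": ["GME"], "Exch": [pd.NA], "Name": ["GameStop Corp."]})
df_db = pd.DataFrame({"ticker": ["GME"], "Exchange": ["NYSE"], "CompanyNameIssuer": ["GameStop Corp."]})

df_merged = pd.merge(
    df_sheet, df_db,
    left_on=[df_sheet.columns[0], df_sheet.columns[2]],
    right_on=[df_db.columns[0], df_db.columns[2]],
    how="outer",
    suffixes=("_sheet", "_db"),
    indicator=True,  # adds a _merge column recording which source each row came from
)
print(df_merged)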
58 changes: 29 additions & 29 deletions src/scripts/data_merger.py
@@ -2,26 +2,19 @@

 class DataMerger:
     def merge_dataframes(self, df_sheet, df_db):
-        # Define the columns of interest (first 27 columns)
-        columns = [
-            'Ticker', 'Exchange', 'CompanyNameIssuer', 'TransferAgent', 'OnlinePurchase', 'DTCMemberNum', 'TAURL',
-            'TransferAgentPct', 'IREmails', 'IRPhoneNum', 'IRCompanyAddress', 'IRURL', 'IRContactInfo',
-            'SharesOutstanding', 'CUSIP', 'CompanyInfoURL', 'CompanyInfo', 'FullProgressPct', 'CIK', 'DRS',
-            'PercentSharesDRSd', 'SubmissionReceived', 'TimestampsUTC', 'LearnMoreAboutDRS', 'CertificatesOffered',
-            'SandP500', 'IncorporatedIn'
-        ]
-
-        # Ensure the DataFrames have the correct columns
-        df_sheet = df_sheet.reindex(columns=columns)
-        df_db = df_db.reindex(columns=columns)
-
-        df_sheet.replace(['', ' '], pd.NA, inplace=True)
-        df_db.replace(['', ' '], pd.NA, inplace=True)
-
-        # Merge on three keys: CIK, Ticker, CompanyNameIssuer
+        # We only consider the first 27 columns
+        df_sheet = df_sheet.iloc[:, :27]
+        df_db = df_db.iloc[:, :27]
+
+        # Replace empty strings with NaN for comparison
+        df_sheet.replace('', pd.NA, inplace=True)
+        df_db.replace('', pd.NA, inplace=True)
+
+        # Merge on composite key (CIK, Ticker, CompanyNameIssuer)
         df_merged = pd.merge(
             df_sheet, df_db,
-            on=['CIK', 'Ticker', 'CompanyNameIssuer'],
+            left_on=[df_sheet.columns[18], df_sheet.columns[0], df_sheet.columns[2]],
+            right_on=[df_db.columns[18], df_db.columns[0], df_db.columns[2]],
             how='outer',
             suffixes=('_sheet', '_db'),
             indicator=True

@@ -30,24 +23,31 @@ def merge_dataframes(self, df_sheet, df_db):
         df_db_updates = pd.DataFrame()
         df_sheet_updates = pd.DataFrame()

-        for col in columns:
-            if col in ['CIK', 'Ticker', 'CompanyNameIssuer']:
+        for i in range(27):
+            if i in [0, 2, 18]:  # Skip keys
                 continue
-            col_sheet = col + '_sheet'
-            col_db = col + '_db'
+            col_sheet = df_sheet.columns[i] + '_sheet'
+            col_db = df_db.columns[i] + '_db'

             # Update database where DB has NaN and sheet has data
             condition_db_update = df_merged[col_db].isna() & df_merged[col_sheet].notna()
-            updates_db = df_merged.loc[condition_db_update, ['CIK', 'Ticker', 'CompanyNameIssuer', col_sheet]].rename(columns={col_sheet: col})
-            df_db_updates = pd.concat([df_db_updates, updates_db], ignore_index=True)
+            df_db_updates.loc[condition_db_update, i] = df_merged.loc[condition_db_update, col_sheet]

             # Update sheet where sheet has NaN and DB has data
             condition_sheet_update = df_merged[col_sheet].isna() & df_merged[col_db].notna()
-            updates_sheet = df_merged.loc[condition_sheet_update, ['CIK', 'Ticker', 'CompanyNameIssuer', col_db]].rename(columns={col_db: col})
-            df_sheet_updates = pd.concat([df_sheet_updates, updates_sheet], ignore_index=True)
+            df_sheet_updates.loc[condition_sheet_update, i] = df_merged.loc[condition_sheet_update, col_db]

-        # Remove duplicates in updates
-        df_db_updates = df_db_updates.drop_duplicates(subset=['CIK', 'Ticker', 'CompanyNameIssuer', col])
-        df_sheet_updates = df_sheet_updates.drop_duplicates(subset=['CIK', 'Ticker', 'CompanyNameIssuer', col])
+        # Include primary keys
+        df_db_updates[0] = df_merged[df_sheet.columns[0] + '_sheet']
+        df_db_updates[2] = df_merged[df_sheet.columns[2] + '_sheet']
+        df_db_updates[18] = df_merged[df_sheet.columns[18] + '_sheet']
+
+        df_sheet_updates[0] = df_merged[df_sheet.columns[0] + '_sheet']
+        df_sheet_updates[2] = df_merged[df_sheet.columns[2] + '_sheet']
+        df_sheet_updates[18] = df_merged[df_sheet.columns[18] + '_sheet']
+
+        # Reorder columns
+        df_db_updates = df_db_updates.sort_index(axis=1)
+        df_sheet_updates = df_sheet_updates.sort_index(axis=1)

         return df_db_updates, df_sheet_updates
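The per-column loop above reduces to two boolean masks over the merged frame: push sheet values where the DB side is missing, and DB values where the sheet side is missing. A standalone sketch of that logic (column names invented for illustration):

import pandas as pd

# One suffixed column pair as it would appear after the outer merge.
merged = pd.DataFrame({
    "Exchange_sheet": [pd.NA, "NYSE", "AMEX"],
    "Exchange_db":    ["NASDAQ", pd.NA, "AMEX"],
})

# Sheet has data where the DB does not -> candidate DB update (row 1).
db_needs_update = merged["Exchange_db"].isna() & merged["Exchange_sheet"].notna()
# DB has data where the sheet does not -> candidate sheet update (row 0).
sheet_needs_update = merged["Exchange_sheet"].isna() & merged["Exchange_db"].notna()

print(merged.loc[db_needs_update, "Exchange_sheet"])    # values to write to the DB
print(merged.loc[sheet_needs_update, "Exchange_db"])    # values to write to the sheet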
19 changes: 9 additions & 10 deletions src/scripts/database_utils.py
@@ -60,23 +60,22 @@ def update_database(self, df_updates):
         conn = sqlite3.connect(self.db_file_path)
         cursor = conn.cursor()
         for index, row in df_updates.iterrows():
-            CIK = row['CIK']
-            Ticker = row['Ticker']
-            CompanyNameIssuer = row['CompanyNameIssuer']
-            columns_to_update = [col for col in df_updates.columns if col not in ['CIK', 'Ticker', 'CompanyNameIssuer'] and pd.notna(row[col])]
-            set_clause = ', '.join([f"{col} = ?" for col in columns_to_update])
-            values = [row[col] for col in columns_to_update]
+            CIK = row.iloc[18]  # 19th column
+            Ticker = row.iloc[0]  # 1st column
+            CompanyNameIssuer = row.iloc[2]  # 3rd column
+            columns_to_update = [i for i in range(27) if i not in [0, 2, 18] and pd.notna(row.iloc[i])]
+            set_clause = ', '.join([f"col{i+1} = ?" for i in columns_to_update])
+            values = [row.iloc[i] for i in columns_to_update]
             values.extend([CIK, Ticker, CompanyNameIssuer])

             if set_clause:
                 sql = f"UPDATE full_database_backend SET {set_clause} WHERE CIK = ? AND Ticker = ? AND CompanyNameIssuer = ?"
                 cursor.execute(sql, values)
                 if cursor.rowcount == 0:
                     # Insert new record
-                    columns = ['CIK', 'Ticker', 'CompanyNameIssuer'] + columns_to_update
-                    placeholders = ', '.join(['?'] * len(columns))
-                    insert_values = [row[col] for col in columns]
-                    sql_insert = f"INSERT INTO full_database_backend ({', '.join(columns)}) VALUES ({placeholders})"
+                    placeholders = ', '.join(['?'] * 27)
+                    insert_values = [row.iloc[i] if pd.notna(row.iloc[i]) else '' for i in range(27)]
+                    sql_insert = f"INSERT INTO full_database_backend VALUES ({placeholders})"
                     cursor.execute(sql_insert, insert_values)
             else:
                 print(f"No updates for record with CIK={CIK}, Ticker={Ticker}, CompanyNameIssuer={CompanyNameIssuer}")
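The database path is a plain update-then-insert upsert: try the UPDATE, and fall back to INSERT when cursor.rowcount reports no match. A minimal sketch, assuming (as the col{i+1} SET clause implies) a table whose 27 columns are named col1 through col27; the schema and values here are illustrative, not the project's actual ones:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE full_database_backend ({})".format(
    ", ".join(f"col{i + 1} TEXT" for i in range(27))))

record = ["GME", "NYSE", "GameStop Corp."] + [""] * 24  # hypothetical 27-cell row

# Keys assumed at positions 0, 2, and 18 (col1, col3, col19 in 1-based naming).
cur.execute(
    "UPDATE full_database_backend SET col2 = ? WHERE col1 = ? AND col3 = ? AND col19 = ?",
    (record[1], record[0], record[2], record[18]))

if cur.rowcount == 0:  # nothing matched: insert the full 27-column row instead
    cur.execute(
        "INSERT INTO full_database_backend VALUES ({})".format(", ".join(["?"] * 27)),
        record)
conn.commit()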
117 changes: 27 additions & 90 deletions src/scripts/google_sheet_utils.py
@@ -8,14 +8,10 @@ def __init__(self, sheet_id, creds_json):

     def read_sheet_to_dataframe(self, worksheet_name):
         worksheet = self.sheet.worksheet(worksheet_name)
         # Read only the first 27 columns
         data = worksheet.get_all_values()
-        headers = data[0]
-        records = data[1:]
-
-        # Only consider the first 27 columns
-        headers = headers[:27]
-        records = [row[:27] + ['']*(27 - len(row)) for row in records]  # Ensure each row has 27 elements
 
+        headers = data[0][:27]  # First 27 headers
+        records = [row[:27] for row in data[1:]]  # First 27 columns of data
         df_sheet = pd.DataFrame(records, columns=headers)
         return df_sheet
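Slicing the raw get_all_values() matrix positionally is what lets read_sheet_to_dataframe ignore header differences. A toy illustration of the new read path, with the worksheet faked as a list of lists and 3 standing in for the real 27:

import pandas as pd

data = [
    ["Ticker", "Exchange", "CompanyNameIssuer", "Extra1"],  # header row plus an extra column
    ["GME", "NYSE", "GameStop Corp.", "x"],
    ["KOSS", "NASDAQ"],  # short row, as ragged sheets often have
]

N = 3  # stand-in for the real 27
headers = data[0][:N]
records = [row[:N] for row in data[1:]]
df_sheet = pd.DataFrame(records, columns=headers)  # pandas pads the short row with None
print(df_sheet)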

@@ -25,47 +21,39 @@ def update_google_sheet(self, worksheet_name, df_updates):
             return
 
         worksheet = self.sheet.worksheet(worksheet_name)
-        headers = worksheet.row_values(1)
         data = worksheet.get_all_values()
-        records = data[1:]
-
-        # Only consider the first 27 columns
-        headers_27 = headers[:27]
-        df_sheet_all = pd.DataFrame(records, columns=headers)
-        df_sheet_all = df_sheet_all.iloc[:, :27]  # Only first 27 columns
-        df_sheet_all['Row_Number'] = range(2, len(df_sheet_all) + 2)
-
-        # Map column names using the provided headers
-        column_mapping = self.get_column_mapping(headers_27)
-        df_sheet_all.rename(columns=column_mapping, inplace=True)
-
-        # Handle primary keys
-        df_sheet_all['CIK'] = df_sheet_all['CIK'].fillna('')
-        df_sheet_all['Ticker'] = df_sheet_all['Ticker'].fillna('')
-        df_sheet_all['CompanyNameIssuer'] = df_sheet_all['CompanyNameIssuer'].fillna('')
-
-        key_to_row = df_sheet_all.set_index(['CIK', 'Ticker', 'CompanyNameIssuer'])['Row_Number'].to_dict()
+        # Use indexes since headers might differ
+        records = data[1:]  # Exclude header
+        total_rows = len(records)
+        total_cols = len(data[0])
+
+        # Build key to row mapping
+        key_to_row = {}
+        for idx, row in enumerate(records):
+            # Ensure row has at least 27 columns
+            row = row + [''] * (27 - len(row))
+            CIK = row[18]  # 19th column
+            Ticker = row[0]  # 1st column
+            CompanyNameIssuer = row[2]  # 3rd column
+            key = (CIK, Ticker, CompanyNameIssuer)
+            key_to_row[key] = idx + 2  # Row numbers start from 2
 
         updates = []
         new_rows = []
         for index, row in df_updates.iterrows():
-            CIK = row['CIK']
-            Ticker = row['Ticker']
-            CompanyNameIssuer = row['CompanyNameIssuer']
+            CIK = row.iloc[18]  # 19th column
+            Ticker = row.iloc[0]  # 1st column
+            CompanyNameIssuer = row.iloc[2]  # 3rd column
             key = (CIK, Ticker, CompanyNameIssuer)
             if key in key_to_row:
                 row_number = key_to_row[key]
-                for col in df_updates.columns:
-                    if col not in ['CIK', 'Ticker', 'CompanyNameIssuer'] and pd.notna(row[col]):
-                        # Find the correct column index in the sheet
-                        sheet_col_name = [k for k, v in column_mapping.items() if v == col][0]
-                        if sheet_col_name in headers:
-                            col_index = headers.index(sheet_col_name) + 1  # 1-based indexing
-                            cell = gspread.Cell(row_number, col_index, row[col])
-                            updates.append(cell)
+                for col_idx in range(27):
+                    if pd.notna(row.iloc[col_idx]):
+                        cell = gspread.Cell(row_number, col_idx + 1, row.iloc[col_idx])
+                        updates.append(cell)
             else:
-                # Append a new row
-                new_row = [row.get(column_mapping.get(col, col), '') if pd.notna(row.get(col, '')) else '' for col in headers_27]
+                # Append new row
+                new_row = [row.iloc[i] if pd.notna(row.iloc[i]) else '' for i in range(27)]
                 new_rows.append(new_row)
 
         if updates:
@@ -75,54 +63,3 @@ def update_google_sheet(self, worksheet_name, df_updates):
         if new_rows:
             worksheet.append_rows(new_rows, value_input_option='USER_ENTERED')
             print(f"Added {len(new_rows)} new rows to Google Sheet.")
-
-    def get_column_mapping(self, sheet_headers):
-        # Mapping between Google Sheet headers and database columns
-        mapping = {
-            'Ticker': 'Ticker',
-            'Exchange': 'Exchange',
-            'Company Name/Issuer': 'CompanyNameIssuer',
-            'Transfer Agent': 'TransferAgent',
-            'Online Purchase?': 'OnlinePurchase',
-            'DTC Member #': 'DTCMemberNum',
-            'TA URL': 'TAURL',
-            'Transfer Agent %': 'TransferAgentPct',
-            'IR Emails': 'IREmails',
-            'IR Phone #': 'IRPhoneNum',
-            'IR /Company Address': 'IRCompanyAddress',
-            'IR URL': 'IRURL',
-            'IR Contact Info': 'IRContactInfo',
-            'Shares Outstanding': 'SharesOutstanding',
-            'CUSIP': 'CUSIP',
-            'Company Info URL': 'CompanyInfoURL',
-            'Company Info': 'CompanyInfo',
-            'Full Progress %': 'FullProgressPct',
-            'CIK': 'CIK',
-            'DRS': 'DRS',
-            "% of Shares DRS'd": 'PercentSharesDRSd',
-            'Submission Received': 'SubmissionReceived',
-            'Timestamps (UTC)': 'TimestampsUTC',
-            'Learn More about DRS': 'LearnMoreAboutDRS',
-            'Certificates offered?': 'CertificatesOffered',
-            'S&P 500?': 'SandP500',
-            'Incorporated in:': 'IncorporatedIn'
-        }
-
-        # Reverse mapping to handle headers not in mapping
-        sheet_to_db = {}
-        for header in sheet_headers:
-            if header in mapping:
-                sheet_to_db[header] = mapping[header]
-            else:
-                # Normalize and attempt to match
-                normalized_header = header.strip().lower().replace(' ', '').replace('_', '').replace('?', '').replace('#', '').replace('/', '')
-                for sheet_header, db_column in mapping.items():
-                    normalized_sheet_header = sheet_header.strip().lower().replace(' ', '').replace('_', '').replace('?', '').replace('#', '').replace('/', '')
-                    if normalized_header == normalized_sheet_header:
-                        sheet_to_db[header] = db_column
-                        break
-                else:
-                    # If no match found, map header to itself
-                    sheet_to_db[header] = header
-
-        return sheet_to_db
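On the sheet side, the script batches its writes: changed cells are collected as gspread.Cell objects and flushed in one update_cells call, and brand-new records go through a single append_rows call. A minimal sketch of that pattern (sheet ID, worksheet name, credentials path, and values are all placeholders):

import gspread

gc = gspread.service_account(filename="creds.json")  # hypothetical credentials file
worksheet = gc.open_by_key("SHEET_ID").worksheet("Sheet1")

# One API call for many cell edits instead of one call per cell.
updates = [
    gspread.Cell(2, 2, "NYSE"),          # row 2, column 2 (both 1-based)
    gspread.Cell(3, 19, "0001234567"),   # row 3, column 19
]
worksheet.update_cells(updates, value_input_option="USER_ENTERED")

# New records appended in bulk, each padded to the full 27 cells.
new_rows = [["GME", "NYSE", "GameStop Corp."] + [""] * 24]
worksheet.append_rows(new_rows, value_input_option="USER_ENTERED")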
