diff --git a/src/scripts/data_merger.py b/src/scripts/data_merger.py
index 909ba1f..9c144fa 100644
--- a/src/scripts/data_merger.py
+++ b/src/scripts/data_merger.py
@@ -2,26 +2,19 @@ class DataMerger:
     def merge_dataframes(self, df_sheet, df_db):
-        # Define the columns of interest (first 27 columns)
-        columns = [
-            'Ticker', 'Exchange', 'CompanyNameIssuer', 'TransferAgent', 'OnlinePurchase', 'DTCMemberNum', 'TAURL',
-            'TransferAgentPct', 'IREmails', 'IRPhoneNum', 'IRCompanyAddress', 'IRURL', 'IRContactInfo',
-            'SharesOutstanding', 'CUSIP', 'CompanyInfoURL', 'CompanyInfo', 'FullProgressPct', 'CIK', 'DRS',
-            'PercentSharesDRSd', 'SubmissionReceived', 'TimestampsUTC', 'LearnMoreAboutDRS', 'CertificatesOffered',
-            'SandP500', 'IncorporatedIn'
-        ]
-
-        # Ensure the DataFrames have the correct columns
-        df_sheet = df_sheet.reindex(columns=columns)
-        df_db = df_db.reindex(columns=columns)
-
-        df_sheet.replace(['', ' '], pd.NA, inplace=True)
-        df_db.replace(['', ' '], pd.NA, inplace=True)
-
-        # Merge on three keys: CIK, Ticker, CompanyNameIssuer
+        # We only consider the first 27 columns
+        df_sheet = df_sheet.iloc[:, :27]
+        df_db = df_db.iloc[:, :27]
+
+        # Replace empty strings with NaN for comparison
+        df_sheet.replace('', pd.NA, inplace=True)
+        df_db.replace('', pd.NA, inplace=True)
+
+        # Merge on composite key (CIK, Ticker, CompanyNameIssuer)
         df_merged = pd.merge(
             df_sheet, df_db,
-            on=['CIK', 'Ticker', 'CompanyNameIssuer'],
+            left_on=[df_sheet.columns[18], df_sheet.columns[0], df_sheet.columns[2]],
+            right_on=[df_db.columns[18], df_db.columns[0], df_db.columns[2]],
             how='outer',
             suffixes=('_sheet', '_db'),
             indicator=True
@@ -30,24 +23,31 @@ def merge_dataframes(self, df_sheet, df_db):
         df_db_updates = pd.DataFrame()
         df_sheet_updates = pd.DataFrame()

-        for col in columns:
-            if col in ['CIK', 'Ticker', 'CompanyNameIssuer']:
+        for i in range(27):
+            if i in [0, 2, 18]:  # Skip keys
                 continue
-            col_sheet = col + '_sheet'
-            col_db = col + '_db'
+            col_sheet = df_sheet.columns[i] + '_sheet'
+            col_db = df_db.columns[i] + '_db'

             # Update database where DB has NaN and sheet has data
             condition_db_update = df_merged[col_db].isna() & df_merged[col_sheet].notna()
-            updates_db = df_merged.loc[condition_db_update, ['CIK', 'Ticker', 'CompanyNameIssuer', col_sheet]].rename(columns={col_sheet: col})
-            df_db_updates = pd.concat([df_db_updates, updates_db], ignore_index=True)
+            df_db_updates.loc[condition_db_update, i] = df_merged.loc[condition_db_update, col_sheet]

             # Update sheet where sheet has NaN and DB has data
             condition_sheet_update = df_merged[col_sheet].isna() & df_merged[col_db].notna()
-            updates_sheet = df_merged.loc[condition_sheet_update, ['CIK', 'Ticker', 'CompanyNameIssuer', col_db]].rename(columns={col_db: col})
-            df_sheet_updates = pd.concat([df_sheet_updates, updates_sheet], ignore_index=True)
+            df_sheet_updates.loc[condition_sheet_update, i] = df_merged.loc[condition_sheet_update, col_db]
+
+        # Include primary keys
+        df_db_updates[0] = df_merged[df_sheet.columns[0] + '_sheet']
+        df_db_updates[2] = df_merged[df_sheet.columns[2] + '_sheet']
+        df_db_updates[18] = df_merged[df_sheet.columns[18] + '_sheet']
+
+        df_sheet_updates[0] = df_merged[df_sheet.columns[0] + '_sheet']
+        df_sheet_updates[2] = df_merged[df_sheet.columns[2] + '_sheet']
+        df_sheet_updates[18] = df_merged[df_sheet.columns[18] + '_sheet']

-        # Remove duplicates in updates
-        df_db_updates = df_db_updates.drop_duplicates(subset=['CIK', 'Ticker', 'CompanyNameIssuer', col])
-        df_sheet_updates = df_sheet_updates.drop_duplicates(subset=['CIK', 'Ticker', 'CompanyNameIssuer', col])
+        # Reorder columns
+        df_db_updates = df_db_updates.sort_index(axis=1)
+        df_sheet_updates = df_sheet_updates.sort_index(axis=1)

         return df_db_updates, df_sheet_updates
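
Reviewer note on data_merger.py: the merge now keys on column positions (0 = Ticker, 2 = CompanyNameIssuer, 18 = CIK) instead of names, so it silently assumes both frames carry the 27 canonical columns in exactly that order. Also worth noting: for rows present only on the DB side of the outer merge, the '_sheet' key columns used in the "Include primary keys" block come back as NaN. A minimal guard along these lines could fail fast when the layout assumption breaks (KEY_POSITIONS and check_key_layout are illustrative names, not part of this patch):

    import pandas as pd

    # Assumed layout, mirroring the positions used in the patch.
    KEY_POSITIONS = {0: 'Ticker', 2: 'CompanyNameIssuer', 18: 'CIK'}

    def check_key_layout(df: pd.DataFrame) -> None:
        # Fail fast if the frame is narrower than 27 columns or a key
        # column is not where the positional merge expects it.
        if df.shape[1] < 27:
            raise ValueError(f"expected >= 27 columns, got {df.shape[1]}")
        for pos, label in KEY_POSITIONS.items():
            if df.columns[pos] != label:
                raise ValueError(f"column {pos} is {df.columns[pos]!r}, expected {label!r}")
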
diff --git a/src/scripts/database_utils.py b/src/scripts/database_utils.py
index 64b7155..2a5f6e3 100644
--- a/src/scripts/database_utils.py
+++ b/src/scripts/database_utils.py
@@ -60,12 +60,12 @@ def update_database(self, df_updates):
         conn = sqlite3.connect(self.db_file_path)
         cursor = conn.cursor()
         for index, row in df_updates.iterrows():
-            CIK = row['CIK']
-            Ticker = row['Ticker']
-            CompanyNameIssuer = row['CompanyNameIssuer']
-            columns_to_update = [col for col in df_updates.columns if col not in ['CIK', 'Ticker', 'CompanyNameIssuer'] and pd.notna(row[col])]
-            set_clause = ', '.join([f"{col} = ?" for col in columns_to_update])
-            values = [row[col] for col in columns_to_update]
+            CIK = row.iloc[18]  # 19th column
+            Ticker = row.iloc[0]  # 1st column
+            CompanyNameIssuer = row.iloc[2]  # 3rd column
+            columns_to_update = [i for i in range(27) if i not in [0, 2, 18] and pd.notna(row.iloc[i])]
+            set_clause = ', '.join([f"col{i+1} = ?" for i in columns_to_update])
+            values = [row.iloc[i] for i in columns_to_update]
             values.extend([CIK, Ticker, CompanyNameIssuer])

             if set_clause:
@@ -73,10 +73,9 @@ def update_database(self, df_updates):
                 cursor.execute(sql, values)
                 if cursor.rowcount == 0:
                     # Insert new record
-                    columns = ['CIK', 'Ticker', 'CompanyNameIssuer'] + columns_to_update
-                    placeholders = ', '.join(['?'] * len(columns))
-                    insert_values = [row[col] for col in columns]
-                    sql_insert = f"INSERT INTO full_database_backend ({', '.join(columns)}) VALUES ({placeholders})"
+                    placeholders = ', '.join(['?'] * 27)
+                    insert_values = [row.iloc[i] if pd.notna(row.iloc[i]) else '' for i in range(27)]
+                    sql_insert = f"INSERT INTO full_database_backend VALUES ({placeholders})"
                     cursor.execute(sql_insert, insert_values)
             else:
                 print(f"No updates for record with CIK={CIK}, Ticker={Ticker}, CompanyNameIssuer={CompanyNameIssuer}")
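
Reviewer note on database_utils.py: the write path keeps the UPDATE-then-INSERT pattern — run a parameterized UPDATE first, then fall back to an INSERT when cursor.rowcount reports that no row matched. Two things to confirm against the schema: the new set_clause emits placeholders named col1 … col27, which only works if full_database_backend really uses those generic column names, and the bare INSERT INTO full_database_backend VALUES (...) requires the table to have exactly 27 columns in sheet order. A stripped-down sketch of the same flow against a hypothetical two-column kv table:

    import sqlite3

    def upsert(conn: sqlite3.Connection, key: str, value: str) -> None:
        # Parameterized UPDATE first; INSERT only when nothing matched.
        cur = conn.cursor()
        cur.execute("UPDATE kv SET value = ? WHERE key = ?", (value, key))
        if cur.rowcount == 0:
            cur.execute("INSERT INTO kv (key, value) VALUES (?, ?)", (key, value))
        conn.commit()

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE kv (key TEXT PRIMARY KEY, value TEXT)")
    upsert(conn, "0001234567", "GME")   # no match -> inserts
    upsert(conn, "0001234567", "GME*")  # match -> updates in place
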
diff --git a/src/scripts/google_sheet_utils.py b/src/scripts/google_sheet_utils.py
index ddbb511..0801aa8 100644
--- a/src/scripts/google_sheet_utils.py
+++ b/src/scripts/google_sheet_utils.py
@@ -8,14 +8,10 @@ def __init__(self, sheet_id, creds_json):

     def read_sheet_to_dataframe(self, worksheet_name):
         worksheet = self.sheet.worksheet(worksheet_name)
+        # Read only the first 27 columns
        data = worksheet.get_all_values()
-        headers = data[0]
-        records = data[1:]
-
-        # Only consider the first 27 columns
-        headers = headers[:27]
-        records = [row[:27] + ['']*(27 - len(row)) for row in records]  # Ensure each row has 27 elements
-
+        headers = data[0][:27]  # First 27 headers
+        records = [row[:27] for row in data[1:]]  # First 27 columns of data
         df_sheet = pd.DataFrame(records, columns=headers)
         return df_sheet

@@ -25,47 +21,39 @@ def update_google_sheet(self, worksheet_name, df_updates):
             return

         worksheet = self.sheet.worksheet(worksheet_name)
-        headers = worksheet.row_values(1)
         data = worksheet.get_all_values()
-        records = data[1:]
-
-        # Only consider the first 27 columns
-        headers_27 = headers[:27]
-        df_sheet_all = pd.DataFrame(records, columns=headers)
-        df_sheet_all = df_sheet_all.iloc[:, :27]  # Only first 27 columns
-        df_sheet_all['Row_Number'] = range(2, len(df_sheet_all) + 2)
-
-        # Map column names using the provided headers
-        column_mapping = self.get_column_mapping(headers_27)
-        df_sheet_all.rename(columns=column_mapping, inplace=True)
-
-        # Handle primary keys
-        df_sheet_all['CIK'] = df_sheet_all['CIK'].fillna('')
-        df_sheet_all['Ticker'] = df_sheet_all['Ticker'].fillna('')
-        df_sheet_all['CompanyNameIssuer'] = df_sheet_all['CompanyNameIssuer'].fillna('')
-
-        key_to_row = df_sheet_all.set_index(['CIK', 'Ticker', 'CompanyNameIssuer'])['Row_Number'].to_dict()
+        # Use indexes since headers might differ
+        records = data[1:]  # Exclude header
+        total_rows = len(records)
+        total_cols = len(data[0])
+
+        # Build key to row mapping
+        key_to_row = {}
+        for idx, row in enumerate(records):
+            # Ensure row has at least 27 columns
+            row = row + [''] * (27 - len(row))
+            CIK = row[18]  # 19th column
+            Ticker = row[0]  # 1st column
+            CompanyNameIssuer = row[2]  # 3rd column
+            key = (CIK, Ticker, CompanyNameIssuer)
+            key_to_row[key] = idx + 2  # Row numbers start from 2

         updates = []
         new_rows = []

         for index, row in df_updates.iterrows():
-            CIK = row['CIK']
-            Ticker = row['Ticker']
-            CompanyNameIssuer = row['CompanyNameIssuer']
+            CIK = row.iloc[18]  # 19th column
+            Ticker = row.iloc[0]  # 1st column
+            CompanyNameIssuer = row.iloc[2]  # 3rd column
             key = (CIK, Ticker, CompanyNameIssuer)

             if key in key_to_row:
                 row_number = key_to_row[key]
-                for col in df_updates.columns:
-                    if col not in ['CIK', 'Ticker', 'CompanyNameIssuer'] and pd.notna(row[col]):
-                        # Find the correct column index in the sheet
-                        sheet_col_name = [k for k, v in column_mapping.items() if v == col][0]
-                        if sheet_col_name in headers:
-                            col_index = headers.index(sheet_col_name) + 1  # 1-based indexing
-                            cell = gspread.Cell(row_number, col_index, row[col])
-                            updates.append(cell)
+                for col_idx in range(27):
+                    if pd.notna(row.iloc[col_idx]):
+                        cell = gspread.Cell(row_number, col_idx + 1, row.iloc[col_idx])
+                        updates.append(cell)
             else:
-                # Append a new row
-                new_row = [row.get(column_mapping.get(col, col), '') if pd.notna(row.get(col, '')) else '' for col in headers_27]
+                # Append new row
+                new_row = [row.iloc[i] if pd.notna(row.iloc[i]) else '' for i in range(27)]
                 new_rows.append(new_row)

         if updates:
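
Reviewer note: key_to_row is built from the same fixed positions as the merge, and when two sheet rows share a (CIK, Ticker, CompanyNameIssuer) key, the later row silently wins; total_rows and total_cols are also currently unused. Distilled into a standalone helper (illustrative names, not part of the patch), the mapping loop above amounts to:

    def build_key_to_row(records, key_positions=(18, 0, 2)):
        # Map (CIK, Ticker, CompanyNameIssuer) -> 1-based sheet row number;
        # row 1 holds the header, so the first data row is row 2.
        key_to_row = {}
        for idx, row in enumerate(records):
            padded = row + [''] * (27 - len(row))
            key = tuple(padded[p] for p in key_positions)
            key_to_row[key] = idx + 2  # duplicate keys: last occurrence wins
        return key_to_row
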
@@ -75,54 +63,3 @@ def update_google_sheet(self, worksheet_name, df_updates):
         if new_rows:
             worksheet.append_rows(new_rows, value_input_option='USER_ENTERED')
             print(f"Added {len(new_rows)} new rows to Google Sheet.")
-
-    def get_column_mapping(self, sheet_headers):
-        # Mapping between Google Sheet headers and database columns
-        mapping = {
-            'Ticker': 'Ticker',
-            'Exchange': 'Exchange',
-            'Company Name/Issuer': 'CompanyNameIssuer',
-            'Transfer Agent': 'TransferAgent',
-            'Online Purchase?': 'OnlinePurchase',
-            'DTC Member #': 'DTCMemberNum',
-            'TA URL': 'TAURL',
-            'Transfer Agent %': 'TransferAgentPct',
-            'IR Emails': 'IREmails',
-            'IR Phone #': 'IRPhoneNum',
-            'IR /Company Address': 'IRCompanyAddress',
-            'IR URL': 'IRURL',
-            'IR Contact Info': 'IRContactInfo',
-            'Shares Outstanding': 'SharesOutstanding',
-            'CUSIP': 'CUSIP',
-            'Company Info URL': 'CompanyInfoURL',
-            'Company Info': 'CompanyInfo',
-            'Full Progress %': 'FullProgressPct',
-            'CIK': 'CIK',
-            'DRS': 'DRS',
-            "% of Shares DRS'd": 'PercentSharesDRSd',
-            'Submission Received': 'SubmissionReceived',
-            'Timestamps (UTC)': 'TimestampsUTC',
-            'Learn More about DRS': 'LearnMoreAboutDRS',
-            'Certificates offered?': 'CertificatesOffered',
-            'S&P 500?': 'SandP500',
-            'Incorporated in:': 'IncorporatedIn'
-        }
-
-        # Reverse mapping to handle headers not in mapping
-        sheet_to_db = {}
-        for header in sheet_headers:
-            if header in mapping:
-                sheet_to_db[header] = mapping[header]
-            else:
-                # Normalize and attempt to match
-                normalized_header = header.strip().lower().replace(' ', '').replace('_', '').replace('?', '').replace('#', '').replace('/', '')
-                for sheet_header, db_column in mapping.items():
-                    normalized_sheet_header = sheet_header.strip().lower().replace(' ', '').replace('_', '').replace('?', '').replace('#', '').replace('/', '')
-                    if normalized_header == normalized_sheet_header:
-                        sheet_to_db[header] = db_column
-                        break
-                else:
-                    # If no match found, map header to itself
-                    sheet_to_db[header] = header
-
-        return sheet_to_db
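
Reviewer note: dropping get_column_mapping removes the fuzzy header-to-column fallback, so the sheet's physical column order is now the only contract between the sheet and the database. If header drift still needs to be detected somewhere, the removed normalization boils down to a helper like this (kept for reference only; not part of the patch):

    def normalize_header(header: str) -> str:
        # Collapse the spacing/punctuation variants the old mapping tolerated,
        # e.g. 'IR Phone #' and 'ir_phone' both normalize to 'irphone'.
        for ch in (' ', '_', '?', '#', '/'):
            header = header.replace(ch, '')
        return header.strip().lower()
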