
Commit

Updated synchronization scripts to handle differences in Google Sheet schema

The scripts now process only the first 27 columns using column indexes, accommodating sheets with more columns and differing header names. Ensured that headers are preserved and only intended columns are affected during synchronization. Adjusted data merging logic to use column positions, and updated database interactions accordingly.
JamesAlfonse authored Dec 4, 2024
1 parent ce42e52 commit ac418dc
Showing 3 changed files with 65 additions and 129 deletions.
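At its core, the change swaps header-name lookups for positional ones. The positional-key merge that the new data_merger.py relies on looks roughly like this in isolation (a minimal sketch with invented three-column frames; the real scripts key on positions 0, 2, and 18 of 27 columns):

import pandas as pd

# Headers differ between the two sources, but the positions line up:
# position 0 holds the ticker and position 2 the company name.
df_sheet = pd.DataFrame({"Ticker": ["GME"], "Exch": [pd.NA], "Name": ["GameStop Corp."]})
df_db = pd.DataFrame({"ticker": ["GME"], "Exchange": ["NYSE"], "CompanyNameIssuer": ["GameStop Corp."]})

df_merged = pd.merge(
    df_sheet, df_db,
    left_on=[df_sheet.columns[0], df_sheet.columns[2]],
    right_on=[df_db.columns[0], df_db.columns[2]],
    how="outer",
    suffixes=("_sheet", "_db"),
    indicator=True,  # adds a _merge column recording which source each row came from
)
print(df_merged)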
58 changes: 29 additions & 29 deletions src/scripts/data_merger.py
@@ -2,26 +2,19 @@

 class DataMerger:
     def merge_dataframes(self, df_sheet, df_db):
-        # Define the columns of interest (first 27 columns)
-        columns = [
-            'Ticker', 'Exchange', 'CompanyNameIssuer', 'TransferAgent', 'OnlinePurchase', 'DTCMemberNum', 'TAURL',
-            'TransferAgentPct', 'IREmails', 'IRPhoneNum', 'IRCompanyAddress', 'IRURL', 'IRContactInfo',
-            'SharesOutstanding', 'CUSIP', 'CompanyInfoURL', 'CompanyInfo', 'FullProgressPct', 'CIK', 'DRS',
-            'PercentSharesDRSd', 'SubmissionReceived', 'TimestampsUTC', 'LearnMoreAboutDRS', 'CertificatesOffered',
-            'SandP500', 'IncorporatedIn'
-        ]
-
-        # Ensure the DataFrames have the correct columns
-        df_sheet = df_sheet.reindex(columns=columns)
-        df_db = df_db.reindex(columns=columns)
-
-        df_sheet.replace(['', ' '], pd.NA, inplace=True)
-        df_db.replace(['', ' '], pd.NA, inplace=True)
-
-        # Merge on three keys: CIK, Ticker, CompanyNameIssuer
+        # We only consider the first 27 columns
+        df_sheet = df_sheet.iloc[:, :27]
+        df_db = df_db.iloc[:, :27]
+
+        # Replace empty strings with NaN for comparison
+        df_sheet.replace('', pd.NA, inplace=True)
+        df_db.replace('', pd.NA, inplace=True)
+
+        # Merge on composite key (CIK, Ticker, CompanyNameIssuer)
         df_merged = pd.merge(
             df_sheet, df_db,
-            on=['CIK', 'Ticker', 'CompanyNameIssuer'],
+            left_on=[df_sheet.columns[18], df_sheet.columns[0], df_sheet.columns[2]],
+            right_on=[df_db.columns[18], df_db.columns[0], df_db.columns[2]],
             how='outer',
             suffixes=('_sheet', '_db'),
             indicator=True

@@ -30,24 +23,31 @@ def merge_dataframes(self, df_sheet, df_db):
         df_db_updates = pd.DataFrame()
         df_sheet_updates = pd.DataFrame()

-        for col in columns:
-            if col in ['CIK', 'Ticker', 'CompanyNameIssuer']:
+        for i in range(27):
+            if i in [0, 2, 18]:  # Skip keys
                 continue
-            col_sheet = col + '_sheet'
-            col_db = col + '_db'
+            col_sheet = df_sheet.columns[i] + '_sheet'
+            col_db = df_db.columns[i] + '_db'

             # Update database where DB has NaN and sheet has data
             condition_db_update = df_merged[col_db].isna() & df_merged[col_sheet].notna()
-            updates_db = df_merged.loc[condition_db_update, ['CIK', 'Ticker', 'CompanyNameIssuer', col_sheet]].rename(columns={col_sheet: col})
-            df_db_updates = pd.concat([df_db_updates, updates_db], ignore_index=True)
+            df_db_updates.loc[condition_db_update, i] = df_merged.loc[condition_db_update, col_sheet]

             # Update sheet where sheet has NaN and DB has data
             condition_sheet_update = df_merged[col_sheet].isna() & df_merged[col_db].notna()
-            updates_sheet = df_merged.loc[condition_sheet_update, ['CIK', 'Ticker', 'CompanyNameIssuer', col_db]].rename(columns={col_db: col})
-            df_sheet_updates = pd.concat([df_sheet_updates, updates_sheet], ignore_index=True)
+            df_sheet_updates.loc[condition_sheet_update, i] = df_merged.loc[condition_sheet_update, col_db]

-        # Remove duplicates in updates
-        df_db_updates = df_db_updates.drop_duplicates(subset=['CIK', 'Ticker', 'CompanyNameIssuer', col])
-        df_sheet_updates = df_sheet_updates.drop_duplicates(subset=['CIK', 'Ticker', 'CompanyNameIssuer', col])
+        # Include primary keys
+        df_db_updates[0] = df_merged[df_sheet.columns[0] + '_sheet']
+        df_db_updates[2] = df_merged[df_sheet.columns[2] + '_sheet']
+        df_db_updates[18] = df_merged[df_sheet.columns[18] + '_sheet']
+
+        df_sheet_updates[0] = df_merged[df_sheet.columns[0] + '_sheet']
+        df_sheet_updates[2] = df_merged[df_sheet.columns[2] + '_sheet']
+        df_sheet_updates[18] = df_merged[df_sheet.columns[18] + '_sheet']
+
+        # Reorder columns
+        df_db_updates = df_db_updates.sort_index(axis=1)
+        df_sheet_updates = df_sheet_updates.sort_index(axis=1)

         return df_db_updates, df_sheet_updates
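The per-column loop above reduces to two boolean masks over the merged frame: push sheet values where the DB side is missing, and DB values where the sheet side is missing. A standalone sketch of that logic (column names invented for illustration):

import pandas as pd

# One suffixed column pair as it would appear after the outer merge.
merged = pd.DataFrame({
    "Exchange_sheet": [pd.NA, "NYSE", "AMEX"],
    "Exchange_db":    ["NASDAQ", pd.NA, "AMEX"],
})

# Sheet has data where the DB does not -> candidate DB update (row 1).
db_needs_update = merged["Exchange_db"].isna() & merged["Exchange_sheet"].notna()
# DB has data where the sheet does not -> candidate sheet update (row 0).
sheet_needs_update = merged["Exchange_sheet"].isna() & merged["Exchange_db"].notna()

print(merged.loc[db_needs_update, "Exchange_sheet"])    # values to write to the DB
print(merged.loc[sheet_needs_update, "Exchange_db"])    # values to write to the sheet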
19 changes: 9 additions & 10 deletions src/scripts/database_utils.py
@@ -60,23 +60,22 @@ def update_database(self, df_updates):
         conn = sqlite3.connect(self.db_file_path)
         cursor = conn.cursor()
         for index, row in df_updates.iterrows():
-            CIK = row['CIK']
-            Ticker = row['Ticker']
-            CompanyNameIssuer = row['CompanyNameIssuer']
-            columns_to_update = [col for col in df_updates.columns if col not in ['CIK', 'Ticker', 'CompanyNameIssuer'] and pd.notna(row[col])]
-            set_clause = ', '.join([f"{col} = ?" for col in columns_to_update])
-            values = [row[col] for col in columns_to_update]
+            CIK = row.iloc[18]  # 19th column
+            Ticker = row.iloc[0]  # 1st column
+            CompanyNameIssuer = row.iloc[2]  # 3rd column
+            columns_to_update = [i for i in range(27) if i not in [0, 2, 18] and pd.notna(row.iloc[i])]
+            set_clause = ', '.join([f"col{i+1} = ?" for i in columns_to_update])
+            values = [row.iloc[i] for i in columns_to_update]
             values.extend([CIK, Ticker, CompanyNameIssuer])

             if set_clause:
                 sql = f"UPDATE full_database_backend SET {set_clause} WHERE CIK = ? AND Ticker = ? AND CompanyNameIssuer = ?"
                 cursor.execute(sql, values)
                 if cursor.rowcount == 0:
                     # Insert new record
-                    columns = ['CIK', 'Ticker', 'CompanyNameIssuer'] + columns_to_update
-                    placeholders = ', '.join(['?'] * len(columns))
-                    insert_values = [row[col] for col in columns]
-                    sql_insert = f"INSERT INTO full_database_backend ({', '.join(columns)}) VALUES ({placeholders})"
+                    placeholders = ', '.join(['?'] * 27)
+                    insert_values = [row.iloc[i] if pd.notna(row.iloc[i]) else '' for i in range(27)]
+                    sql_insert = f"INSERT INTO full_database_backend VALUES ({placeholders})"
                     cursor.execute(sql_insert, insert_values)
             else:
                 print(f"No updates for record with CIK={CIK}, Ticker={Ticker}, CompanyNameIssuer={CompanyNameIssuer}")
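The database path is a plain update-then-insert upsert: try the UPDATE, and fall back to INSERT when cursor.rowcount reports no match. A minimal sketch, assuming (as the col{i+1} SET clause implies) a table whose 27 columns are named col1 through col27; the schema and values here are illustrative, not the project's actual ones:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE full_database_backend ({})".format(
    ", ".join(f"col{i + 1} TEXT" for i in range(27))))

record = ["GME", "NYSE", "GameStop Corp."] + [""] * 24  # hypothetical 27-cell row

# Keys assumed at positions 0, 2, and 18 (col1, col3, col19 in 1-based naming).
cur.execute(
    "UPDATE full_database_backend SET col2 = ? WHERE col1 = ? AND col3 = ? AND col19 = ?",
    (record[1], record[0], record[2], record[18]))

if cur.rowcount == 0:  # nothing matched: insert the full 27-column row instead
    cur.execute(
        "INSERT INTO full_database_backend VALUES ({})".format(", ".join(["?"] * 27)),
        record)
conn.commit()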
117 changes: 27 additions & 90 deletions src/scripts/google_sheet_utils.py
@@ -8,14 +8,10 @@ def __init__(self, sheet_id, creds_json):

     def read_sheet_to_dataframe(self, worksheet_name):
         worksheet = self.sheet.worksheet(worksheet_name)
         # Read only the first 27 columns
         data = worksheet.get_all_values()
-        headers = data[0]
-        records = data[1:]
-
-        # Only consider the first 27 columns
-        headers = headers[:27]
-        records = [row[:27] + ['']*(27 - len(row)) for row in records]  # Ensure each row has 27 elements
 
+        headers = data[0][:27]  # First 27 headers
+        records = [row[:27] for row in data[1:]]  # First 27 columns of data
         df_sheet = pd.DataFrame(records, columns=headers)
         return df_sheet
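Slicing the raw get_all_values() matrix positionally is what lets read_sheet_to_dataframe ignore header differences. A toy illustration of the new read path, with the worksheet faked as a list of lists and 3 standing in for the real 27:

import pandas as pd

data = [
    ["Ticker", "Exchange", "CompanyNameIssuer", "Extra1"],  # header row plus an extra column
    ["GME", "NYSE", "GameStop Corp.", "x"],
    ["KOSS", "NASDAQ"],  # short row, as ragged sheets often have
]

N = 3  # stand-in for the real 27
headers = data[0][:N]
records = [row[:N] for row in data[1:]]
df_sheet = pd.DataFrame(records, columns=headers)  # pandas pads the short row with None
print(df_sheet)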

@@ -25,47 +21,39 @@ def update_google_sheet(self, worksheet_name, df_updates):
             return
 
         worksheet = self.sheet.worksheet(worksheet_name)
-        headers = worksheet.row_values(1)
         data = worksheet.get_all_values()
-        records = data[1:]
-
-        # Only consider the first 27 columns
-        headers_27 = headers[:27]
-        df_sheet_all = pd.DataFrame(records, columns=headers)
-        df_sheet_all = df_sheet_all.iloc[:, :27]  # Only first 27 columns
-        df_sheet_all['Row_Number'] = range(2, len(df_sheet_all) + 2)
-
-        # Map column names using the provided headers
-        column_mapping = self.get_column_mapping(headers_27)
-        df_sheet_all.rename(columns=column_mapping, inplace=True)
-
-        # Handle primary keys
-        df_sheet_all['CIK'] = df_sheet_all['CIK'].fillna('')
-        df_sheet_all['Ticker'] = df_sheet_all['Ticker'].fillna('')
-        df_sheet_all['CompanyNameIssuer'] = df_sheet_all['CompanyNameIssuer'].fillna('')
-
-        key_to_row = df_sheet_all.set_index(['CIK', 'Ticker', 'CompanyNameIssuer'])['Row_Number'].to_dict()
+        # Use indexes since headers might differ
+        records = data[1:]  # Exclude header
+        total_rows = len(records)
+        total_cols = len(data[0])
+
+        # Build key to row mapping
+        key_to_row = {}
+        for idx, row in enumerate(records):
+            # Ensure row has at least 27 columns
+            row = row + [''] * (27 - len(row))
+            CIK = row[18]  # 19th column
+            Ticker = row[0]  # 1st column
+            CompanyNameIssuer = row[2]  # 3rd column
+            key = (CIK, Ticker, CompanyNameIssuer)
+            key_to_row[key] = idx + 2  # Row numbers start from 2
 
         updates = []
         new_rows = []
         for index, row in df_updates.iterrows():
-            CIK = row['CIK']
-            Ticker = row['Ticker']
-            CompanyNameIssuer = row['CompanyNameIssuer']
+            CIK = row.iloc[18]  # 19th column
+            Ticker = row.iloc[0]  # 1st column
+            CompanyNameIssuer = row.iloc[2]  # 3rd column
             key = (CIK, Ticker, CompanyNameIssuer)
             if key in key_to_row:
                 row_number = key_to_row[key]
-                for col in df_updates.columns:
-                    if col not in ['CIK', 'Ticker', 'CompanyNameIssuer'] and pd.notna(row[col]):
-                        # Find the correct column index in the sheet
-                        sheet_col_name = [k for k, v in column_mapping.items() if v == col][0]
-                        if sheet_col_name in headers:
-                            col_index = headers.index(sheet_col_name) + 1  # 1-based indexing
-                            cell = gspread.Cell(row_number, col_index, row[col])
-                            updates.append(cell)
+                for col_idx in range(27):
+                    if pd.notna(row.iloc[col_idx]):
+                        cell = gspread.Cell(row_number, col_idx + 1, row.iloc[col_idx])
+                        updates.append(cell)
             else:
-                # Append a new row
-                new_row = [row.get(column_mapping.get(col, col), '') if pd.notna(row.get(col, '')) else '' for col in headers_27]
+                # Append new row
+                new_row = [row.iloc[i] if pd.notna(row.iloc[i]) else '' for i in range(27)]
                 new_rows.append(new_row)
 
         if updates:
@@ -75,54 +63,3 @@ def update_google_sheet(self, worksheet_name, df_updates):
         if new_rows:
             worksheet.append_rows(new_rows, value_input_option='USER_ENTERED')
             print(f"Added {len(new_rows)} new rows to Google Sheet.")
-
-    def get_column_mapping(self, sheet_headers):
-        # Mapping between Google Sheet headers and database columns
-        mapping = {
-            'Ticker': 'Ticker',
-            'Exchange': 'Exchange',
-            'Company Name/Issuer': 'CompanyNameIssuer',
-            'Transfer Agent': 'TransferAgent',
-            'Online Purchase?': 'OnlinePurchase',
-            'DTC Member #': 'DTCMemberNum',
-            'TA URL': 'TAURL',
-            'Transfer Agent %': 'TransferAgentPct',
-            'IR Emails': 'IREmails',
-            'IR Phone #': 'IRPhoneNum',
-            'IR /Company Address': 'IRCompanyAddress',
-            'IR URL': 'IRURL',
-            'IR Contact Info': 'IRContactInfo',
-            'Shares Outstanding': 'SharesOutstanding',
-            'CUSIP': 'CUSIP',
-            'Company Info URL': 'CompanyInfoURL',
-            'Company Info': 'CompanyInfo',
-            'Full Progress %': 'FullProgressPct',
-            'CIK': 'CIK',
-            'DRS': 'DRS',
-            "% of Shares DRS'd": 'PercentSharesDRSd',
-            'Submission Received': 'SubmissionReceived',
-            'Timestamps (UTC)': 'TimestampsUTC',
-            'Learn More about DRS': 'LearnMoreAboutDRS',
-            'Certificates offered?': 'CertificatesOffered',
-            'S&P 500?': 'SandP500',
-            'Incorporated in:': 'IncorporatedIn'
-        }
-
-        # Reverse mapping to handle headers not in mapping
-        sheet_to_db = {}
-        for header in sheet_headers:
-            if header in mapping:
-                sheet_to_db[header] = mapping[header]
-            else:
-                # Normalize and attempt to match
-                normalized_header = header.strip().lower().replace(' ', '').replace('_', '').replace('?', '').replace('#', '').replace('/', '')
-                for sheet_header, db_column in mapping.items():
-                    normalized_sheet_header = sheet_header.strip().lower().replace(' ', '').replace('_', '').replace('?', '').replace('#', '').replace('/', '')
-                    if normalized_header == normalized_sheet_header:
-                        sheet_to_db[header] = db_column
-                        break
-                else:
-                    # If no match found, map header to itself
-                    sheet_to_db[header] = header
-
-        return sheet_to_db
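On the sheet side, the script batches its writes: changed cells are collected as gspread.Cell objects and flushed in one update_cells call, and brand-new records go through a single append_rows call. A minimal sketch of that pattern (sheet ID, worksheet name, credentials path, and values are all placeholders):

import gspread

gc = gspread.service_account(filename="creds.json")  # hypothetical credentials file
worksheet = gc.open_by_key("SHEET_ID").worksheet("Sheet1")

# One API call for many cell edits instead of one call per cell.
updates = [
    gspread.Cell(2, 2, "NYSE"),          # row 2, column 2 (both 1-based)
    gspread.Cell(3, 19, "0001234567"),   # row 3, column 19
]
worksheet.update_cells(updates, value_input_option="USER_ENTERED")

# New records appended in bulk, each padded to the full 27 cells.
new_rows = [["GME", "NYSE", "GameStop Corp."] + [""] * 24]
worksheet.append_rows(new_rows, value_input_option="USER_ENTERED")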
