Skip to content

Commit

Permalink
Implement conflict resolution by choosing the source with more filled cells in case of data conflicts between Google Sheet and database.
Browse files: browse the repository at this point in the history.
  • Loading branch information
JamesAlfonse authored Dec 5, 2024
1 parent 81ba078 commit b752289
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 23 deletions.
66 changes: 57 additions & 9 deletions src/scripts/database_utils.py
Original file line number Diff line number Diff line change
def update_database(self, data):
    """Merge rows read from the Google Sheet into ``full_database_backend``.

    A row is identified by the key ``(CIK, Ticker, CompanyNameIssuer)``,
    i.e. positions 18, 0 and 2 of the 27-column layout. For a key that is
    already stored, the sheet row replaces the stored row only when it has
    strictly more non-blank cells; otherwise the stored row is kept. A row
    with an unseen key is inserted.

    Args:
        data: Iterable of sheet rows (sequences of cell values). Each row
            is padded with ``''`` or truncated to exactly 27 cells.

    Raises:
        sqlite3.Error: Propagated from SQLite. The final commit is not
            reached in that case, and the connection is still closed.
    """
    columns = (
        'Ticker', 'Exchange', 'CompanyNameIssuer', 'TransferAgent', 'OnlinePurchase', 'DTCMemberNum', 'TAURL',
        'TransferAgentPct', 'IREmails', 'IRPhoneNum', 'IRCompanyAddress', 'IRURL', 'IRContactInfo',
        'SharesOutstanding', 'CUSIP', 'CompanyInfoURL', 'CompanyInfo', 'FullProgressPct', 'CIK', 'DRS',
        'PercentSharesDRSd', 'SubmissionReceived', 'TimestampsUTC', 'LearnMoreAboutDRS', 'CertificatesOffered',
        'SandP500', 'IncorporatedIn',
    )
    width = len(columns)
    column_sql = ', '.join(columns)
    marks = ', '.join(['?'] * width)
    # Columns are selected by name so that positions 18/0/2 are guaranteed to
    # be CIK/Ticker/CompanyNameIssuer regardless of the table's column order.
    select_sql = f'SELECT {column_sql} FROM full_database_backend'
    insert_sql = f'INSERT INTO full_database_backend ({column_sql}) VALUES ({marks})'
    # NOTE(review): INSERT OR REPLACE only overwrites the stored row when the
    # table declares a uniqueness constraint covering the key -- confirm
    # against the table's CREATE statement.
    replace_sql = f'INSERT OR REPLACE INTO full_database_backend ({column_sql}) VALUES ({marks})'

    def count_filled(cells):
        # str() keeps non-string cells (numbers read back from SQLite) from
        # crashing on .strip(); None and '' count as empty.
        return sum(1 for cell in cells if cell and str(cell).strip())

    conn = sqlite3.connect(self.db_file_path)
    try:
        cursor = conn.cursor()

        # Fetch existing data from the database, indexed by key.
        cursor.execute(select_sql)
        existing_data = {(stored[18], stored[0], stored[2]): stored for stored in cursor.fetchall()}

        for sheet_row in data:
            # Ensure row has exactly 27 elements: truncate, then pad.
            row = list(sheet_row)[:width]
            row += [''] * (width - len(row))
            key = (row[18], row[0], row[2])  # CIK, Ticker, CompanyNameIssuer

            existing_row = existing_data.get(key)
            if existing_row is None:
                # Insert the new row from Google Sheet into the database
                cursor.execute(insert_sql, row)
                # Remember it so a repeated key later in this batch is
                # compared against this row instead of inserted again.
                existing_data[key] = tuple(row)
                print(f"Inserted new row into database for key {key}.")
            elif count_filled(row) > count_filled(existing_row):
                # Update the database with the Google Sheet row
                cursor.execute(replace_sql, row)
                existing_data[key] = tuple(row)
                print(f"Updated row in database for key {key} with data from Google Sheet.")
            else:
                # Keep the existing database row
                print(f"Kept existing database row for key {key}.")

        conn.commit()
    finally:
        conn.close()
    print("Database updated successfully.")

def read_database_data(self):
    """Return every row of the ``full_database_backend`` table.

    Returns:
        The result of ``cursor.fetchall()`` for ``SELECT *``: a list with
        one entry per stored row (empty when the table has no rows).

    Raises:
        sqlite3.Error: Propagated from SQLite, e.g. when the table does not
            exist. The connection is closed in that case as well.
    """
    conn = sqlite3.connect(self.db_file_path)
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM full_database_backend')
        return cursor.fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

def export_database_to_json(self, json_file_path):
    """Dump the ``full_database_backend`` table to a JSON file.

    The file holds a JSON array with one object per row, keyed by the
    column names reported by the cursor. It is written as UTF-8 with
    non-ASCII characters left unescaped and a 4-space indent.

    Args:
        json_file_path: Path of the JSON file to create or overwrite.

    Raises:
        sqlite3.Error: Propagated from SQLite when the query fails.
        OSError: When the output file cannot be opened or written.
    """
    conn = sqlite3.connect(self.db_file_path)
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM full_database_backend')
        rows = cursor.fetchall()
        column_names = [description[0] for description in cursor.description]
    finally:
        # Everything needed is in memory now; close before touching the
        # filesystem so a failed write cannot leave the connection open.
        conn.close()

    data_json = [dict(zip(column_names, row)) for row in rows]
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(data_json, f, ensure_ascii=False, indent=4)
    print(f"Exported database to {json_file_path}")
46 changes: 32 additions & 14 deletions src/scripts/google_sheet_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import gspread
import pandas as pd

class GoogleSheetHandler:
def __init__(self, sheet_id, creds_json):
Expand All @@ -8,41 +7,60 @@ def __init__(self, sheet_id, creds_json):

def read_sheet_data(self, worksheet_name):
    """Return the data rows of a worksheet, limited to the first 27 columns.

    Args:
        worksheet_name: Name passed to ``self.sheet.worksheet`` to select
            the worksheet.

    Returns:
        A list of rows, each a list of at most 27 cell values. The first
        row returned by ``get_all_values()`` is treated as the header and
        dropped. Rows shorter than 27 cells are returned as-is, not padded.
    """
    worksheet = self.sheet.worksheet(worksheet_name)
    # Read all data from the worksheet
    data = worksheet.get_all_values()
    # Only process the first 27 columns; skip the header row
    return [row[:27] for row in data[1:]]

def update_google_sheet(self, worksheet_name, db_data):
worksheet = self.sheet.worksheet(worksheet_name)
# Get existing data to build key mapping
existing_data = worksheet.get_all_values()
existing_data = [row[:27] for row in existing_data]
existing_data = [row[:27] for row in existing_data] # Only first 27 columns
headers = existing_data[0]
records = existing_data[1:]
key_to_row = {}
for idx, row in enumerate(records):
row = row + [''] * (27 - len(row))
CIK, Ticker, CompanyNameIssuer = row[18], row[0], row[2]
CIK = row[18]
Ticker = row[0]
CompanyNameIssuer = row[2]
key = (CIK, Ticker, CompanyNameIssuer)
key_to_row[key] = idx + 2 # Adjust for header row

updates = []
for db_row in db_data:
CIK, Ticker, CompanyNameIssuer = db_row[18], db_row[0], db_row[2]
for row in db_data:
# Ensure row has exactly 27 elements
row = list(row)
row = row + [''] * (27 - len(row))
CIK = row[18]
Ticker = row[0]
CompanyNameIssuer = row[2]
key = (CIK, Ticker, CompanyNameIssuer)
if key in key_to_row:
row_number = key_to_row[key]
# Get the existing row from Google Sheet
sheet_row = worksheet.row_values(row_number)
sheet_row = sheet_row + [''] * (27 - len(sheet_row))

# Resolve conflict by selecting source with more filled cells
sheet_filled = sum(1 for cell in sheet_row if cell)
db_filled = sum(1 for cell in db_row if cell)
if db_filled > sheet_filled:
updates.extend(
[gspread.Cell(row_number, col + 1, db_row[col]) for col in range(27) if sheet_row[col] != db_row[col]]
)

# Compare the number of non-empty cells
db_non_empty = sum(1 for cell in row if str(cell).strip())
sheet_non_empty = sum(1 for cell in sheet_row if cell.strip())

if db_non_empty > sheet_non_empty:
# Update the entire row in Google Sheet with data from database
cell_list = worksheet.range(row_number, 1, row_number, 27)
for i, cell in enumerate(cell_list):
cell.value = row[i] if row[i] else ''
updates.extend(cell_list)
print(f"Updated row {row_number} in Google Sheet for key {key} with data from database.")
else:
# Keep the existing Google Sheet row
print(f"Kept existing Google Sheet row for key {key}.")
else:
new_row = [db_row[i] if db_row[i] else '' for i in range(27)]
# Append new row
new_row = [row[i] if row[i] else '' for i in range(27)]
worksheet.append_row(new_row, value_input_option='USER_ENTERED')
print(f"Added new row for key {key} to Google Sheet.")

Expand Down

0 comments on commit b752289

Please sign in to comment.