From 046802e6cf3658494342158d6227021f7c8c7375 Mon Sep 17 00:00:00 2001 From: JamesAlfonse <120358302+JamesAlfonse@users.noreply.github.com> Date: Sun, 15 Dec 2024 19:07:14 -0500 Subject: [PATCH] The primary key is now (CIK, Ticker, CompanyNameIssuer) as requested. The ON CONFLICT clause has been removed, so new unique combinations will simply be inserted as new rows. If a combination (CIK, Ticker, CompanyNameIssuer) already exists, it will remain unchanged. Code variables and function names have been made more descriptive and consistent. The change_primary_key_to_cik.py file and the associated step in the GitHub Actions workflow have been removed. --- .github/workflows/SEC_CTEC_Data.yml | 46 +++++++ .github/workflows/SEC_Data.yml | 49 ------- src/scripts/SEC_company_tickers_exchange.py | 44 +++---- src/scripts/update_db_from_json.py | 139 +++++++++++++------- 4 files changed, 156 insertions(+), 122 deletions(-) create mode 100644 .github/workflows/SEC_CTEC_Data.yml delete mode 100644 .github/workflows/SEC_Data.yml diff --git a/.github/workflows/SEC_CTEC_Data.yml b/.github/workflows/SEC_CTEC_Data.yml new file mode 100644 index 0000000..56f2761 --- /dev/null +++ b/.github/workflows/SEC_CTEC_Data.yml @@ -0,0 +1,46 @@ +name: Update Database with SEC Data (CIK, Ticker, Exchange, Company Name) + +on: + schedule: + - cron: '30 23 * * *' # Runs every day at 23:30 + workflow_dispatch: # Allows manual triggering + +concurrency: + group: database-update + +jobs: + update-data: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install dependencies + run: pip install requests pandas gspread oauth2client + + - name: Pull SEC data and update repository + run: python src/scripts/SEC_company_tickers_exchange.py + + - name: Update Database from JSON + run: python src/scripts/update_db_from_json.py + + - name: Verify changes + run: | + echo "Checking for changes in data files..." + git status + + - name: Commit and push changes + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add data/company_tickers_exchange.json data/Full_Database_Backend.db + git commit -m "Updated database with SEC data" + git push + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/SEC_Data.yml b/.github/workflows/SEC_Data.yml deleted file mode 100644 index b3b1e2b..0000000 --- a/.github/workflows/SEC_Data.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: Update Database with SEC Data - -on: - schedule: - - cron: '0 0 * * *' # Runs every day at midnight - workflow_dispatch: # Allows manual triggering - -concurrency: - group: database-update - -jobs: - update-data: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.x' - - - name: Install dependencies - run: pip install requests pandas gspread oauth2client - - - name: Pull SEC data and update repository - run: python src/scripts/SEC_company_tickers_exchange.py - - - name: Change Primary Key in Database - run: python src/scripts/change_primary_key_to_cik.py - - - name: Update Database from JSON - run: python src/scripts/update_db_from_json.py - - - name: Verify changes - run: | - echo "Checking for changes in data files..." - git status - - - name: Commit and push changes - run: | - git config --global user.name 'JamesAlfonse' - git config --global user.email 'jamesalfonse@gmail.com' - git add data/company_tickers_exchange.json data/Full_Database_Backend.db - git commit -m "Updated database with SEC data" - git push - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/src/scripts/SEC_company_tickers_exchange.py b/src/scripts/SEC_company_tickers_exchange.py index 56e50af..83c552c 100644 --- a/src/scripts/SEC_company_tickers_exchange.py +++ b/src/scripts/SEC_company_tickers_exchange.py @@ -2,35 +2,33 @@ import os import time -# URL of the JSON file -url = "https://www.sec.gov/files/company_tickers_exchange.json" +# URL for the SEC JSON data +SEC_JSON_URL = "https://www.sec.gov/files/company_tickers_exchange.json" -# Path to the data folder -data_folder = "data" -os.makedirs(data_folder, exist_ok=True) +# Data folder and output file paths +DATA_FOLDER = "data" +os.makedirs(DATA_FOLDER, exist_ok=True) +OUTPUT_FILE = os.path.join(DATA_FOLDER, "company_tickers_exchange.json") -# Path to the output file -output_file = os.path.join(data_folder, "company_tickers_exchange.json") - -# Set headers to mimic a browser request responsibly -headers = { +# HTTP headers to mimic a browser and provide contact info +HEADERS = { "User-Agent": "MyAppName/1.0 (hi@WhyDRS.org)" } -# Rate limit parameters -max_requests_per_second = 10 -sleep_time = 1 / max_requests_per_second +# Rate limit configuration +MAX_REQUESTS_PER_SECOND = 10 +SLEEP_TIME = 1 / MAX_REQUESTS_PER_SECOND -# Function to download the JSON file -def download_file(url, headers, output_file): +def download_sec_data(url, headers, output_file): + """Download the SEC JSON data and save it to a file.""" response = requests.get(url, headers=headers) - response.raise_for_status() # Check that the request was successful - with open(output_file, "wb") as f: - f.write(response.content) - print(f"File saved to {output_file}") + response.raise_for_status() # Raise an error if the request failed + with open(output_file, "wb") as file: + file.write(response.content) + print(f"SEC data file saved to {output_file}") -# Download the JSON file -download_file(url, headers, output_file) +# Download the JSON data +download_sec_data(SEC_JSON_URL, HEADERS, OUTPUT_FILE) -# Sleep to respect rate limit -time.sleep(sleep_time) +# Sleep to respect rate limits +time.sleep(SLEEP_TIME) diff --git a/src/scripts/update_db_from_json.py b/src/scripts/update_db_from_json.py index ef3e23d..f3fa121 100644 --- a/src/scripts/update_db_from_json.py +++ b/src/scripts/update_db_from_json.py @@ -3,74 +3,113 @@ import json import sqlite3 -# Path to the JSON file -json_file_path = 'data/company_tickers_exchange.json' -db_file_path = 'data/Full_Database_Backend.db' +# File paths +JSON_FILE_PATH = 'data/company_tickers_exchange.json' +DB_FILE_PATH = 'data/Full_Database_Backend.db' -# Read the JSON file -with open(json_file_path, 'r') as json_file: - data = json.load(json_file) +# Read JSON data +with open(JSON_FILE_PATH, 'r') as json_file: + sec_data = json.load(json_file) -# Extract fields and data -fields = data['fields'] -records = data['data'] +fields = sec_data['fields'] +records = sec_data['data'] -# Convert to DataFrame +# Convert JSON records to a DataFrame df = pd.DataFrame(records, columns=fields) -# Replace NaN values with empty strings +# Replace NaN with empty strings for consistency df = df.fillna('') # Connect to the SQLite database -conn = sqlite3.connect(db_file_path) +conn = sqlite3.connect(DB_FILE_PATH) cursor = conn.cursor() -# Update the database with the JSON data -for index, row in df.iterrows(): +# Create table with the updated schema if it doesn't exist +# Primary key: (CIK, Ticker, CompanyNameIssuer) +cursor.execute(''' +CREATE TABLE IF NOT EXISTS full_database_backend ( + Ticker TEXT, + Exchange TEXT, + CompanyNameIssuer TEXT, + TransferAgent TEXT, + OnlinePurchase TEXT, + DTCMemberNum TEXT, + TAURL TEXT, + TransferAgentPct TEXT, + IREmails TEXT, + IRPhoneNum TEXT, + IRCompanyAddress TEXT, + IRURL TEXT, + IRContactInfo TEXT, + SharesOutstanding TEXT, + CUSIP TEXT, + CompanyInfoURL TEXT, + CompanyInfo TEXT, + FullProgressPct TEXT, + CIK TEXT, + DRS TEXT, + PercentSharesDRSd TEXT, + SubmissionReceived TEXT, + TimestampsUTC TEXT, + LearnMoreAboutDRS TEXT, + CertificatesOffered TEXT, + SandP500 TEXT, + IncorporatedIn TEXT, + PRIMARY KEY (CIK, Ticker, CompanyNameIssuer) +) +''') + +# Insert rows from DataFrame into the database +# No ON CONFLICT clause - if there's a new combination of (CIK, Ticker, CompanyNameIssuer), +# it will be added as a new row. +for _, row in df.iterrows(): + cik_value = row['cik'] + ticker_value = row['ticker'] + exchange_value = row['exchange'] + company_name_issuer_value = row['name'] + + # Insert a minimal set of values; other fields may remain empty at this stage cursor.execute(''' INSERT INTO full_database_backend (CIK, Ticker, Exchange, CompanyNameIssuer) VALUES (?, ?, ?, ?) - ON CONFLICT (CIK, Ticker) DO UPDATE SET - Exchange = excluded.Exchange, - CompanyNameIssuer = excluded.CompanyNameIssuer - ''', (row['cik'], row['ticker'], row['exchange'], row['name'])) + ''', (cik_value, ticker_value, exchange_value, company_name_issuer_value)) -# Replace NULL, blank, and single space values with empty strings +# Clean up whitespace and NULL-like values from all text fields cursor.execute(''' - UPDATE full_database_backend - SET - Ticker = IFNULL(NULLIF(TRIM(Ticker), ''), ''), - Exchange = IFNULL(NULLIF(TRIM(Exchange), ''), ''), - CompanyNameIssuer = IFNULL(NULLIF(TRIM(CompanyNameIssuer), ''), ''), - TransferAgent = IFNULL(NULLIF(TRIM(TransferAgent), ''), ''), - OnlinePurchase = IFNULL(NULLIF(TRIM(OnlinePurchase), ''), ''), - DTCMemberNum = IFNULL(NULLIF(TRIM(DTCMemberNum), ''), ''), - TAURL = IFNULL(NULLIF(TRIM(TAURL), ''), ''), - TransferAgentPct = IFNULL(NULLIF(TRIM(TransferAgentPct), ''), ''), - IREmails = IFNULL(NULLIF(TRIM(IREmails), ''), ''), - IRPhoneNum = IFNULL(NULLIF(TRIM(IRPhoneNum), ''), ''), - IRCompanyAddress = IFNULL(NULLIF(TRIM(IRCompanyAddress), ''), ''), - IRURL = IFNULL(NULLIF(TRIM(IRURL), ''), ''), - IRContactInfo = IFNULL(NULLIF(TRIM(IRContactInfo), ''), ''), - SharesOutstanding = IFNULL(NULLIF(TRIM(SharesOutstanding), ''), ''), - CUSIP = IFNULL(NULLIF(TRIM(CUSIP), ''), ''), - CompanyInfoURL = IFNULL(NULLIF(TRIM(CompanyInfoURL), ''), ''), - CompanyInfo = IFNULL(NULLIF(TRIM(CompanyInfo), ''), ''), - FullProgressPct = IFNULL(NULLIF(TRIM(FullProgressPct), ''), ''), - CIK = IFNULL(NULLIF(TRIM(CIK), ''), ''), - DRS = IFNULL(NULLIF(TRIM(DRS), ''), ''), - PercentSharesDRSd = IFNULL(NULLIF(TRIM(PercentSharesDRSd), ''), ''), - SubmissionReceived = IFNULL(NULLIF(TRIM(SubmissionReceived), ''), ''), - TimestampsUTC = IFNULL(NULLIF(TRIM(TimestampsUTC), ''), ''), - LearnMoreAboutDRS = IFNULL(NULLIF(TRIM(LearnMoreAboutDRS), ''), ''), - CertificatesOffered = IFNULL(NULLIF(TRIM(CertificatesOffered), ''), ''), - SandP500 = IFNULL(NULLIF(TRIM(SandP500), ''), ''), - IncorporatedIn = IFNULL(NULLIF(TRIM(IncorporatedIn), ''), '') +UPDATE full_database_backend +SET + Ticker = IFNULL(NULLIF(TRIM(Ticker), ''), ''), + Exchange = IFNULL(NULLIF(TRIM(Exchange), ''), ''), + CompanyNameIssuer = IFNULL(NULLIF(TRIM(CompanyNameIssuer), ''), ''), + TransferAgent = IFNULL(NULLIF(TRIM(TransferAgent), ''), ''), + OnlinePurchase = IFNULL(NULLIF(TRIM(OnlinePurchase), ''), ''), + DTCMemberNum = IFNULL(NULLIF(TRIM(DTCMemberNum), ''), ''), + TAURL = IFNULL(NULLIF(TRIM(TAURL), ''), ''), + TransferAgentPct = IFNULL(NULLIF(TRIM(TransferAgentPct), ''), ''), + IREmails = IFNULL(NULLIF(TRIM(IREmails), ''), ''), + IRPhoneNum = IFNULL(NULLIF(TRIM(IRPhoneNum), ''), ''), + IRCompanyAddress = IFNULL(NULLIF(TRIM(IRCompanyAddress), ''), ''), + IRURL = IFNULL(NULLIF(TRIM(IRURL), ''), ''), + IRContactInfo = IFNULL(NULLIF(TRIM(IRContactInfo), ''), ''), + SharesOutstanding = IFNULL(NULLIF(TRIM(SharesOutstanding), ''), ''), + CUSIP = IFNULL(NULLIF(TRIM(CUSIP), ''), ''), + CompanyInfoURL = IFNULL(NULLIF(TRIM(CompanyInfoURL), ''), ''), + CompanyInfo = IFNULL(NULLIF(TRIM(CompanyInfo), ''), ''), + FullProgressPct = IFNULL(NULLIF(TRIM(FullProgressPct), ''), ''), + CIK = IFNULL(NULLIF(TRIM(CIK), ''), ''), + DRS = IFNULL(NULLIF(TRIM(DRS), ''), ''), + PercentSharesDRSd = IFNULL(NULLIF(TRIM(PercentSharesDRSd), ''), ''), + SubmissionReceived = IFNULL(NULLIF(TRIM(SubmissionReceived), ''), ''), + TimestampsUTC = IFNULL(NULLIF(TRIM(TimestampsUTC), ''), ''), + LearnMoreAboutDRS = IFNULL(NULLIF(TRIM(LearnMoreAboutDRS), ''), ''), + CertificatesOffered = IFNULL(NULLIF(TRIM(CertificatesOffered), ''), ''), + SandP500 = IFNULL(NULLIF(TRIM(SandP500), ''), ''), + IncorporatedIn = IFNULL(NULLIF(TRIM(IncorporatedIn), ''), '') ''') -# Commit the changes and close the connection +# Commit changes and close connection conn.commit() cursor.close() conn.close() -print(f"Database updated with data from {json_file_path}") +print(f"Database updated from {JSON_FILE_PATH} successfully.")