Skip to content

Commit

Permalink
The primary key is now (CIK, Ticker, CompanyNameIssuer) as requested.
Browse files Browse the repository at this point in the history
The ON CONFLICT clause has been removed, so new unique combinations will simply be inserted as new rows.
If a combination (CIK, Ticker, CompanyNameIssuer) already exists, it will remain unchanged.
Code variables and function names have been made more descriptive and consistent.
The change_primary_key_to_cik.py file and the associated step in the GitHub Actions workflow have been removed.
  • Loading branch information
JamesAlfonse authored Dec 16, 2024
1 parent 9de681a commit 046802e
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 122 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/SEC_CTEC_Data.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
name: Update Database with SEC Data (CIK, Ticker, Exchange, Company Name)

on:
schedule:
- cron: '30 23 * * *' # Runs every day at 23:30 UTC
workflow_dispatch: # Allows manual triggering

concurrency:
group: database-update

jobs:
update-data:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Install dependencies
run: pip install requests pandas gspread oauth2client

- name: Pull SEC data and update repository
run: python src/scripts/SEC_company_tickers_exchange.py

- name: Update Database from JSON
run: python src/scripts/update_db_from_json.py

- name: Verify changes
run: |
echo "Checking for changes in data files..."
git status
- name: Commit and push changes
run: |
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git add data/company_tickers_exchange.json data/Full_Database_Backend.db
git commit -m "Updated database with SEC data"
git push
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
49 changes: 0 additions & 49 deletions .github/workflows/SEC_Data.yml

This file was deleted.

44 changes: 21 additions & 23 deletions src/scripts/SEC_company_tickers_exchange.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,33 @@
import os
import time

# URL of the JSON file
url = "https://www.sec.gov/files/company_tickers_exchange.json"
# URL for the SEC JSON data
SEC_JSON_URL = "https://www.sec.gov/files/company_tickers_exchange.json"

# Path to the data folder
data_folder = "data"
os.makedirs(data_folder, exist_ok=True)
# Data folder and output file paths
DATA_FOLDER = "data"
os.makedirs(DATA_FOLDER, exist_ok=True)
OUTPUT_FILE = os.path.join(DATA_FOLDER, "company_tickers_exchange.json")

# Path to the output file
output_file = os.path.join(data_folder, "company_tickers_exchange.json")

# Set headers to mimic a browser request responsibly
headers = {
# HTTP headers: a descriptive User-Agent that identifies this client and provides contact info
HEADERS = {
"User-Agent": "MyAppName/1.0 ([email protected])"
}

# Rate limit parameters
max_requests_per_second = 10
sleep_time = 1 / max_requests_per_second
# Rate limit configuration
MAX_REQUESTS_PER_SECOND = 10
SLEEP_TIME = 1 / MAX_REQUESTS_PER_SECOND

# Function to download the JSON file
def download_file(url, headers, output_file):
def download_sec_data(url, headers, output_file):
"""Download the SEC JSON data and save it to a file."""
response = requests.get(url, headers=headers)
response.raise_for_status() # Check that the request was successful
with open(output_file, "wb") as f:
f.write(response.content)
print(f"File saved to {output_file}")
response.raise_for_status() # Raise an error if the request failed
with open(output_file, "wb") as file:
file.write(response.content)
print(f"SEC data file saved to {output_file}")

# Download the JSON file
download_file(url, headers, output_file)
# Download the JSON data
download_sec_data(SEC_JSON_URL, HEADERS, OUTPUT_FILE)

# Sleep to respect rate limit
time.sleep(sleep_time)
# Sleep to respect rate limits
time.sleep(SLEEP_TIME)
139 changes: 89 additions & 50 deletions src/scripts/update_db_from_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,74 +3,113 @@
import json
import sqlite3

# Path to the JSON file
json_file_path = 'data/company_tickers_exchange.json'
db_file_path = 'data/Full_Database_Backend.db'
# File paths
JSON_FILE_PATH = 'data/company_tickers_exchange.json'
DB_FILE_PATH = 'data/Full_Database_Backend.db'

# Read the JSON file
with open(json_file_path, 'r') as json_file:
data = json.load(json_file)
# Read JSON data
with open(JSON_FILE_PATH, 'r') as json_file:
sec_data = json.load(json_file)

# Extract fields and data
fields = data['fields']
records = data['data']
fields = sec_data['fields']
records = sec_data['data']

# Convert to DataFrame
# Convert JSON records to a DataFrame
df = pd.DataFrame(records, columns=fields)

# Replace NaN values with empty strings
# Replace NaN with empty strings for consistency
df = df.fillna('')

# Connect to the SQLite database
conn = sqlite3.connect(db_file_path)
conn = sqlite3.connect(DB_FILE_PATH)
cursor = conn.cursor()

# Update the database with the JSON data
for index, row in df.iterrows():
# Create table with the updated schema if it doesn't exist
# Primary key: (CIK, Ticker, CompanyNameIssuer)
cursor.execute('''
CREATE TABLE IF NOT EXISTS full_database_backend (
Ticker TEXT,
Exchange TEXT,
CompanyNameIssuer TEXT,
TransferAgent TEXT,
OnlinePurchase TEXT,
DTCMemberNum TEXT,
TAURL TEXT,
TransferAgentPct TEXT,
IREmails TEXT,
IRPhoneNum TEXT,
IRCompanyAddress TEXT,
IRURL TEXT,
IRContactInfo TEXT,
SharesOutstanding TEXT,
CUSIP TEXT,
CompanyInfoURL TEXT,
CompanyInfo TEXT,
FullProgressPct TEXT,
CIK TEXT,
DRS TEXT,
PercentSharesDRSd TEXT,
SubmissionReceived TEXT,
TimestampsUTC TEXT,
LearnMoreAboutDRS TEXT,
CertificatesOffered TEXT,
SandP500 TEXT,
IncorporatedIn TEXT,
PRIMARY KEY (CIK, Ticker, CompanyNameIssuer)
)
''')

# Insert rows from DataFrame into the database
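# No ON CONFLICT clause - a new combination of (CIK, Ticker, CompanyNameIssuer)
# is added as a new row.
# NOTE(review): with the PRIMARY KEY declared in the CREATE TABLE statement, re-inserting a
# combination that already exists would presumably raise sqlite3.IntegrityError rather than
# be skipped - confirm, and consider INSERT OR IGNORE if "remain unchanged" is the intent.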
for _, row in df.iterrows():
cik_value = row['cik']
ticker_value = row['ticker']
exchange_value = row['exchange']
company_name_issuer_value = row['name']

# Insert a minimal set of values; other fields may remain empty at this stage
cursor.execute('''
INSERT INTO full_database_backend (CIK, Ticker, Exchange, CompanyNameIssuer)
VALUES (?, ?, ?, ?)
ON CONFLICT (CIK, Ticker) DO UPDATE SET
Exchange = excluded.Exchange,
CompanyNameIssuer = excluded.CompanyNameIssuer
''', (row['cik'], row['ticker'], row['exchange'], row['name']))
''', (cik_value, ticker_value, exchange_value, company_name_issuer_value))

# Replace NULL, blank, and single space values with empty strings
# Clean up whitespace and NULL-like values from all text fields
cursor.execute('''
UPDATE full_database_backend
SET
Ticker = IFNULL(NULLIF(TRIM(Ticker), ''), ''),
Exchange = IFNULL(NULLIF(TRIM(Exchange), ''), ''),
CompanyNameIssuer = IFNULL(NULLIF(TRIM(CompanyNameIssuer), ''), ''),
TransferAgent = IFNULL(NULLIF(TRIM(TransferAgent), ''), ''),
OnlinePurchase = IFNULL(NULLIF(TRIM(OnlinePurchase), ''), ''),
DTCMemberNum = IFNULL(NULLIF(TRIM(DTCMemberNum), ''), ''),
TAURL = IFNULL(NULLIF(TRIM(TAURL), ''), ''),
TransferAgentPct = IFNULL(NULLIF(TRIM(TransferAgentPct), ''), ''),
IREmails = IFNULL(NULLIF(TRIM(IREmails), ''), ''),
IRPhoneNum = IFNULL(NULLIF(TRIM(IRPhoneNum), ''), ''),
IRCompanyAddress = IFNULL(NULLIF(TRIM(IRCompanyAddress), ''), ''),
IRURL = IFNULL(NULLIF(TRIM(IRURL), ''), ''),
IRContactInfo = IFNULL(NULLIF(TRIM(IRContactInfo), ''), ''),
SharesOutstanding = IFNULL(NULLIF(TRIM(SharesOutstanding), ''), ''),
CUSIP = IFNULL(NULLIF(TRIM(CUSIP), ''), ''),
CompanyInfoURL = IFNULL(NULLIF(TRIM(CompanyInfoURL), ''), ''),
CompanyInfo = IFNULL(NULLIF(TRIM(CompanyInfo), ''), ''),
FullProgressPct = IFNULL(NULLIF(TRIM(FullProgressPct), ''), ''),
CIK = IFNULL(NULLIF(TRIM(CIK), ''), ''),
DRS = IFNULL(NULLIF(TRIM(DRS), ''), ''),
PercentSharesDRSd = IFNULL(NULLIF(TRIM(PercentSharesDRSd), ''), ''),
SubmissionReceived = IFNULL(NULLIF(TRIM(SubmissionReceived), ''), ''),
TimestampsUTC = IFNULL(NULLIF(TRIM(TimestampsUTC), ''), ''),
LearnMoreAboutDRS = IFNULL(NULLIF(TRIM(LearnMoreAboutDRS), ''), ''),
CertificatesOffered = IFNULL(NULLIF(TRIM(CertificatesOffered), ''), ''),
SandP500 = IFNULL(NULLIF(TRIM(SandP500), ''), ''),
IncorporatedIn = IFNULL(NULLIF(TRIM(IncorporatedIn), ''), '')
UPDATE full_database_backend
SET
Ticker = IFNULL(NULLIF(TRIM(Ticker), ''), ''),
Exchange = IFNULL(NULLIF(TRIM(Exchange), ''), ''),
CompanyNameIssuer = IFNULL(NULLIF(TRIM(CompanyNameIssuer), ''), ''),
TransferAgent = IFNULL(NULLIF(TRIM(TransferAgent), ''), ''),
OnlinePurchase = IFNULL(NULLIF(TRIM(OnlinePurchase), ''), ''),
DTCMemberNum = IFNULL(NULLIF(TRIM(DTCMemberNum), ''), ''),
TAURL = IFNULL(NULLIF(TRIM(TAURL), ''), ''),
TransferAgentPct = IFNULL(NULLIF(TRIM(TransferAgentPct), ''), ''),
IREmails = IFNULL(NULLIF(TRIM(IREmails), ''), ''),
IRPhoneNum = IFNULL(NULLIF(TRIM(IRPhoneNum), ''), ''),
IRCompanyAddress = IFNULL(NULLIF(TRIM(IRCompanyAddress), ''), ''),
IRURL = IFNULL(NULLIF(TRIM(IRURL), ''), ''),
IRContactInfo = IFNULL(NULLIF(TRIM(IRContactInfo), ''), ''),
SharesOutstanding = IFNULL(NULLIF(TRIM(SharesOutstanding), ''), ''),
CUSIP = IFNULL(NULLIF(TRIM(CUSIP), ''), ''),
CompanyInfoURL = IFNULL(NULLIF(TRIM(CompanyInfoURL), ''), ''),
CompanyInfo = IFNULL(NULLIF(TRIM(CompanyInfo), ''), ''),
FullProgressPct = IFNULL(NULLIF(TRIM(FullProgressPct), ''), ''),
CIK = IFNULL(NULLIF(TRIM(CIK), ''), ''),
DRS = IFNULL(NULLIF(TRIM(DRS), ''), ''),
PercentSharesDRSd = IFNULL(NULLIF(TRIM(PercentSharesDRSd), ''), ''),
SubmissionReceived = IFNULL(NULLIF(TRIM(SubmissionReceived), ''), ''),
TimestampsUTC = IFNULL(NULLIF(TRIM(TimestampsUTC), ''), ''),
LearnMoreAboutDRS = IFNULL(NULLIF(TRIM(LearnMoreAboutDRS), ''), ''),
CertificatesOffered = IFNULL(NULLIF(TRIM(CertificatesOffered), ''), ''),
SandP500 = IFNULL(NULLIF(TRIM(SandP500), ''), ''),
IncorporatedIn = IFNULL(NULLIF(TRIM(IncorporatedIn), ''), '')
''')

# Commit the changes and close the connection
# Commit changes and close connection
conn.commit()
cursor.close()
conn.close()

print(f"Database updated with data from {json_file_path}")
print(f"Database updated from {JSON_FILE_PATH} successfully.")

0 comments on commit 046802e

Please sign in to comment.