We can create specific unit and integration tests to ensure that each stage of your data pipeline works correctly. Below are examples of tests that could be written for each of the critical components in the project:
This test ensures that the `adzuna.py` scraper correctly fetches and processes data.
```python
import unittest

from scripts.web_scraping.adzuna import fetch_adzuna_data

class TestAdzunaScraper(unittest.TestCase):
    def test_fetch_adzuna_data(self):
        # Assuming fetch_adzuna_data returns a list of dictionaries
        data = fetch_adzuna_data()
        self.assertIsInstance(data, list)
        self.assertGreater(len(data), 0)
        for job in data:
            self.assertIn('title', job)
            self.assertIn('company', job)
            self.assertIn('location', job)
            self.assertIn('salary', job)

if __name__ == '__main__':
    unittest.main()
```
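As written, this test calls the live Adzuna API, which makes it slow and flaky in CI. If `fetch_adzuna_data` uses the `requests` library internally (an assumption about the scraper's internals), you could patch the HTTP call with `unittest.mock` so the test runs offline. A minimal sketch, with a hypothetical payload shape:

```python
import unittest
from unittest.mock import MagicMock, patch

from scripts.web_scraping.adzuna import fetch_adzuna_data

class TestAdzunaScraperOffline(unittest.TestCase):
    # Assumes adzuna.py calls requests.get; adjust the patch target to your module
    @patch('scripts.web_scraping.adzuna.requests.get')
    def test_fetch_adzuna_data_offline(self, mock_get):
        # Stub a minimal Adzuna-style JSON payload (hypothetical shape)
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            'results': [{
                'title': 'Data Scientist',
                'company': {'display_name': 'Company A'},
                'location': {'display_name': 'Berlin'},
                'salary_min': 60000,
            }]
        }
        mock_get.return_value = mock_response
        data = fetch_adzuna_data()
        self.assertEqual(len(data), 1)
        self.assertIn('title', data[0])

if __name__ == '__main__':
    unittest.main()
```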
Similar to the Adzuna test, this one verifies the data returned by the `muse.py` scraper.
```python
import unittest

from scripts.web_scraping.muse import fetch_muse_data

class TestMuseScraper(unittest.TestCase):
    def test_fetch_muse_data(self):
        data = fetch_muse_data()
        self.assertIsInstance(data, list)
        self.assertGreater(len(data), 0)
        for job in data:
            self.assertIn('title', job)
            self.assertIn('company', job)
            self.assertIn('location', job)
            self.assertIn('salary', job)

if __name__ == '__main__':
    unittest.main()
```
This test ensures that the ETL process correctly processes the scraped data.
```python
import unittest

from scripts.etl.etlscript import process_data

class TestETLProcess(unittest.TestCase):
    def test_process_data(self):
        # Mocked raw data
        raw_data = [{'title': 'Data Scientist', 'company': 'Company A',
                     'location': 'Berlin', 'salary': '60,000'}]
        processed_data = process_data(raw_data)
        self.assertIsInstance(processed_data, list)
        self.assertGreater(len(processed_data), 0)
        for job in processed_data:
            self.assertIn('title', job)
            self.assertIn('company', job)
            self.assertIn('location', job)
            self.assertIn('salary', job)
            self.assertIsInstance(job['salary'], int)  # Assuming ETL converts salary to an integer

if __name__ == '__main__':
    unittest.main()
```
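The integer assertion above implies a salary-cleaning step somewhere in the ETL. For illustration, here is a hypothetical helper (not your actual implementation) showing the kind of conversion `process_data` would need to perform for that assertion to pass:

```python
def normalize_salary(raw_salary):
    """Convert a salary value like '60,000' into an integer (hypothetical helper)."""
    if isinstance(raw_salary, int):
        return raw_salary
    # Drop thousands separators, currency symbols, and whitespace before converting
    digits = ''.join(ch for ch in str(raw_salary) if ch.isdigit())
    return int(digits) if digits else 0

assert normalize_salary('60,000') == 60000
assert normalize_salary(75000) == 75000
```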
This test checks that the database connection is established correctly.
```python
import unittest

from scripts.database.db_connection import create_connection

class TestDBConnection(unittest.TestCase):
    def test_create_connection(self):
        conn = create_connection()
        self.assertIsNotNone(conn)
        self.assertTrue(conn.is_connected())

if __name__ == '__main__':
    unittest.main()
```
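Note that `is_connected()` is MySQL Connector/Python's API, so this test assumes `create_connection` returns a `mysql.connector` connection. A sketch of what that function might look like, with placeholder credentials (in practice, read them from the environment or a config file):

```python
import mysql.connector  # assumes MySQL Connector/Python is the driver in use

def create_connection():
    # Hypothetical sketch; host, user, and database names are placeholders
    return mysql.connector.connect(
        host='localhost',
        user='pipeline_user',
        password='change-me',
        database='jobs',
    )
```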
This test ensures that data flows correctly through the entire pipeline, from scraping to processing to loading.
```python
import unittest

from scripts.web_scraping.adzuna import fetch_adzuna_data
from scripts.etl.etlscript import process_data
from scripts.database.db_connection import create_connection, load_data_to_db

class TestFullPipeline(unittest.TestCase):
    def test_full_pipeline(self):
        raw_data = fetch_adzuna_data()
        self.assertGreater(len(raw_data), 0)
        processed_data = process_data(raw_data)
        self.assertGreater(len(processed_data), 0)
        conn = create_connection()
        success = load_data_to_db(conn, processed_data)
        self.assertTrue(success)

if __name__ == '__main__':
    unittest.main()
```
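Since this integration test inserts real rows, it is safest to keep its writes inside a transaction and roll them back afterwards. A sketch using `setUp`/`tearDown`, assuming the connection exposes standard `autocommit`/`rollback` semantics and that `load_data_to_db` does not commit on its own (both assumptions about your code):

```python
import unittest

from scripts.database.db_connection import create_connection, load_data_to_db

class TestLoadRollsBack(unittest.TestCase):
    def setUp(self):
        self.conn = create_connection()
        self.conn.autocommit = False  # keep the test's writes inside one transaction

    def tearDown(self):
        self.conn.rollback()  # discard anything the test inserted
        self.conn.close()

    def test_load_data_to_db(self):
        sample = [{'title': 'Data Scientist', 'company': 'Company A',
                   'location': 'Berlin', 'salary': 60000}]
        self.assertTrue(load_data_to_db(self.conn, sample))
```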
This test checks that the full pipeline completes within a time budget, flagging performance regressions.
```python
import timeit

def test_pipeline_performance():
    setup_code = """
from scripts.web_scraping.adzuna import fetch_adzuna_data
from scripts.etl.etlscript import process_data
from scripts.database.db_connection import create_connection, load_data_to_db
"""
    test_code = """
raw_data = fetch_adzuna_data()
processed_data = process_data(raw_data)
conn = create_connection()
load_data_to_db(conn, processed_data)
"""
    execution_time = timeit.timeit(stmt=test_code, setup=setup_code, number=1)
    assert execution_time < 5  # Arbitrary 5-second budget; tune it to your environment

if __name__ == '__main__':
    test_pipeline_performance()
```
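If that assertion fails, per-stage timing narrows down which step is slow. A minimal sketch using only the standard library:

```python
import time

from scripts.web_scraping.adzuna import fetch_adzuna_data
from scripts.etl.etlscript import process_data
from scripts.database.db_connection import create_connection, load_data_to_db

def time_stage(label, func, *args):
    """Run one pipeline stage and report how long it took."""
    start = time.perf_counter()
    result = func(*args)
    print(f'{label}: {time.perf_counter() - start:.2f}s')
    return result

if __name__ == '__main__':
    raw_data = time_stage('scrape', fetch_adzuna_data)
    processed_data = time_stage('etl', process_data, raw_data)
    conn = create_connection()
    time_stage('load', load_data_to_db, conn, processed_data)
```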
In your `ci.yml` file, you should ensure these tests run automatically with each commit. Below is a sample configuration for running these tests using GitHub Actions:
```yaml
name: CI Pipeline

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Run tests
        run: |
          pytest
```
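One caveat: the scraper and database tests above need live services, which tend to be flaky on shared CI runners. A common pattern is to gate them behind an environment variable so CI runs only the fast unit tests by default. A sketch, where `RUN_INTEGRATION` is a suggested convention rather than something your project already defines:

```python
import os

import pytest

# Hypothetical convention: set RUN_INTEGRATION=1 locally to enable slow tests
requires_services = pytest.mark.skipif(
    os.environ.get('RUN_INTEGRATION') != '1',
    reason='needs live APIs and a database',
)

@requires_services
def test_full_pipeline_live():
    from scripts.web_scraping.adzuna import fetch_adzuna_data
    assert len(fetch_adzuna_data()) > 0
```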
These tests cover the key stages of your data pipeline, from web scraping through ETL to database storage. Running them in a CI/CD pipeline means every change is vetted before deployment, reducing the risk of errors reaching production. The same approach extends naturally to the other components of your project.