From 31da54d7a91ba5781459cf5fce84a7a10dc0a16b Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Sat, 1 Feb 2025 16:49:29 -0800 Subject: [PATCH 1/5] fix: add detailed inline comments to improve code readability (#80) --- src/dsci524_group29_webscraping/save_data.py | 22 +++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/dsci524_group29_webscraping/save_data.py b/src/dsci524_group29_webscraping/save_data.py index 3b82749..1f49bb0 100644 --- a/src/dsci524_group29_webscraping/save_data.py +++ b/src/dsci524_group29_webscraping/save_data.py @@ -42,33 +42,45 @@ def save_data(data, format='csv', destination='output.csv'): - If the specified directory in the destination does not exist, a FileNotFoundError will be raised. """ # Validate the destination directory + # Check if the directory in the destination path exists dir_path = os.path.dirname(destination) if dir_path and not os.path.exists(dir_path): + # Raise an error if the directory does not exist raise FileNotFoundError(f"The directory {dir_path} does not exist.") - # Save as CSV + # Save data in CSV format if format == 'csv': + # Ensure the input data is a list of dictionaries if not isinstance(data, list) or not all(isinstance(item, dict) for item in data): raise ValueError("For CSV, data must be a list of dictionaries.") try: + # Open the destination file in write mode with open(destination, mode='w', newline='') as file: + # Create a CSV writer object writer = csv.DictWriter(file, fieldnames=data[0].keys()) - writer.writeheader() - writer.writerows(data) + writer.writeheader() # Write the header row + writer.writerows(data) # Write the data rows except Exception as e: + # Raise an error if CSV saving fails raise Exception(f"Failed to save CSV data: {e}") - # Save as JSON + # Save data in JSON format elif format == 'json': + # Ensure the input data is either a list or a dictionary if not isinstance(data, (list, dict)): raise ValueError("For JSON, data must be a list or a 
dictionary.") try: + # Open the destination file in write mode with open(destination, mode='w') as file: + # Write the JSON data to the file with indentation for readability json.dump(data, file, indent=4) except Exception as e: + # Raise an error if JSON saving fails raise Exception(f"Failed to save JSON data: {e}") + # Raise an error if the specified format is unsupported else: raise ValueError("Unsupported format. Use 'csv' or 'json'.") - return os.path.abspath(destination) + # Return the absolute path to the saved file + return os.path.abspath(destination) \ No newline at end of file From 36a36377b4b80cee6daf630f0f6acc51bab1f6f5 Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Sat, 1 Feb 2025 16:57:21 -0800 Subject: [PATCH 2/5] fix: updated authors list in pyproject.toml with all team members (#75) --- pyproject.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fae4508..be49c52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,11 @@ name = "dsci524_group29_webscraping" version = "1.1.3" description = "A simple Python toolkit for web scraping" -authors = ["group29"] +authors = [ + "Lixuan Lin", + "Hui Tang", + "Sienko Ikhabi" +] license = "MIT" readme = "README.md" @@ -30,4 +34,4 @@ build_command = "pip install poetry && poetry build" # build dists [build-system] requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" +build-backend = "poetry.core.masonry.api" \ No newline at end of file From 48bc46852d8cd9d1dba39d71181f91d74a52701a Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Sat, 1 Feb 2025 17:13:10 -0800 Subject: [PATCH 3/5] fix: added usage instructions to README.md (#69) --- README.md | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f31f01..9737eaa 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,43 @@ $ pip install dsci524_group29_webscraping - `parse_content(html, selector, 
selector_type)`: Parses the provided HTML content using CSS selectors or XPath to extract specified data. - `save_data(data, format, destination)`: Saves the extracted data into the desired format (e.g., TXT, CSV, JSON) at the specified destination path. +## Usage + +Below are examples demonstrating how to use the main functions in this package: + +### 1. Fetch HTML Content +```python +from dsci524_group29_webscraping import fetch_html + +# Fetch the raw HTML content from a webpage +url = "https://example.com" +html_content = fetch_html(url) +print(html_content) # Outputs the HTML content of the page +``` + +### 2. Parse Content +```python +from dsci524_group29_webscraping import parse_content + +# Parse the HTML content to extract specific elements +selector = "h1" # Example: extract all <h1>
elements +selector_type = "css" # Use CSS selectors +extracted_data = parse_content(html_content, selector, selector_type) +print(extracted_data) # Outputs a list of the extracted data +``` + +### 3. Save Data +```python +from dsci524_group29_webscraping import save_data + +# Save the extracted data to a CSV file +data = [{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}] # Example data +file_path = save_data(data, format="csv", destination="output.csv") +print(f"Data saved to: {file_path}") +``` + +This package simplifies the process of fetching, parsing, and saving web data, making it ideal for beginners. + ## Python Ecosystem While libraries like [`BeautifulSoup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) @@ -35,7 +72,6 @@ it accessible for quick tasks and educational purposes. *dsci524_group29_webscraping* differentiates itself by offering a simple set of functions that do the job for simple, beginner level needs. - ## Contributors - Lixuan Lin From d7f1515254f1fa6cabf7583487351d1acda25157 Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Sat, 1 Feb 2025 17:16:05 -0800 Subject: [PATCH 4/5] fix: updated LICENSE to list all authors (#75) --- LICENSE | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/LICENSE b/LICENSE index 45183cb..6254a4f 100755 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025, group29 +Copyright (c) 2025, Lixuan Lin, Hui Tang, Sienko Ikhabi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,5 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - +SOFTWARE. 
\ No newline at end of file From 4f6e4b351a1d29c3a1625388caf2964afb2bd7ca Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Sat, 1 Feb 2025 17:25:30 -0800 Subject: [PATCH 5/5] fix: refactored save_data.py to improve readability and comments (#81) --- src/dsci524_group29_webscraping/save_data.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/dsci524_group29_webscraping/save_data.py b/src/dsci524_group29_webscraping/save_data.py index 1f49bb0..b4ee5f6 100644 --- a/src/dsci524_group29_webscraping/save_data.py +++ b/src/dsci524_group29_webscraping/save_data.py @@ -42,10 +42,9 @@ def save_data(data, format='csv', destination='output.csv'): - If the specified directory in the destination does not exist, a FileNotFoundError will be raised. """ # Validate the destination directory - # Check if the directory in the destination path exists dir_path = os.path.dirname(destination) if dir_path and not os.path.exists(dir_path): - # Raise an error if the directory does not exist + # Ensure the directory exists before attempting to save raise FileNotFoundError(f"The directory {dir_path} does not exist.") # Save data in CSV format @@ -54,14 +53,13 @@ def save_data(data, format='csv', destination='output.csv'): if not isinstance(data, list) or not all(isinstance(item, dict) for item in data): raise ValueError("For CSV, data must be a list of dictionaries.") try: - # Open the destination file in write mode with open(destination, mode='w', newline='') as file: - # Create a CSV writer object + # Write the data to the CSV file writer = csv.DictWriter(file, fieldnames=data[0].keys()) writer.writeheader() # Write the header row writer.writerows(data) # Write the data rows except Exception as e: - # Raise an error if CSV saving fails + # Handle unexpected issues when saving the CSV file raise Exception(f"Failed to save CSV data: {e}") # Save data in JSON format @@ -70,16 +68,15 @@ def save_data(data, format='csv', destination='output.csv'): if not 
isinstance(data, (list, dict)): raise ValueError("For JSON, data must be a list or a dictionary.") try: - # Open the destination file in write mode with open(destination, mode='w') as file: # Write the JSON data to the file with indentation for readability json.dump(data, file, indent=4) except Exception as e: - # Raise an error if JSON saving fails + # Handle unexpected issues when saving the JSON file raise Exception(f"Failed to save JSON data: {e}") - # Raise an error if the specified format is unsupported else: + # Raise an error for unsupported formats raise ValueError("Unsupported format. Use 'csv' or 'json'.") # Return the absolute path to the saved file