Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upload test summary file #18

Merged
merged 1 commit into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Interested in contributing? Check out the contributing guidelines. Please note t

## License

`dataprofiler` was created by DongchunChen, Ismail (Husain) Bhinderwala and Jingyuan Wang. It is licensed under the terms of the MIT license.
`dataprofiler` was created by Dongchun Chen, Ismail (Husain) Bhinderwala and Jingyuan Wang. It is licensed under the terms of the MIT license.

## Credits

Expand Down
32 changes: 31 additions & 1 deletion src/dataprofiler/dataprofiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,41 @@ def summarize_data(df):
A DataFrame where each row corresponds to a numeric column in the input DataFrame,
and the columns represent the calculated statistics: min, 25%, 50% (median), 75%, and max.

Raises
------
TypeError
If the input is not a pandas DataFrame.
ValueError
If the DataFrame is empty or contains no numeric columns.

Example
-------
>>> summarize_data(df)
"""
pass

# Check if input is a DataFrame
if not isinstance(df, pd.DataFrame):
raise TypeError("Input must be a pandas DataFrame.")

# Check if DataFrame is empty
if df.empty:
raise ValueError("The input DataFrame is empty.")

# Select numeric columns
numeric_cols = df.select_dtypes(include=['number'])

# Check if there are numeric columns
if numeric_cols.empty:
raise ValueError("The DataFrame contains no numeric columns.")

# Calculate summary statistics
summary = numeric_cols.describe(percentiles=[0.25, 0.5, 0.75]).T

# Select relevant statistics
summary = summary[['min', '25%', '50%', '75%', 'max']]

return summary


def detect_anomalies(df):
"""Detect anomalies in a dataframe, including missing values, outliers, and duplicates.
Expand Down
47 changes: 47 additions & 0 deletions tests/test_summarize_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest
import pandas as pd
from dataprofiler.dataprofiler import summarize_data

def test_summarize_data_normal():
    """Check the summary layout for a DataFrame with two numeric columns."""
    frame = pd.DataFrame({
        'A': [1, 2, 3, 4, 5],
        'B': [10, 20, 30, 40, 50],
    })
    summary = summarize_data(frame)

    # Collect any statistic that is absent so the assertion covers all five.
    missing = [
        stat
        for stat in ('min', '25%', '50%', '75%', 'max')
        if stat not in summary.columns
    ]
    assert not missing
    # One summary row is expected per numeric input column.
    assert summary.shape[0] == 2

def test_summarize_data_single_column():
    """Check shape and extreme values when only one numeric column is present."""
    summary = summarize_data(pd.DataFrame({'A': [1, 2, 3, 4, 5]}))

    # A single input column yields one row holding the five statistics.
    assert summary.shape == (1, 5)

    lowest = summary.loc['A', 'min']
    highest = summary.loc['A', 'max']
    assert lowest == 1
    assert highest == 5

def test_summarize_data_empty_dataframe():
    """An empty DataFrame must be rejected with a ValueError."""
    empty_frame = pd.DataFrame()
    expect_error = pytest.raises(
        ValueError,
        match="The input DataFrame is empty.",
    )
    with expect_error:
        summarize_data(empty_frame)

def test_summarize_data_no_numeric_columns():
    """A DataFrame made only of string columns must be rejected with a ValueError."""
    text_only = pd.DataFrame({
        'A': ['x', 'y', 'z'],
        'B': ['foo', 'bar', 'baz'],
    })
    expect_error = pytest.raises(
        ValueError,
        match="The DataFrame contains no numeric columns.",
    )
    with expect_error:
        summarize_data(text_only)

def test_summarize_data_invalid_input():
    """Non-DataFrame arguments must be rejected with a TypeError."""
    # Checked in order: a list first, then a string.
    bad_inputs = ([1, 2, 3], "not a dataframe")
    for bad_input in bad_inputs:
        with pytest.raises(TypeError, match="Input must be a pandas DataFrame."):
            summarize_data(bad_input)
Loading