Added Function for visualization

UBC-MDS · Jan 9, 2025 · a1a0847 · a1a0847
1 parent c70ad21
commit a1a0847
Showing 1 changed file with 64 additions and 0 deletions.
diff --git a/src/dataprofiler/dataprofiler.py b/src/dataprofiler/dataprofiler.py
@@ -48,3 +48,67 @@ def detect_anomalies(df):
     """
     pass
 
+def plotify(df):
+    """
+    Visualize a DataFrame by generating appropriate plots based on column datatypes.
+
+    This function analyzes the datatypes of the columns in a DataFrame and generates 
+    visualizations that are appropriate for the column types. It supports the following:
+    - Numeric vs Numeric: Visualizations such as scatter plots, pair plots, or correlation matrices.
+    - Numeric vs Categorical: Visualizations like box plots, violin plots, or bar charts.
+    - Categorical vs Categorical: Visualizations such as stacked bar charts or mosaic plots.
+    - Binary Columns: Treated as a special case of categorical data.
+    - Text Columns: Generates a word cloud for columns containing textual data.
+
+    The function distinguishes between truly numeric columns and those that are numeric 
+    representations of categories by analyzing their unique values and distributions.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame containing the data to be visualized. It should contain 
+        columns with data types that include numeric, categorical, binary, or textual.
+
+    Returns
+    -------
+    None
+        The function generates and displays plots directly. It does not return any value.
+
+    Raises
+    ------
+    ValueError
+        If the input is not a pandas DataFrame or if the DataFrame is empty.
+
+    Notes
+    -----
+    - Binary columns are considered categorical.
+    - Categorical columns are identified based on data types or unique value counts.
+    - Textual data is identified by the object or string dtype.
+    - Numeric data that appears to represent categories (e.g., zip codes or encoded values) 
+      is detected and treated as categorical based on the number of unique values relative 
+      to the size of the column.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> from dataprofiler import plotify
+
+    # Example DataFrame
+    >>> data = {
+    ...     'Age': [25, 30, 35, 40, 45],
+    ...     'Salary': [50000, 60000, 70000, 80000, 90000],
+    ...     'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
+    ...     'ZipCode': [12345, 12345, 67890, 67890, 67890],
+    ...     'Comments': ['Great product', 'Good service', 'Average experience', 'Excellent quality', 'Poor support']
+    ... }
+    >>> df = pd.DataFrame(data)
+    >>> plotify(df)
+
+    This will generate:
+    - A scatter plot for 'Age' vs 'Salary'.
+    - A box plot for 'Salary' vs 'Gender'.
+    - A bar plot for the 'ZipCode' column (treated as categorical).
+    - A word cloud for the 'Comments' column, highlighting words such as 'Great', 'Good', 'Average', 'Excellent', and 'Poor'.
+    """
+    pass