diff --git a/poetry.lock b/poetry.lock index 9a3463a..c22a793 100644 --- a/poetry.lock +++ b/poetry.lock @@ -26,6 +26,23 @@ dev = ["duckdb (>=1.0)", "geopandas", "hatch (>=1.13.0)", "ipython[kernel]", "mi doc = ["docutils", "jinja2", "myst-parser", "numpydoc", "pillow (>=9,<10)", "pydata-sphinx-theme (>=0.14.1)", "scipy", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinxext-altair"] save = ["vl-convert-python (>=1.7.0)"] +[[package]] +name = "asttokens" +version = "3.0.0" +description = "Annotate AST trees with source code positions" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2"}, + {file = "asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7"}, +] + +[package.extras] +astroid = ["astroid (>=2,<4)"] +test = ["astroid (>=2,<4)", "pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "attrs" version = "24.3.0" @@ -53,7 +70,7 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["dev"] +groups = ["main", "dev"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and sys_platform == \"win32\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, @@ -242,13 +259,26 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +optional = false +python-versions = ">=3.5" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] + [[package]] name = "exceptiongroup" version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["dev"] +groups = ["main", "dev"] markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, @@ -258,6 +288,22 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "executing" +version = "2.1.0" +description = "Get the currently executing AST node of a frame, and other information" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "executing-2.1.0-py2.py3-none-any.whl", hash = "sha256:8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf"}, + {file = "executing-2.1.0.tar.gz", hash = "sha256:8ea27ddd260da8150fa5a708269c4a10e76161e2496ec3e587da9e3c0fe4b9ab"}, +] + 
+[package.extras] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] + [[package]] name = "fonttools" version = "4.55.3" @@ -334,42 +380,79 @@ unicode = ["unicodedata2 (>=15.1.0)"] woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] [[package]] -name = "importlib-resources" -version = "6.5.2" -description = "Read resources from Python packages" +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" optional = false -python-versions = ">=3.9" +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "ipython" +version = "8.31.0" +description = "IPython: Productive Interactive Computing" +optional = false +python-versions = ">=3.10" groups = ["main"] -markers = "python_version < \"3.10\"" +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ - {file = "importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec"}, - {file = "importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c"}, + {file = "ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6"}, + {file = "ipython-8.31.0.tar.gz", hash = "sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b"}, ] [package.dependencies] -zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} +colorama = {version = "*", markers = "sys_platform == \"win32\""} +decorator = "*" +exceptiongroup = {version = "*", markers = "python_version < 
\"3.11\""} +jedi = ">=0.16" +matplotlib-inline = "*" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\""} +prompt_toolkit = ">=3.0.41,<3.1.0" +pygments = ">=2.4.0" +stack_data = "*" +traitlets = ">=5.13.0" +typing_extensions = {version = ">=4.6", markers = "python_version < \"3.12\""} [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"] -type = ["pytest-mypy"] +all = ["ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole]", "ipython[test,test-extra]"] +black = ["black"] +doc = ["docrepr", "exceptiongroup", "intersphinx_registry", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "tomli", "typing_extensions"] +kernel = ["ipykernel"] +matplotlib = ["matplotlib"] +nbconvert = ["nbconvert"] +nbformat = ["nbformat"] +notebook = ["ipywidgets", "notebook"] +parallel = ["ipyparallel"] +qtconsole = ["qtconsole"] +test = ["packaging", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"] +test-extra = ["curio", "ipython[test]", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "trio"] [[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" +name = "jedi" +version = "0.19.2" +description = "An autocompletion tool for Python that can be used for text editors." 
optional = false -python-versions = ">=3.7" -groups = ["dev"] +python-versions = ">=3.6" +groups = ["main"] markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, + {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, + {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, ] +[package.dependencies] +parso = ">=0.8.4,<0.9.0" + +[package.extras] +docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] +qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] +testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] + [[package]] name = "jinja2" version = "3.1.5" @@ -681,7 +764,6 @@ files = [ contourpy = ">=1.0.1" cycler = ">=0.10" fonttools = ">=4.22.0" -importlib-resources = {version = ">=3.2.0", markers = "python_version < \"3.10\""} kiwisolver = ">=1.3.1" numpy = ">=1.23" packaging = ">=20.0" @@ -692,6 +774,22 @@ python-dateutil = ">=2.7" [package.extras] dev = ["meson-python (>=0.13.1,<0.17.0)", "numpy (>=1.25)", "pybind11 (>=2.6,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"] 
+[[package]] +name = "matplotlib-inline" +version = "0.1.7" +description = "Inline Matplotlib backend for Jupyter" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, + {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, +] + +[package.dependencies] +traitlets = "*" + [[package]] name = "narwhals" version = "1.22.0" @@ -877,6 +975,39 @@ sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-d test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] xml = ["lxml (>=4.9.2)"] +[[package]] +name = "parso" +version = "0.8.4" +description = "A Python Parser" +optional = false +python-versions = ">=3.6" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, + {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, +] + +[package.extras] +qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] +testing = ["docopt", "pytest"] + +[[package]] +name = "pexpect" +version = "4.9.0" +description = "Pexpect allows easy control of interactive console applications." 
+optional = false +python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (sys_platform != \"win32\" and sys_platform != \"emscripten\")" +files = [ + {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, + {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + [[package]] name = "pillow" version = "11.1.0" @@ -984,6 +1115,67 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "prompt-toolkit" +version = "3.0.48" +description = "Library for building powerful interactive command lines in Python" +optional = false +python-versions = ">=3.7.0" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e"}, + {file = "prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90"}, +] + +[package.dependencies] +wcwidth = "*" + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (sys_platform != \"win32\" and sys_platform != \"emscripten\")" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +description = "Safely evaluate AST nodes without side effects" +optional = false +python-versions = "*" +groups = 
["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, + {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, +] + +[package.extras] +tests = ["pytest"] + +[[package]] +name = "pygments" +version = "2.19.1" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, + {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + [[package]] name = "pyparsing" version = "3.2.1" @@ -1218,6 +1410,27 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "stack-data" +version = "0.6.3" +description = "Extract data from python stack frames and tracebacks for informative displays" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, + {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, +] + +[package.dependencies] +asttokens = ">=2.1.0" +executing = ">=1.2.0" +pure-eval = "*" + +[package.extras] +tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] + [[package]] name = "tomli" version = "2.2.1" @@ -1261,6 +1474,23 @@ files = [ {file = 
"tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] +[[package]] +name = "traitlets" +version = "5.14.3" +description = "Traitlets Python configuration system" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, + {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, +] + +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] +test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1287,6 +1517,19 @@ files = [ {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, ] +[[package]] +name = "wcwidth" +version = "0.2.13" +description = "Measures the displayed width of unicode strings in a terminal" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, + {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, +] + [[package]] name = "wordcloud" version = "1.9.4" @@ -1375,28 +1618,7 @@ matplotlib = "*" numpy = ">=1.6.1" pillow = "*" -[[package]] -name = "zipp" -version = "3.21.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.9" -groups = ["main"] -markers = "python_version < \"3.10\"" -files = [ - {file = "zipp-3.21.0-py3-none-any.whl", hash = 
"sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, - {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, -] - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] -type = ["pytest-mypy"] - [metadata] lock-version = "2.1" -python-versions = ">=3.9" -content-hash = "59c35ae9a55fc7b3e34da397abd2fa9e106358d4dc7bee7af834e74abd0827eb" +python-versions = ">=3.10" +content-hash = "f700daabdf0f9af82ee428823d6ea5cace8eb6802252334cfe5c32c2df127d88" diff --git a/pyproject.toml b/pyproject.toml index ca593f6..bc79304 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,11 +7,12 @@ license = "MIT" readme = "README.md" [tool.poetry.dependencies] -python = ">=3.9" +python = ">=3.10" matplotlib = ">=3.4.3" pandas = "^2.2.3" altair = "^5.5.0" wordcloud = "^1.9.4" +ipython = "^8.31.0" [tool.poetry.group.dev.dependencies] pytest = "^8.3.4" diff --git a/src/dataprofiler/dataprofiler.py b/src/dataprofiler/dataprofiler.py index e0aa7fa..219ac37 100644 --- a/src/dataprofiler/dataprofiler.py +++ b/src/dataprofiler/dataprofiler.py @@ -1,8 +1,7 @@ import pandas as pd import numpy as np import altair as alt -from wordcloud import WordCloud -import matplotlib.pyplot as plt +from itertools import combinations def summarize_data(df): """ @@ -122,66 +121,125 @@ def detect_anomalies(df): return report -def plotify(df): +def plotify(df, plot_types=None): """ - Visualize a DataFrame by generating appropriate plots based on column datatypes. 
- - This function analyzes the datatypes of the columns in a DataFrame and generates - visualizations that are appropriate for the column types. It supports the following: - - Numeric: Visualizations like histograms and density plots - - Categorical: Visualization like bar chart and - - Numeric vs Numeric: Visualizations such as scatter plots, pair plots, or correlation matrices. - - Numeric vs Categorical: Visualizations like box plots, violin plots, or bar charts. - - Categorical vs Categorical: Visualizations such as stacked bar charts or mosaic plots. - - Binary Columns: Treated as a special case of categorical data. - - Text Columns: Generates a word cloud for columns containing textual data. + Visualize a DataFrame by generating specified plots based on column datatypes. Parameters ---------- df : pandas.DataFrame - The DataFrame containing the data to be visualized. It can contain - columns with data types that include numeric, categorical, binary, or textual. + The DataFrame containing the data to be visualized. + + plot_types : list of str, optional + A list of plot types to generate. Available options include: + - 'histogram' : Plot a histogram for numeric columns. + - 'density' : Plot a density plot for numeric columns. + - 'bar' : Plot a bar chart for categorical columns. + - 'scatter' : Plot scatter plots for pairwise numeric columns. + - 'correlation' : Plot a correlation heatmap for numeric columns. + - 'box' : Plot box plots for numeric vs categorical columns. + - 'stacked_bar' : Plot stacked bar charts for pairwise categorical columns. + If None, all plot types are generated by default. Returns ------- None - The function generates and displays plots directly. It does not return any value. - + Generates and displays specified plots based on the provided column types. + Raises ------ TypeError If the input is not a pandas DataFrame. ValueError - If the input is an empty Dataframe. + If the input DataFrame is empty. 
+ Notes ----- - Binary columns are considered categorical. - Categorical columns are identified based on data types. - Textual data is identified by the object or string dtype. - - Examples - -------- - >>> import pandas as pd - >>> import numpy as np - >>> from dataprofiler.dataprofiler import plotify - - # Example DataFrame - >>> data = { - ... 'Age': [25, 30, 35, 40, 45], - ... 'Salary': [50000, 60000, 70000, 80000, 90000], - ... 'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'], - ... 'Comments': ['Great product', 'Good service', 'Average experience', 'Excellent quality', 'Poor support'] - ... } - >>> df = pd.DataFrame(data) - >>> plotify(df) - - This will generate: - - A scatter plot for 'Age' vs 'Salary'. - - A box plot for 'Salary' vs 'Gender'. - - A word cloud for the 'Comments' column, highlighting words such as 'Great', 'Good', 'Average', 'Excellent', and 'Poor'. + - Numeric columns are those of any numeric dtype (e.g. 'int64', 'float64'). + - Categorical columns are those of types 'object', 'category', and 'bool'.
""" + # Validate input if not isinstance(df, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame.") if df.empty: - raise ValueError("DataFrame is empty.") - pass + raise ValueError("Input DataFrame is empty.") + + # Set default plot types if not specified + if plot_types is None: + plot_types = ['histogram', 'density', 'bar', 'scatter', 'correlation', 'box', 'stacked_bar'] + + # Analyze columns + numeric_cols = df.select_dtypes(include='number').columns.tolist() + categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist() + + # Individual column visualizations + if 'histogram' in plot_types or 'density' in plot_types: + for col in numeric_cols: + print(f"Visualizing numeric column: {col}") + if 'histogram' in plot_types: + hist_chart = alt.Chart(df).mark_bar().encode( + x=alt.X(col, bin=True, title=f"{col} (binned)"), + y=alt.Y('count()', title='Count') + ).properties(title=f"Histogram of {col}") + hist_chart.display() + if 'density' in plot_types: + density_chart = alt.Chart(df).transform_density( + col, as_=[col, 'density'] + ).mark_area(opacity=0.5).encode( + x=alt.X(col, title=col), + y=alt.Y('density:Q', title='Density') + ).properties(title=f"Density Plot of {col}") + density_chart.display() + + if 'bar' in plot_types: + for col in categorical_cols: + print(f"Visualizing categorical column: {col}") + bar_chart = alt.Chart(df).mark_bar().encode( + x=alt.X(col, title=col), + y=alt.Y('count()', title='Count') + ).properties(title=f"Bar Chart of {col}") + bar_chart.display() + + # Pairwise relationships + if 'scatter' in plot_types: + for col1, col2 in combinations(numeric_cols, 2): + print(f"Visualizing numeric vs numeric: {col1} vs {col2}") + scatter_chart = alt.Chart(df).mark_circle(size=60).encode( + x=alt.X(col1, title=col1), + y=alt.Y(col2, title=col2), + tooltip=[col1, col2] + ).properties(title=f"Scatter Plot: {col1} vs {col2}") + scatter_chart.display() + + if 'correlation' in plot_types and 
len(numeric_cols) > 1: + print("Visualizing correlation heatmap") + corr_matrix = df[numeric_cols].corr().stack().reset_index() + corr_matrix.columns = ['Variable 1', 'Variable 2', 'Correlation'] + heatmap = alt.Chart(corr_matrix).mark_rect().encode( + x=alt.X('Variable 1:N'), + y=alt.Y('Variable 2:N'), + color=alt.Color('Correlation:Q', scale=alt.Scale(scheme='viridis')), + tooltip=['Variable 1', 'Variable 2', 'Correlation'] + ).properties(title='Correlation Heatmap') + heatmap.display() + + if 'box' in plot_types: + for num_col in numeric_cols: + for cat_col in categorical_cols: + print(f"Visualizing numeric vs categorical: {num_col} vs {cat_col}") + box_plot = alt.Chart(df).mark_boxplot().encode( + x=alt.X(cat_col, title=cat_col), + y=alt.Y(num_col, title=num_col), + color=alt.Color(cat_col, legend=None) + ).properties(title=f"Box Plot: {num_col} vs {cat_col}") + box_plot.display() + + if 'stacked_bar' in plot_types: + for cat_col1, cat_col2 in combinations(categorical_cols, 2): + print(f"Visualizing categorical vs categorical: {cat_col1} vs {cat_col2}") + stacked_bar = alt.Chart(df).mark_bar().encode( + x=alt.X(cat_col1, title=cat_col1), + y=alt.Y('count()', title='Count'), + color=alt.Color(cat_col2, title=cat_col2) + ).properties(title=f"Stacked Bar Chart: {cat_col1} vs {cat_col2}") + stacked_bar.display() \ No newline at end of file diff --git a/tests/test_plotify.py b/tests/test_plotify.py index 363c4c5..f8f8359 100644 --- a/tests/test_plotify.py +++ b/tests/test_plotify.py @@ -2,22 +2,110 @@ import pytest import pandas as pd -def test_wrong_input(): - with pytest.raises(TypeError): - plotify(123) - plotify('abcd') +@pytest.fixture +def valid_df(): + """ + Fixture to create a sample DataFrame with both numeric and categorical columns. 
+ """ + data = { + 'age': [25, 30, 35, 40, 45], + 'income': [50000, 60000, 70000, 80000, 90000], + 'gender': ['M', 'F', 'M', 'F', 'M'], + 'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'] + } + return pd.DataFrame(data) + +def test_plotify_valid_df(valid_df): + """ + Test to verify that the plotify function generates all plots when no specific plot types are provided. + + Args: + valid_df (pd.DataFrame): Sample DataFrame containing numeric and categorical columns. + """ + plotify(valid_df, plot_types=None) # Test with all plot types -def test_empty_df(): - df = pd.DataFrame() +def test_plotify_empty_df(): + """ + Test to verify that the plotify function raises a ValueError when an empty DataFrame is passed. + + This tests the validation logic for empty DataFrames. + """ + empty_df = pd.DataFrame() with pytest.raises(ValueError): - plotify(df) + plotify(empty_df) + +def test_plotify_invalid_input(): + """ + Test to verify that the plotify function raises a TypeError when the input is not a pandas DataFrame. + + This tests the input type validation logic. + """ + invalid_input = [1, 2, 3, 4] + with pytest.raises(TypeError): + plotify(invalid_input) + +def test_plotify_specific_plots(valid_df): + """ + Test to verify that the plotify function generates only specified plot types. + + Args: + valid_df (pd.DataFrame): Sample DataFrame containing numeric and categorical columns. + """ + plotify(valid_df, plot_types=['histogram', 'scatter']) # Test with specific plot types + +def test_plotify_scatter_plot(valid_df): + """ + Test to verify that the plotify function generates scatter plots for numeric columns. + + Args: + valid_df (pd.DataFrame): Sample DataFrame containing numeric columns for pairwise scatter plotting. 
+ """ + plotify(valid_df, plot_types=['scatter']) -def test_valid_df(): +def test_plotify_correlation_heatmap(valid_df): + """ + Test to verify that the plotify function generates a correlation heatmap when there are multiple numeric columns. + + Args: + valid_df (pd.DataFrame): Sample DataFrame containing multiple numeric columns. + """ + plotify(valid_df, plot_types=['correlation']) + +def test_plotify_box_plot(valid_df): + """ + Test to verify that the plotify function generates box plots for numeric vs categorical columns. + + Args: + valid_df (pd.DataFrame): Sample DataFrame containing both numeric and categorical columns. + """ + plotify(valid_df, plot_types=['box']) + +def test_plotify_stacked_bar(valid_df): + """ + Test to verify that the plotify function generates stacked bar charts for pairwise categorical columns. + + Args: + valid_df (pd.DataFrame): Sample DataFrame containing categorical columns for pairwise stacked bar plotting. + """ + plotify(valid_df, plot_types=['stacked_bar']) + +def test_plotify_empty_plot_types(valid_df): + """ + Test to verify that the plotify function runs without error and generates no plots when the plot_types argument is an empty list. + + Args: + valid_df (pd.DataFrame): Sample DataFrame containing both numeric and categorical columns. + """ + plotify(valid_df, plot_types=[]) # Expect no plots: only plot_types=None selects the default plot types + +def test_plotify_missing_columns(): + """ + Test to verify that the plotify function handles cases where only numeric columns are present. + + This tests the scenario where only numeric columns are available for generating plots.
+ """ df = pd.DataFrame({ - 'Age': [25, 30, 35, 40, 45], - 'Salary': [50000, 60000, 70000, 80000, 90000], - 'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'], - 'Comments': ['Great service', 'Good experience', 'Average', 'Excellent quality', 'Poor support'] + 'age': [25, 30, 35, 40, 45], + 'income': [50000, 60000, 70000, 80000, 90000] }) - plotify(df) - + plotify(df, plot_types=['scatter', 'box']) # Test scatter plot and box plot