def to_datetime(series: pd.Series) -> pd.Series:
    """Convert a series to datetimes, coercing unparseable values.

    ``errors="coerce"`` is always passed to ``pd.to_datetime``. When
    ``is_pandas_1()`` is false, ``format="mixed"`` is passed as well;
    otherwise no explicit format is given.

    Args:
        series: The series to convert.

    Returns:
        The series returned by ``pd.to_datetime``.
    """
    parse_options = {"errors": "coerce"}
    if not is_pandas_1():
        # NOTE(review): presumably "mixed" is only accepted by pandas >= 2,
        # hence the version guard - confirm against the pandas docs.
        parse_options["format"] = "mixed"
    return pd.to_datetime(series, **parse_options)
def test_decribe_series_type_schema(config, summarizer):
    """Test describe with invalid date types."""
    # NOTE(review): the first two dates are presumably outside the range a
    # pandas Timestamp can hold, which is why two invalid dates are expected.
    frame = pd.DataFrame(
        {
            "value": [1, 2, 3, 4],
            "date": ["0001-01-01", "9999-12-31", "2022-10-03", "2022-10-04"],
        }
    )
    typeset = ProfilingTypeSet(config, type_schema={"date": "datetime"})

    date_summary = describe(config, frame, summarizer, typeset).variables["date"]

    assert date_summary["type"] == "DateTime"
    assert date_summary["n_missing"] == 0
    assert date_summary["n_invalid_dates"] == 2
    assert date_summary["p_invalid_dates"] == 0.5