diff --git a/sdmetrics/reports/single_table/_properties/column_pair_trends.py b/sdmetrics/reports/single_table/_properties/column_pair_trends.py
index a30536a8..693e384a 100644
--- a/sdmetrics/reports/single_table/_properties/column_pair_trends.py
+++ b/sdmetrics/reports/single_table/_properties/column_pair_trends.py
@@ -10,6 +10,8 @@
 from sdmetrics.reports.utils import PlotConfig
 from sdmetrics.utils import is_datetime
 
+DEFAULT_NUM_ROWS_SUBSAMPLE = 50000
+
 
 class ColumnPairTrends(BaseSingleTableProperty):
     """Column pair trends property.
@@ -267,7 +269,7 @@ def _generate_details(
                     continue
 
-            columns_real, columns_synthetic, metric = self._get_columns_data_and_metric(
+            col_real, col_synthetic, metric = self._get_columns_data_and_metric(
                 column_name_1,
                 column_name_2,
                 processed_real_data,
@@ -277,6 +279,12 @@
                 metadata,
             )
 
+            metric_params = {}
+            if (metric == ContingencySimilarity) and (
+                max(len(col_real), len(col_synthetic)) > DEFAULT_NUM_ROWS_SUBSAMPLE
+            ):
+                metric_params['num_rows_subsample'] = DEFAULT_NUM_ROWS_SUBSAMPLE
+
             try:
                 error = self._preprocessing_failed(
                     column_name_1, column_name_2, sdtype_col_1, sdtype_col_2
@@ -285,7 +293,7 @@
                     raise Exception('Preprocessing failed')
 
                 score_breakdown = metric.compute_breakdown(
-                    real_data=columns_real, synthetic_data=columns_synthetic
+                    real_data=col_real, synthetic_data=col_synthetic, **metric_params
                 )
                 pair_score = score_breakdown['score']
                 if metric.__name__ == 'CorrelationSimilarity':
diff --git a/tests/integration/column_pairs/statistical/test_contingency_similarity.py b/tests/integration/column_pairs/statistical/test_contingency_similarity.py
index 6fce63d3..1a428eb8 100644
--- a/tests/integration/column_pairs/statistical/test_contingency_similarity.py
+++ b/tests/integration/column_pairs/statistical/test_contingency_similarity.py
@@ -10,6 +10,7 @@ def test_with_num_rows_subsample():
     Here the `real_data` and `synthetic_data` have 218 rows.
     """
     # Setup
+    np.random.seed(42)
     real_data, synthetic_data, _ = load_demo('single_table')
     real_data = real_data[['degree_type', 'high_spec']]
     synthetic_data = synthetic_data[['degree_type', 'high_spec']]
diff --git a/tests/integration/reports/multi_table/test_quality_report.py b/tests/integration/reports/multi_table/test_quality_report.py
index 8e372459..99e0d268 100644
--- a/tests/integration/reports/multi_table/test_quality_report.py
+++ b/tests/integration/reports/multi_table/test_quality_report.py
@@ -376,3 +376,39 @@ def test_quality_report_with_no_relationships():
     properties = report.get_properties()
     pd.testing.assert_frame_equal(properties, expected_properties)
     assert score == 0.6271818780763356
+
+
+def test_with_large_dataset():
+    """Test the quality report for large multi-table datasets.
+
+    The tables of the demo dataset have 10 rows each. We will replicate the rows 10000 times.
+    """
+    # Setup
+    np.random.seed(42)
+    real_data, synthetic_data, metadata = load_demo(modality='multi_table')
+    real_data['users'] = pd.concat([real_data['users']] * 10000, ignore_index=True)
+    synthetic_data['users'] = pd.concat([synthetic_data['users']] * 10000, ignore_index=True)
+    real_data['transactions'] = pd.concat([real_data['transactions']] * 10000, ignore_index=True)
+    synthetic_data['transactions'] = pd.concat(
+        [synthetic_data['transactions']] * 10000, ignore_index=True
+    )
+    report_1 = QualityReport()
+    report_2 = QualityReport()
+
+    # Run
+    report_1.generate(real_data, synthetic_data, metadata, verbose=False)
+    score_1_run_1 = report_1.get_score()
+    report_1.generate(real_data, synthetic_data, metadata, verbose=False)
+    score_1_run_2 = report_1.get_score()
+    report_2.generate(real_data, synthetic_data, metadata, verbose=False)
+
+    # Assert
+    cpt_report_1 = report_1.get_properties().iloc[1]['Score']
+    cpt_report_2 = report_2.get_properties().iloc[1]['Score']
+    intertable_trends_1 = report_1.get_properties().iloc[3]['Score']
+    intertable_trends_2 = report_2.get_properties().iloc[3]['Score']
+    assert score_1_run_1 != score_1_run_2
+    assert np.isclose(score_1_run_1, score_1_run_2, atol=0.001)
+    assert np.isclose(report_2.get_score(), score_1_run_1, atol=0.001)
+    assert np.isclose(cpt_report_1, cpt_report_2, atol=0.001)
+    assert np.isclose(intertable_trends_1, intertable_trends_2, atol=0.001)
diff --git a/tests/integration/reports/single_table/test_quality_report.py b/tests/integration/reports/single_table/test_quality_report.py
index 005edcc8..39b513bd 100644
--- a/tests/integration/reports/single_table/test_quality_report.py
+++ b/tests/integration/reports/single_table/test_quality_report.py
@@ -159,6 +159,35 @@ def test_report_end_to_end(self):
         assert report_info['num_rows_synthetic_data'] == 215
         assert report_info['generation_time'] <= generate_end_time - generate_start_time
 
+    def test_with_large_dataset(self):
+        """Test the quality report with a large dataset (>50000 rows).
+
+        The `real_data` and `synthetic_data` in the demo have 215 rows.
+        So we augment them to be larger than 50000 rows.
+        """
+        # Setup
+        real_data, synthetic_data, metadata = load_demo(modality='single_table')
+        real_data = pd.concat([real_data] * 1000, ignore_index=True)
+        synthetic_data = pd.concat([synthetic_data] * 1000, ignore_index=True)
+
+        report_1 = QualityReport()
+        report_2 = QualityReport()
+
+        # Run
+        report_1.generate(real_data, synthetic_data, metadata, verbose=False)
+        score_1_run_1 = report_1.get_score()
+        report_1.generate(real_data, synthetic_data, metadata, verbose=False)
+        score_1_run_2 = report_1.get_score()
+        report_2.generate(real_data, synthetic_data, metadata, verbose=False)
+
+        # Assert
+        cpt_report_1 = report_1.get_properties().iloc[1]['Score']
+        cpt_report_2 = report_2.get_properties().iloc[1]['Score']
+        assert score_1_run_1 != score_1_run_2
+        assert np.isclose(score_1_run_1, score_1_run_2, atol=0.001)
+        assert np.isclose(report_2.get_score(), score_1_run_1, atol=0.001)
+        assert np.isclose(cpt_report_1, cpt_report_2, atol=0.001)
+
     def test_quality_report_with_object_datetimes(self):
         """Test the quality report with object datetimes.
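
The subsampling gate added in `column_pair_trends.py` above can be exercised standalone. Below is a minimal sketch, assuming the public `sdmetrics.column_pairs.ContingencySimilarity` import and illustrative 60000-row frames; `DEFAULT_NUM_ROWS_SUBSAMPLE` mirrors the new module-level constant, and the column values are made up for the example:

```python
# Minimal sketch of the subsampling gate introduced in _generate_details.
import pandas as pd

from sdmetrics.column_pairs import ContingencySimilarity

DEFAULT_NUM_ROWS_SUBSAMPLE = 50000  # mirrors the new module-level constant

# Illustrative 60000-row frames, above the threshold.
real = pd.DataFrame({
    'col1': ['a', 'b', 'c'] * 20000,
    'col2': [False, True, True] * 20000,
})
synthetic = pd.DataFrame({
    'col1': ['c', 'a', 'b'] * 20000,
    'col2': [False, False, True] * 20000,
})

# Pass num_rows_subsample only when either input exceeds the threshold,
# so smaller datasets keep the exact, non-subsampled computation.
metric_params = {}
if max(len(real), len(synthetic)) > DEFAULT_NUM_ROWS_SUBSAMPLE:
    metric_params['num_rows_subsample'] = DEFAULT_NUM_ROWS_SUBSAMPLE

breakdown = ContingencySimilarity.compute_breakdown(
    real_data=real, synthetic_data=synthetic, **metric_params
)
print(breakdown['score'])  # a float in [0, 1]
```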
diff --git a/tests/unit/reports/single_table/_properties/test_column_pair_trends.py b/tests/unit/reports/single_table/_properties/test_column_pair_trends.py
index 8384ab87..6856ee2f 100644
--- a/tests/unit/reports/single_table/_properties/test_column_pair_trends.py
+++ b/tests/unit/reports/single_table/_properties/test_column_pair_trends.py
@@ -155,7 +155,7 @@ def test__get_processed_data_with_nans(self):
         pd.testing.assert_frame_equal(processed_data, expected_processed_data)
         pd.testing.assert_frame_equal(discrete_data, expected_discrete_data)
 
-    def test_get_columns_data_and_metric(self):
+    def test__get_columns_data_and_metric(self):
         """Test the ``_get_columns_data_and_metric`` method.
 
         The method should return the correct data for each combination of column types.
@@ -341,9 +341,43 @@ def test__generate_details(self, correlation_compute_mock, contingency_compute_m
         ]
         for idx, call1 in enumerate(contingency_compute_mock.call_args_list):
             _, contingency_kwargs = call1
+            assert contingency_kwargs.keys() == {'real_data', 'synthetic_data'}
             assert contingency_kwargs['real_data'].equals(expected_real_data[idx])
             assert contingency_kwargs['synthetic_data'].equals(expected_synthetic_data[idx])
 
+    @patch(
+        'sdmetrics.reports.single_table._properties.column_pair_trends.'
+        'ContingencySimilarity.compute_breakdown'
+    )
+    def test__generate_details_large_dataset(self, contingency_compute_mock):
+        """Test the ``_generate_details`` for data with more than 50000 rows."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': ['a', 'b', 'c'] * 20000,
+            'col2': [False, True, True] * 20000,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': ['c', 'a', 'b'] * 20000,
+            'col2': [False, False, True] * 20000,
+        })
+        metadata = {
+            'columns': {
+                'col1': {'sdtype': 'categorical'},
+                'col2': {'sdtype': 'boolean'},
+            }
+        }
+
+        cpt_property = ColumnPairTrends()
+
+        # Run
+        cpt_property._generate_details(real_data, synthetic_data, metadata, None)
+
+        # Assert
+        contingency_kwargs = contingency_compute_mock.call_args_list[0][1]
+        pd.testing.assert_frame_equal(contingency_kwargs['real_data'], real_data)
+        pd.testing.assert_frame_equal(contingency_kwargs['synthetic_data'], synthetic_data)
+        assert contingency_kwargs['num_rows_subsample'] == 50000
+
     def test__get_correlation_matrix_score(self):
         """Test the ``_get_correlation_matrix`` method to generate the ``Score`` heatmap."""
         # Setup
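
The seeding in the integration tests above implies the subsample is drawn from NumPy's global RNG; that is an assumption here rather than something the diff states outright. A short sketch of why those tests seed before each run and compare scores with `np.isclose` instead of exact equality:

```python
# Sketch: with the same global seed, the same rows are subsampled, so two runs
# produce identical scores; without reseeding, scores may differ slightly but
# stay close on large data. Assumption: subsampling uses NumPy's global RNG
# (implied by the tests' use of np.random.seed), and the frames are illustrative.
import numpy as np
import pandas as pd

from sdmetrics.column_pairs import ContingencySimilarity

real = pd.DataFrame({'a': ['x', 'y', 'z'] * 30000, 'b': [True, False, True] * 30000})
synthetic = pd.DataFrame({'a': ['z', 'x', 'y'] * 30000, 'b': [False, True, True] * 30000})

def score():
    return ContingencySimilarity.compute_breakdown(
        real_data=real, synthetic_data=synthetic, num_rows_subsample=50000
    )['score']

np.random.seed(42)
first = score()
np.random.seed(42)
second = score()
assert first == second  # same seed, same subsample, same score

third = score()  # no reseed: a different subsample is drawn
assert np.isclose(first, third, atol=0.001)  # close, mirroring the tests' tolerance
```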