Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up calculation of the QualityReport #723

Merged
merged 9 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from sdmetrics.reports.utils import PlotConfig
from sdmetrics.utils import is_datetime

DEFAULT_NUM_ROWS_SUBSAMPLE = 50000


class ColumnPairTrends(BaseSingleTableProperty):
"""Column pair trends property.
Expand Down Expand Up @@ -267,7 +269,7 @@ def _generate_details(

continue

columns_real, columns_synthetic, metric = self._get_columns_data_and_metric(
col_real, col_synthetic, metric = self._get_columns_data_and_metric(
column_name_1,
column_name_2,
processed_real_data,
Expand All @@ -277,6 +279,12 @@ def _generate_details(
metadata,
)

metric_params = {}
if (metric == ContingencySimilarity) and (
max(len(col_real), len(col_synthetic)) > DEFAULT_NUM_ROWS_SUBSAMPLE
):
metric_params['num_rows_subsample'] = DEFAULT_NUM_ROWS_SUBSAMPLE

try:
error = self._preprocessing_failed(
column_name_1, column_name_2, sdtype_col_1, sdtype_col_2
Expand All @@ -285,7 +293,7 @@ def _generate_details(
raise Exception('Preprocessing failed')

score_breakdown = metric.compute_breakdown(
real_data=columns_real, synthetic_data=columns_synthetic
real_data=col_real, synthetic_data=col_synthetic, **metric_params
)
pair_score = score_breakdown['score']
if metric.__name__ == 'CorrelationSimilarity':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def test_with_num_rows_subsample():
Here the `real_data` and `synthetic_data` have 218 rows.
"""
# Setup
np.random.seed(42)
real_data, synthetic_data, _ = load_demo('single_table')
real_data = real_data[['degree_type', 'high_spec']]
synthetic_data = synthetic_data[['degree_type', 'high_spec']]
Expand Down
36 changes: 36 additions & 0 deletions tests/integration/reports/multi_table/test_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,39 @@ def test_quality_report_with_no_relationships():
properties = report.get_properties()
pd.testing.assert_frame_equal(properties, expected_properties)
assert score == 0.6271818780763356


def test_with_large_dataset():
    """Check quality-report determinism on a large multi-table dataset.

    Each table of the demo dataset has 10 rows, so every table is tiled
    10000 times to push the report into its row-subsampling code path.
    The subsampled scores must stay reproducible across reports and runs.
    """
    # Setup
    np.random.seed(42)
    real_data, synthetic_data, metadata = load_demo(modality='multi_table')
    for table_name in ('users', 'transactions'):
        real_data[table_name] = pd.concat([real_data[table_name]] * 10000, ignore_index=True)
        synthetic_data[table_name] = pd.concat(
            [synthetic_data[table_name]] * 10000, ignore_index=True
        )

    first_report = QualityReport()
    second_report = QualityReport()

    # Run
    first_report.generate(real_data, synthetic_data, metadata, verbose=False)
    score_1_run_1 = first_report.get_score()
    first_report.generate(real_data, synthetic_data, metadata, verbose=False)
    score_1_run_2 = first_report.get_score()
    second_report.generate(real_data, synthetic_data, metadata, verbose=False)

    # Assert
    properties_1 = first_report.get_properties()
    properties_2 = second_report.get_properties()
    cpt_report_1 = properties_1.iloc[1]['Score']
    cpt_report_2 = properties_2.iloc[1]['Score']
    intertable_trends_1 = properties_1.iloc[3]['Score']
    intertable_trends_2 = properties_2.iloc[3]['Score']
    # Subsampling is random, so repeated runs differ slightly but stay close.
    assert score_1_run_1 != score_1_run_2
    assert np.isclose(score_1_run_1, score_1_run_2, atol=0.001)
    assert np.isclose(second_report.get_score(), score_1_run_1, atol=0.001)
    assert np.isclose(cpt_report_1, cpt_report_2, atol=0.001)
    assert np.isclose(intertable_trends_1, intertable_trends_2, atol=0.001)
29 changes: 29 additions & 0 deletions tests/integration/reports/single_table/test_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,35 @@ def test_report_end_to_end(self):
assert report_info['num_rows_synthetic_data'] == 215
assert report_info['generation_time'] <= generate_end_time - generate_start_time

def test_with_large_dataset(self):
    """Check quality-report determinism on a dataset with more than 50000 rows.

    The demo ``real_data`` and ``synthetic_data`` have 215 rows each,
    so both are tiled 1000 times to exceed the subsampling threshold.
    """
    # Setup
    real_data, synthetic_data, metadata = load_demo(modality='single_table')
    real_data = pd.concat([real_data] * 1000, ignore_index=True)
    synthetic_data = pd.concat([synthetic_data] * 1000, ignore_index=True)

    first_report = QualityReport()
    second_report = QualityReport()

    # Run
    first_report.generate(real_data, synthetic_data, metadata, verbose=False)
    score_1_run_1 = first_report.get_score()
    first_report.generate(real_data, synthetic_data, metadata, verbose=False)
    score_1_run_2 = first_report.get_score()
    second_report.generate(real_data, synthetic_data, metadata, verbose=False)

    # Assert
    cpt_report_1 = first_report.get_properties().iloc[1]['Score']
    cpt_report_2 = second_report.get_properties().iloc[1]['Score']
    # Subsampling is random, so repeated runs differ slightly but stay close.
    assert score_1_run_1 != score_1_run_2
    assert np.isclose(score_1_run_1, score_1_run_2, atol=0.001)
    assert np.isclose(second_report.get_score(), score_1_run_1, atol=0.001)
    assert np.isclose(cpt_report_1, cpt_report_2, atol=0.001)

def test_quality_report_with_object_datetimes(self):
"""Test the quality report with object datetimes.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def test__get_processed_data_with_nans(self):
pd.testing.assert_frame_equal(processed_data, expected_processed_data)
pd.testing.assert_frame_equal(discrete_data, expected_discrete_data)

def test_get_columns_data_and_metric(self):
def test__get_columns_data_and_metric(self):
"""Test the ``_get_columns_data_and_metric`` method.

The method should return the correct data for each combination of column types.
Expand Down Expand Up @@ -341,9 +341,43 @@ def test__generate_details(self, correlation_compute_mock, contingency_compute_m
]
for idx, call1 in enumerate(contingency_compute_mock.call_args_list):
_, contingency_kwargs = call1
assert contingency_kwargs.keys() == {'real_data', 'synthetic_data'}
assert contingency_kwargs['real_data'].equals(expected_real_data[idx])
assert contingency_kwargs['synthetic_data'].equals(expected_synthetic_data[idx])

@patch(
    'sdmetrics.reports.single_table._properties.column_pair_trends.'
    'ContingencySimilarity.compute_breakdown'
)
def test__generate_details_large_dataset(self, contingency_compute_mock):
    """Test the ``_generate_details`` for data with more than 50000 rows."""
    # Setup: 60000 rows, above the 50000-row subsampling threshold.
    num_repeats = 20000
    real_data = pd.DataFrame({
        'col1': ['a', 'b', 'c'] * num_repeats,
        'col2': [False, True, True] * num_repeats,
    })
    synthetic_data = pd.DataFrame({
        'col1': ['c', 'a', 'b'] * num_repeats,
        'col2': [False, False, True] * num_repeats,
    })
    metadata = {
        'columns': {
            'col1': {'sdtype': 'categorical'},
            'col2': {'sdtype': 'boolean'},
        }
    }
    cpt_property = ColumnPairTrends()

    # Run
    cpt_property._generate_details(real_data, synthetic_data, metadata, None)

    # Assert: full data is forwarded, with subsampling requested via kwarg.
    _, call_kwargs = contingency_compute_mock.call_args_list[0]
    pd.testing.assert_frame_equal(call_kwargs['real_data'], real_data)
    pd.testing.assert_frame_equal(call_kwargs['synthetic_data'], synthetic_data)
    assert call_kwargs['num_rows_subsample'] == 50000

def test__get_correlation_matrix_score(self):
"""Test the ``_get_correlation_matrix`` method to generate the ``Score`` heatmap."""
# Setup
Expand Down
Loading