diff --git a/examples/basic/verifying_pfd.py b/examples/basic/verifying_pfd.py
new file mode 100644
index 0000000000..e4c227038a
--- /dev/null
+++ b/examples/basic/verifying_pfd.py
@@ -0,0 +1,66 @@
+import desbordante
+import pandas as pd
+
+ERROR = 0.3
+PER_TUPLE = 'per_tuple'
+PER_VALUE = 'per_value'
+TABLE = 'examples/datasets/glitchy_sensor_2.csv'
+
+
+def print_results(verifier):
+    """Report whether the PFD holds within ERROR and show its violations.
+
+    `verifier` is expected to have been executed already; only get_error(),
+    get_num_violating_rows(), get_num_violating_clusters() and
+    get_violating_clusters() are called on it.
+    """
+    error = verifier.get_error()
+    if error <= ERROR:
+        print('PFD holds')
+    else:
+        print(f'PFD with error {ERROR} does not hold')
+        print(f'But it holds with error {error}')
+    print()
+    print('Additional info:')
+    print(f'Number of rows violating PFD: {verifier.get_num_violating_rows()}')
+    print(f'Number of clusters violating PFD: {verifier.get_num_violating_clusters()}')
+    print()
+
+    table = pd.read_csv(TABLE)
+    number_names = ['First', 'Second', 'Third']
+    for position, violating_cluster in enumerate(verifier.get_violating_clusters()):
+        # Fall back to a numeric label so that more than three violating
+        # clusters cannot raise IndexError.
+        if position < len(number_names):
+            label = number_names[position]
+        else:
+            label = f'#{position + 1}'
+        print(f'{label} violating cluster:')
+        # Cluster members are 0-based row numbers of the table; sorting the
+        # unique members keeps the rows in table order.
+        print(table.iloc[sorted(set(violating_cluster))])
+        print()
+
+
+# Loading input data
+algo = desbordante.pfd_verification.algorithms.Default()
+algo.load_data(table=(TABLE, ',', True))
+
+# Print dataset
+print(f'Dataset: {TABLE}')
+print(pd.read_csv(TABLE))
+print()
+
+# Checking whether PFD (DeviceId) -> (Data) holds for PerValue measure
+algo.execute(lhs_indices=[1], rhs_indices=[2], error=ERROR, pfd_error_measure=PER_VALUE)
+print('-' * 80)
+print(f'Checking whether PFD (DeviceId) -> (Data) holds for {PER_VALUE} error measure')
+print('-' * 80)
+print_results(algo)
+
+# Checking whether the same PFD holds for PerTuple measure
+algo.execute(lhs_indices=[1], rhs_indices=[2], error=ERROR, pfd_error_measure=PER_TUPLE)
+print('-' * 80)
+print(f'Checking whether the same PFD holds for {PER_TUPLE} error measure:')
+print('-' * 80)
+print_results(algo)
diff --git a/examples/datasets/glitchy_sensor_2.csv b/examples/datasets/glitchy_sensor_2.csv
new file mode 100644
index 0000000000..0fae50533d
--- /dev/null
+++ b/examples/datasets/glitchy_sensor_2.csv
@@ -0,0 +1,17 @@
+Id,DeviceId,Data
+1,D-1,1001
+2,D-1,1002
+3,D-1,1003
+4,D-1,1004
+5,D-1,1005
+6,D-1,1006
+7,D-2,1000
+8,D-2,1001
+9,D-2,1000
+10,D-3,1010
+11,D-4,1011
+12,D-4,1011
+13,D-5,1015
+14,D-5,1014
+15,D-5,1015
+16,D-5,1015
\ No newline at end of file
diff --git a/src/core/algorithms/fd/pfd_verifier/pfd_stats_calculator.h b/src/core/algorithms/fd/pfd_verifier/pfd_stats_calculator.h
index 6ea27d6799..2f086b1497 100644
--- a/src/core/algorithms/fd/pfd_verifier/pfd_stats_calculator.h
+++ b/src/core/algorithms/fd/pfd_verifier/pfd_stats_calculator.h
@@ -11,8 +11,7 @@ namespace algos {
 class PFDStatsCalculator {
 private:
     std::shared_ptr relation_;
-    config::ErrorType max_fd_error_;
-    config::ErrorMeasureType error_measure_;
+    config::PfdErrorMeasureType error_measure_;
 
     std::vector clusters_violating_pfd_;
     size_t num_rows_violating_pfd_ = 0;
@@ -20,8 +19,8 @@ class PFDStatsCalculator {
 
 public:
     explicit PFDStatsCalculator(std::shared_ptr relation,
-                                config::ErrorMeasureType measure, config::ErrorType max_fd_error)
-        : relation_(std::move(relation)), max_fd_error_(max_fd_error), error_measure_(measure) {}
+                                config::PfdErrorMeasureType measure)
+        : relation_(std::move(relation)), error_measure_(measure) {}
 
     void ResetState() {
         clusters_violating_pfd_.clear();
@@ -29,10 +28,6 @@ class PFDStatsCalculator {
         error_ = 0;
     }
 
-    bool PFDHolds() const {
-        return error_ <= max_fd_error_;
-    }
-
     size_t GetNumViolatingClusters() const {
         return clusters_violating_pfd_.size();
     }
@@ -79,7 +74,7 @@ class PFDStatsCalculator {
                 clusters_violating_pfd_.push_back(x_cluster);
             }
             num_rows_violating_pfd_ += x_cluster_size - max;
-            sum += error_measure_ == +ErrorMeasure::per_tuple
+            sum += error_measure_ == +PfdErrorMeasure::per_tuple
                            ?
static_cast(max) : static_cast(max) / x_cluster_size; cluster_rows_count += x_cluster.size(); @@ -87,9 +82,9 @@ class PFDStatsCalculator { unsigned int unique_rows = static_cast(x_pli->GetRelationSize() - cluster_rows_count); double probability = - static_cast(sum + unique_rows) / (error_measure_ == +ErrorMeasure::per_tuple - ? x_pli->GetRelationSize() - : x_index.size() + unique_rows); + static_cast(sum + unique_rows) / + (error_measure_ == +PfdErrorMeasure::per_tuple ? x_pli->GetRelationSize() + : x_index.size() + unique_rows); error_ = 1.0 - probability; } }; diff --git a/src/core/algorithms/fd/pfd_verifier/pfd_verifier.cpp b/src/core/algorithms/fd/pfd_verifier/pfd_verifier.cpp index 8f7a21e003..a1fc2ce1f2 100644 --- a/src/core/algorithms/fd/pfd_verifier/pfd_verifier.cpp +++ b/src/core/algorithms/fd/pfd_verifier/pfd_verifier.cpp @@ -7,7 +7,6 @@ #include "config/names.h" #include "config/tabular_data/input_table/option.h" #include "equal_nulls/option.h" -#include "error/option.h" #include "error_measure/option.h" #include "indices/option.h" @@ -19,13 +18,12 @@ void PFDVerifier::RegisterOptions() { RegisterOption(config::kEqualNullsOpt(&is_null_equal_null_)); RegisterOption(config::kLhsIndicesOpt(&lhs_indices_, get_schema_cols)); RegisterOption(config::kRhsIndicesOpt(&rhs_indices_, get_schema_cols)); - RegisterOption(config::kErrorMeasureOpt(&error_measure_)); - RegisterOption(config::kErrorOpt(&max_fd_error_)); + RegisterOption(config::kPfdErrorMeasureOpt(&error_measure_)); } void PFDVerifier::MakeExecuteOptsAvailable() { using namespace config::names; - MakeOptionsAvailable({kLhsIndices, kRhsIndices, kErrorMeasure, kError}); + MakeOptionsAvailable({kLhsIndices, kRhsIndices, kPfdErrorMeasure}); } void PFDVerifier::LoadDataInternal() { @@ -37,8 +35,7 @@ void PFDVerifier::LoadDataInternal() { unsigned long long PFDVerifier::ExecuteInternal() { auto start_time = std::chrono::system_clock::now(); - stats_calculator_ = - std::make_unique(relation_, error_measure_, 
max_fd_error_); + stats_calculator_ = std::make_unique(relation_, error_measure_); VerifyPFD(); auto elapsed_milliseconds = std::chrono::duration_cast( std::chrono::system_clock::now() - start_time); diff --git a/src/core/algorithms/fd/pfd_verifier/pfd_verifier.h b/src/core/algorithms/fd/pfd_verifier/pfd_verifier.h index 678c6a1a23..89c54aaf15 100644 --- a/src/core/algorithms/fd/pfd_verifier/pfd_verifier.h +++ b/src/core/algorithms/fd/pfd_verifier/pfd_verifier.h @@ -21,8 +21,7 @@ class PFDVerifier : public Algorithm { config::IndicesType lhs_indices_; config::IndicesType rhs_indices_; config::EqNullsType is_null_equal_null_; - config::ErrorType max_fd_error_; - config::ErrorMeasureType error_measure_ = +ErrorMeasure::per_tuple; + config::PfdErrorMeasureType error_measure_ = +PfdErrorMeasure::per_tuple; std::shared_ptr relation_; std::unique_ptr stats_calculator_; @@ -41,11 +40,6 @@ class PFDVerifier : public Algorithm { std::shared_ptr CalculatePLI(config::IndicesType const& indices) const; public: - bool PFDHolds() const { - assert(stats_calculator_); - return stats_calculator_->PFDHolds(); - } - size_t GetNumViolatingClusters() const { assert(stats_calculator_); return stats_calculator_->GetNumViolatingClusters(); diff --git a/src/python_bindings/bindings.cpp b/src/python_bindings/bindings.cpp index c7894bf32f..1aa0141d14 100644 --- a/src/python_bindings/bindings.cpp +++ b/src/python_bindings/bindings.cpp @@ -20,6 +20,7 @@ #include "nd/bind_nd.h" #include "nd/bind_nd_verification.h" #include "od/bind_od.h" +#include "pfd/bind_pfd_verification.h" #include "sfd/bind_sfd.h" #include "statistics/bind_statistics.h" #include "ucc/bind_ucc.h" @@ -60,7 +61,8 @@ PYBIND11_MODULE(desbordante, module, pybind11::mod_gil_not_used()) { BindNdVerification, BindSFD, BindMd, - BindDCVerification}) { + BindDCVerification, + BindPfdVerification}) { bind_func(module); } } diff --git a/src/python_bindings/pfd/bind_pfd_verification.cpp 
b/src/python_bindings/pfd/bind_pfd_verification.cpp new file mode 100644 index 0000000000..edd5ffea6d --- /dev/null +++ b/src/python_bindings/pfd/bind_pfd_verification.cpp @@ -0,0 +1,25 @@ +#include "bind_pfd_verification.h" + +#include +#include + +#include "algorithms/fd/pfd_verifier/pfd_verifier.h" +#include "py_util/bind_primitive.h" + +namespace { +namespace py = pybind11; +} // namespace + +namespace python_bindings { +void BindPfdVerification(py::module_& main_module) { + using namespace algos; + auto pfd_verification_module = main_module.def_submodule("pfd_verification"); + + BindPrimitiveNoBase(pfd_verification_module, "PFDVerifier") + .def("get_num_violating_clusters", &PFDVerifier::GetNumViolatingClusters) + .def("get_num_violating_rows", &PFDVerifier::GetNumViolatingRows) + .def("get_violating_clusters", &PFDVerifier::GetViolatingClusters) + .def("get_error", &PFDVerifier::GetError); + main_module.attr("pfd_verification") = pfd_verification_module; +} +} // namespace python_bindings diff --git a/src/python_bindings/pfd/bind_pfd_verification.h b/src/python_bindings/pfd/bind_pfd_verification.h new file mode 100644 index 0000000000..8f78b12f47 --- /dev/null +++ b/src/python_bindings/pfd/bind_pfd_verification.h @@ -0,0 +1,7 @@ +#pragma once + +#include + +namespace python_bindings { +void BindPfdVerification(pybind11::module_& main_module); +} // namespace python_bindings diff --git a/src/tests/test_pfd_verifier.cpp b/src/tests/test_pfd_verifier.cpp index 1353de46eb..e1d26daf6f 100644 --- a/src/tests/test_pfd_verifier.cpp +++ b/src/tests/test_pfd_verifier.cpp @@ -1,7 +1,7 @@ #include #include "algorithms/algo_factory.h" -#include "algorithms/fd/pfdtane/pfd_verifier/pfd_verifier.h" +#include "algorithms/fd/pfd_verifier/pfd_verifier.h" #include "all_csv_configs.h" #include "config/indices/type.h" #include "config/names.h" @@ -19,16 +19,17 @@ struct PFDVerifyingParams { std::vector const clusters_violating_pfd; PFDVerifyingParams(config::IndicesType 
lhs_indices, config::IndicesType rhs_indices, - config::ErrorMeasureType error_measure, config::ErrorType error, + config::PfdErrorMeasureType error_measure, config::ErrorType error, size_t num_violating_clusters, size_t num_violating_rows, std::vector clusters_violating_pfd, CSVConfig const& csv_config) - : params({{onam::kCsvConfig, csv_config}, + : params({ + {onam::kCsvConfig, csv_config}, {onam::kEqualNulls, true}, {onam::kLhsIndices, std::move(lhs_indices)}, {onam::kRhsIndices, std::move(rhs_indices)}, - {onam::kErrorMeasure, error_measure}, - {onam::kError, error}}), + {onam::kPfdErrorMeasure, error_measure}, + }), expected_error(error), num_violating_clusters(num_violating_clusters), num_violating_rows(num_violating_rows), @@ -44,7 +45,6 @@ TEST_P(TestPFDVerifying, DefaultTest) { auto verifier = algos::CreateAndLoadAlgorithm(p.params); double const eps = 0.0001; verifier->Execute(); - EXPECT_TRUE(verifier->PFDHolds()); EXPECT_NEAR(p.expected_error, verifier->GetError(), eps); EXPECT_EQ(p.num_violating_clusters, verifier->GetNumViolatingClusters()); EXPECT_EQ(p.num_violating_rows, verifier->GetNumViolatingRows()); @@ -53,21 +53,23 @@ TEST_P(TestPFDVerifying, DefaultTest) { INSTANTIATE_TEST_SUITE_P( PFDVerifierTestSuite, TestPFDVerifying, - ::testing::Values(PFDVerifyingParams({2}, {3}, +algos::ErrorMeasure::per_value, 0.0625, 1, - 1, {{0, 1}}, kTestFD), - PFDVerifyingParams({0, 1}, {4}, +algos::ErrorMeasure::per_value, 0.166667, - 2, 2, {{0, 1, 2}, {6, 7, 8}}, kTestFD), - PFDVerifyingParams({4}, {5}, +algos::ErrorMeasure::per_value, 0.3334, 4, - 4, {{0, 8}, {1, 2}, {3, 4, 5}, {9, 10, 11}}, kTestFD), - PFDVerifyingParams({5}, {1}, +algos::ErrorMeasure::per_value, 0.0, 0, 0, - {}, kTestFD), - PFDVerifyingParams({2}, {3}, +algos::ErrorMeasure::per_tuple, 0.0834, 1, - 1, {{0, 1}}, kTestFD), - PFDVerifyingParams({0, 1}, {4}, +algos::ErrorMeasure::per_tuple, 0.1667, - 2, 2, {{0, 1, 2}, {6, 7, 8}}, kTestFD), - PFDVerifyingParams({4}, {5}, 
+algos::ErrorMeasure::per_tuple, 0.3334, 4, - 4, {{0, 8}, {1, 2}, {3, 4, 5}, {9, 10, 11}}, kTestFD), - PFDVerifyingParams({5}, {1}, +algos::ErrorMeasure::per_tuple, 0.0, 0, 0, - {}, kTestFD))); + ::testing::Values(PFDVerifyingParams({2}, {3}, +algos::PfdErrorMeasure::per_value, 0.0625, + 1, 1, {{0, 1}}, kTestFD), + PFDVerifyingParams({0, 1}, {4}, +algos::PfdErrorMeasure::per_value, + 0.166667, 2, 2, {{0, 1, 2}, {6, 7, 8}}, kTestFD), + PFDVerifyingParams({4}, {5}, +algos::PfdErrorMeasure::per_value, 0.3334, + 4, 4, {{0, 8}, {1, 2}, {3, 4, 5}, {9, 10, 11}}, + kTestFD), + PFDVerifyingParams({5}, {1}, +algos::PfdErrorMeasure::per_value, 0.0, 0, + 0, {}, kTestFD), + PFDVerifyingParams({2}, {3}, +algos::PfdErrorMeasure::per_tuple, 0.0834, + 1, 1, {{0, 1}}, kTestFD), + PFDVerifyingParams({0, 1}, {4}, +algos::PfdErrorMeasure::per_tuple, + 0.1667, 2, 2, {{0, 1, 2}, {6, 7, 8}}, kTestFD), + PFDVerifyingParams({4}, {5}, +algos::PfdErrorMeasure::per_tuple, 0.3334, + 4, 4, {{0, 8}, {1, 2}, {3, 4, 5}, {9, 10, 11}}, + kTestFD), + PFDVerifyingParams({5}, {1}, +algos::PfdErrorMeasure::per_tuple, 0.0, 0, + 0, {}, kTestFD))); -} // namespace tests \ No newline at end of file +} // namespace tests