From 12ad75a3ea0cfb26d3a6f8c8e3130a286802b51e Mon Sep 17 00:00:00 2001 From: Nate Parsons <4307001+thehomebrewnerd@users.noreply.github.com> Date: Wed, 1 May 2024 15:53:37 -0500 Subject: [PATCH] Remove Dask and Spark DataFrame Support (#2705) * mass deletion * cleanup tests * fix * try to fix unit tests * fix ww main test ci yaml * more ci work * fix fixture * update release notes * update miniconda hash * more cleanup * docs cleanup * update release notes * revert 3.12 change * remove sql and update checker * fix test * try install test fix * remove dask references * remove sql from complete install due to psycopg2 issue * more install fixes * lint * fix complete install * remove dask_tokenize * remove outdated link * revert dask_tokenize change * remove isinstance checks * remove agg_type * remove use of cache for install test --- .github/workflows/build_docs.yaml | 1 - .github/workflows/install_test.yaml | 23 +- .../workflows/latest_dependency_checker.yaml | 2 +- ...oking_glass_airflow_performance_tests.yaml | 2 +- .../workflows/minimum_dependency_checker.yaml | 8 - .github/workflows/tests_with_latest_deps.yaml | 48 +- .../workflows/tests_with_minimum_deps.yaml | 57 +- .../tests_with_woodwork_main_branch.yaml | 28 +- Makefile | 2 +- README.md | 21 +- contributing.md | 26 +- .../getting_started/using_entitysets.ipynb | 12 +- .../getting_started/woodwork_types.ipynb | 2 +- docs/source/guides/guides_index.rst | 2 - docs/source/guides/performance.ipynb | 33 +- docs/source/guides/tuning_dfs.ipynb | 6 +- .../source/guides/using_dask_entitysets.ipynb | 280 --------- .../guides/using_spark_entitysets.ipynb | 277 --------- docs/source/install.md | 98 +-- docs/source/release_notes.rst | 11 +- .../frequently_asked_questions.ipynb | 35 +- .../resources/transition_to_ft_v1.0.ipynb | 2 +- .../calculate_feature_matrix.py | 124 +--- .../feature_set_calculator.py | 178 ++---- featuretools/computational_backends/utils.py | 25 +- featuretools/entityset/deserialize.py | 12 +- featuretools/entityset/entityset.py | 301 ++------- featuretools/entityset/serialize.py | 6 - featuretools/feature_base/feature_base.py | 10 - .../__init__.py | 0 .../primitives/base/primitive_base.py | 4 - .../standard/aggregation/all_primitive.py | 16 +- .../standard/aggregation/any_primitive.py | 16 +- .../standard/aggregation/avg_time_between.py | 3 +- .../primitives/standard/aggregation/count.py | 7 +- .../standard/aggregation/entropy.py | 3 +- .../primitives/standard/aggregation/first.py | 3 +- .../primitives/standard/aggregation/last.py | 3 +- .../standard/aggregation/max_primitive.py | 7 +- .../primitives/standard/aggregation/mean.py | 7 +- .../primitives/standard/aggregation/median.py | 3 +- .../standard/aggregation/min_primitive.py | 7 +- .../primitives/standard/aggregation/mode.py | 3 +- .../standard/aggregation/n_most_common.py | 3 +- .../standard/aggregation/num_true.py | 19 +- .../standard/aggregation/num_unique.py | 29 +- .../standard/aggregation/percent_true.py | 28 +- .../primitives/standard/aggregation/skew.py | 3 +- .../primitives/standard/aggregation/std.py | 7 +- .../standard/aggregation/sum_primitive.py | 7 +- .../standard/aggregation/time_since_first.py | 3 +- .../standard/aggregation/time_since_last.py | 3 +- .../primitives/standard/aggregation/trend.py | 3 +- .../standard/transform/binary/add_numeric.py | 3 +- .../transform/binary/add_numeric_scalar.py | 2 - .../transform/binary/and_primitive.py | 3 +- .../transform/binary/divide_by_feature.py | 2 - .../transform/binary/divide_numeric.py | 3 +- 
.../transform/binary/divide_numeric_scalar.py | 2 - .../standard/transform/binary/equal.py | 3 +- .../standard/transform/binary/equal_scalar.py | 2 - .../standard/transform/binary/greater_than.py | 2 - .../transform/binary/greater_than_equal_to.py | 3 +- .../binary/greater_than_equal_to_scalar.py | 2 - .../transform/binary/greater_than_scalar.py | 2 - .../standard/transform/binary/less_than.py | 3 +- .../transform/binary/less_than_equal_to.py | 3 +- .../binary/less_than_equal_to_scalar.py | 2 - .../transform/binary/less_than_scalar.py | 2 - .../transform/binary/modulo_by_feature.py | 2 - .../transform/binary/modulo_numeric.py | 3 +- .../transform/binary/modulo_numeric_scalar.py | 2 - .../transform/binary/multiply_boolean.py | 2 - .../transform/binary/multiply_numeric.py | 3 +- .../binary/multiply_numeric_boolean.py | 2 - .../binary/multiply_numeric_scalar.py | 2 - .../standard/transform/binary/not_equal.py | 2 - .../transform/binary/not_equal_scalar.py | 2 - .../standard/transform/binary/or_primitive.py | 3 +- .../binary/scalar_subtract_numeric_feature.py | 2 - .../transform/binary/subtract_numeric.py | 2 - .../binary/subtract_numeric_scalar.py | 2 - .../standard/transform/datetime/age.py | 2 - .../standard/transform/datetime/day.py | 3 +- .../transform/datetime/day_of_year.py | 3 +- .../transform/datetime/days_in_month.py | 3 +- .../standard/transform/datetime/hour.py | 3 +- .../transform/datetime/is_leap_year.py | 3 +- .../transform/datetime/is_lunch_time.py | 3 +- .../transform/datetime/is_month_end.py | 3 +- .../transform/datetime/is_month_start.py | 3 +- .../transform/datetime/is_quarter_end.py | 3 +- .../transform/datetime/is_quarter_start.py | 3 +- .../standard/transform/datetime/is_weekend.py | 3 +- .../transform/datetime/is_working_hours.py | 3 +- .../transform/datetime/is_year_end.py | 3 +- .../transform/datetime/is_year_start.py | 3 +- .../standard/transform/datetime/minute.py | 3 +- .../standard/transform/datetime/month.py | 3 +- .../transform/datetime/part_of_day.py | 3 +- .../standard/transform/datetime/quarter.py | 3 +- .../standard/transform/datetime/second.py | 3 +- .../standard/transform/datetime/time_since.py | 2 - .../standard/transform/datetime/week.py | 3 +- .../standard/transform/datetime/weekday.py | 3 +- .../standard/transform/datetime/year.py | 3 +- .../primitives/standard/transform/is_in.py | 2 - .../primitives/standard/transform/is_null.py | 2 - .../natural_language/num_characters.py | 3 +- .../transform/natural_language/num_words.py | 3 +- .../standard/transform/not_primitive.py | 2 - .../standard/transform/numeric/absolute.py | 3 +- .../standard/transform/numeric/cosine.py | 3 +- .../transform/numeric/natural_logarithm.py | 3 +- .../standard/transform/numeric/negate.py | 2 - .../standard/transform/numeric/sine.py | 3 +- .../standard/transform/numeric/square_root.py | 3 +- .../standard/transform/numeric/tangent.py | 3 +- .../transform/postal/one_digit_postal_code.py | 2 - .../transform/postal/two_digit_postal_code.py | 3 +- featuretools/primitives/utils.py | 30 +- .../synthesis/deep_feature_synthesis.py | 29 +- featuretools/synthesis/encode_features.py | 4 - .../synthesis/get_valid_primitives.py | 19 +- .../test_calculate_feature_matrix.py | 404 +++++------- .../test_dask_features.py | 37 -- .../computational_backend/test_feature_set.py | 8 +- .../test_feature_set_calculator.py | 255 +++----- .../tests/computational_backend/test_utils.py | 8 +- featuretools/tests/conftest.py | 576 +----------------- .../tests/entityset_tests/test_dask_es.py | 213 ------- 
featuretools/tests/entityset_tests/test_es.py | 576 +++--------------- .../tests/entityset_tests/test_es_metadata.py | 19 +- .../entityset_tests/test_last_time_index.py | 72 +-- .../tests/entityset_tests/test_plotting.py | 39 +- .../entityset_tests/test_serialization.py | 121 ++-- .../tests/entityset_tests/test_spark_es.py | 213 ------- .../tests/entityset_tests/test_timedelta.py | 9 +- .../tests/entityset_tests/test_ww_es.py | 261 ++------ .../test_agg_primitives.py | 4 +- .../test_rolling_primitive.py | 54 +- .../tests/primitive_tests/test_agg_feats.py | 70 +-- .../primitive_tests/test_dask_primitives.py | 122 ---- .../primitive_tests/test_direct_features.py | 24 +- .../test_feature_serialization.py | 85 ++- .../test_groupby_transform_primitives.py | 122 ++-- .../tests/primitive_tests/test_overrides.py | 15 +- .../primitive_tests/test_primitive_utils.py | 2 - .../test_rolling_primitive_utils.py | 140 ++--- .../test_transform_features.py | 513 +++++----------- .../test_cumulative_time_since.py | 8 +- .../test_expanding_primitives.py | 74 +-- .../test_full_name_primitives.py | 12 +- .../test_percent_change.py | 4 +- .../test_postal_primitives.py | 13 +- .../requirement_files/latest_requirements.txt | 1 - .../minimum_dask_requirements.txt | 1 - .../minimum_spark_requirements.txt | 12 - .../tests/selection/test_selection.py | 8 +- featuretools/tests/synthesis/test_dask_dfs.py | 512 ---------------- .../synthesis/test_deep_feature_synthesis.py | 146 +---- .../tests/synthesis/test_dfs_method.py | 87 +-- .../tests/synthesis/test_encode_features.py | 101 ++- .../synthesis/test_get_valid_primitives.py | 18 +- .../tests/synthesis/test_spark_dfs.py | 534 ---------------- featuretools/tests/testing_utils/__init__.py | 2 +- featuretools/tests/testing_utils/es_utils.py | 38 -- .../testing_utils/generate_fake_dataframe.py | 4 - featuretools/tests/testing_utils/mock_ds.py | 6 - .../tests/utils_tests/test_entry_point.py | 15 +- .../tests/utils_tests/test_gen_utils.py | 28 - featuretools/utils/gen_utils.py | 36 -- featuretools/utils/spark_utils.py | 49 -- pyproject.toml | 37 +- 174 files changed, 1297 insertions(+), 6487 deletions(-) delete mode 100644 docs/source/guides/using_dask_entitysets.ipynb delete mode 100644 docs/source/guides/using_spark_entitysets.ipynb rename featuretools/{tests/integration_data => feature_discovery}/__init__.py (100%) delete mode 100644 featuretools/tests/computational_backend/test_dask_features.py delete mode 100644 featuretools/tests/entityset_tests/test_dask_es.py delete mode 100644 featuretools/tests/entityset_tests/test_spark_es.py delete mode 100644 featuretools/tests/primitive_tests/test_dask_primitives.py delete mode 100644 featuretools/tests/requirement_files/minimum_spark_requirements.txt delete mode 100644 featuretools/tests/synthesis/test_dask_dfs.py delete mode 100644 featuretools/tests/synthesis/test_spark_dfs.py delete mode 100644 featuretools/utils/spark_utils.py diff --git a/.github/workflows/build_docs.yaml b/.github/workflows/build_docs.yaml index b5e759df17..05d4b104b6 100644 --- a/.github/workflows/build_docs.yaml +++ b/.github/workflows/build_docs.yaml @@ -50,7 +50,6 @@ jobs: sudo apt update sudo apt install -y pandoc sudo apt install -y graphviz - sudo apt install -y openjdk-11-jre-headless python -m pip check - name: Build docs run: make -C docs/ -e "SPHINXOPTS=-W -j auto" clean html diff --git a/.github/workflows/install_test.yaml b/.github/workflows/install_test.yaml index a7eeb17ee8..124c81a12f 100644 --- a/.github/workflows/install_test.yaml +++ 
b/.github/workflows/install_test.yaml @@ -14,10 +14,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python_version: ["3.9", "3.10"] - exclude: - - python_version: "3.10" - os: macos-latest + python_version: ["3.9", "3.10", "3.11"] runs-on: ${{ matrix.os }} steps: - name: Checkout repository @@ -31,29 +28,19 @@ jobs: python-version: ${{ matrix.python_version }} cache: 'pip' cache-dependency-path: 'pyproject.toml' - - uses: actions/cache@v3 - id: cache - with: - path: ${{ env.pythonLocation }} - key: ${{ matrix.os- }}-${{ matrix.python_version }}-install-${{ env.pythonLocation }}-${{ hashFiles('**/pyproject.toml') }}-v01 - name: Build featuretools package run: | make package - - name: Install complete version of featuretools from sdist (not using cache) - if: steps.cache.outputs.cache-hit != 'true' + - name: Install complete version of featuretools from sdist run: | python -m pip install "unpacked_sdist/[complete]" - - name: Install complete version of featuretools from sdist (using cache) - if: steps.cache.outputs.cache-hit == 'true' - run: | - python -m pip install "unpacked_sdist/[complete]" --no-deps - name: Test by importing packages run: | - python -c "import alteryx_open_src_update_checker" - python -c "from featuretools_sql import DBConnector" + python -c "import premium_primitives" + python -c "from nlp_primitives import PolarityScore" - name: Check package conflicts run: | python -m pip check - name: Verify extra_requires commands run: | - python -m pip install "unpacked_sdist/[nlp,spark,updater,sql]" + python -m pip install "unpacked_sdist/[nlp]" diff --git a/.github/workflows/latest_dependency_checker.yaml b/.github/workflows/latest_dependency_checker.yaml index e010dcb36c..b5a82bd423 100644 --- a/.github/workflows/latest_dependency_checker.yaml +++ b/.github/workflows/latest_dependency_checker.yaml @@ -23,7 +23,7 @@ jobs: - name: Update dependencies run: | python -m pip install --upgrade pip - python -m pip install -e ".[dask,spark,test]" + python -m pip install -e ".[dask,test]" make checkdeps OUTPUT_PATH=featuretools/tests/requirement_files/latest_requirements.txt - name: Create pull request uses: peter-evans/create-pull-request@v3 diff --git a/.github/workflows/looking_glass_airflow_performance_tests.yaml b/.github/workflows/looking_glass_airflow_performance_tests.yaml index 41dae4bcb4..16f59c9c24 100644 --- a/.github/workflows/looking_glass_airflow_performance_tests.yaml +++ b/.github/workflows/looking_glass_airflow_performance_tests.yaml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: true matrix: - test_type: ["pandas", "dask", "spark"] + test_type: ["pandas"] steps: - name: Generate default ISO timestamp run: | diff --git a/.github/workflows/minimum_dependency_checker.yaml b/.github/workflows/minimum_dependency_checker.yaml index 0a74157b6d..8b3e74bafa 100644 --- a/.github/workflows/minimum_dependency_checker.yaml +++ b/.github/workflows/minimum_dependency_checker.yaml @@ -38,14 +38,6 @@ jobs: options: 'dependencies' extras_require: 'dask' output_filepath: featuretools/tests/requirement_files/minimum_dask_requirements.txt - - name: Run min dep generator - spark - id: min_dep_gen_spark - uses: alteryx/minimum-dependency-generator@v3 - with: - paths: 'pyproject.toml' - options: 'dependencies' - extras_require: 'spark' - output_filepath: featuretools/tests/requirement_files/minimum_spark_requirements.txt - name: Create Pull Request uses: peter-evans/create-pull-request@v3 with: diff --git a/.github/workflows/tests_with_latest_deps.yaml 
b/.github/workflows/tests_with_latest_deps.yaml index 9f05cae043..76535d7164 100644 --- a/.github/workflows/tests_with_latest_deps.yaml +++ b/.github/workflows/tests_with_latest_deps.yaml @@ -8,13 +8,12 @@ on: workflow_dispatch: jobs: tests: - name: ${{ matrix.python_version }} tests ${{ matrix.libraries }} + name: ${{ matrix.python_version }} unit tests runs-on: ubuntu-latest strategy: fail-fast: false matrix: python_version: ["3.9", "3.10", "3.11"] - libraries: ["core", "spark/dask - misc", "spark/dask - computational", "spark/dask - entityset_1", "spark/dask - entityset_2", "spark/dask - primitives"] steps: - uses: actions/setup-python@v4 @@ -32,20 +31,11 @@ jobs: pip config --site set global.progress_bar off python -m pip install --upgrade pip sudo apt update && sudo apt install -y graphviz - - if: ${{ !startsWith(matrix.libraries, 'spark/dask') }} - name: Install featuretools with test requirements + - name: Install featuretools with test requirements run: | python -m pip install -e unpacked_sdist/ python -m pip install -e unpacked_sdist/[test] - - if: ${{ startsWith(matrix.libraries, 'spark/dask') }} - name: Install spark pkg, featuretools with test requirements and spark/dask requirements - run: | - sudo apt install -y openjdk-11-jre-headless - JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" - python -m pip install -e unpacked_sdist/[dask] - python -m pip install -e unpacked_sdist/[spark] - python -m pip install -e unpacked_sdist/[test] - - if: ${{ matrix.python_version == 3.9 && startsWith(matrix.libraries, 'spark/dask') }} + - if: ${{ matrix.python_version == 3.9 }} name: Generate coverage args run: echo "coverage_args=--cov=featuretools --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml" >> $GITHUB_ENV - if: ${{ env.coverage_args }} @@ -53,36 +43,10 @@ jobs: run: | cd unpacked_sdist coverage erase - - if: ${{ !startsWith(matrix.libraries, 'spark/dask') }} - name: Run unit tests (no code coverage) - run: | - cd unpacked_sdist - pytest featuretools/ -n auto - - if: ${{ matrix.libraries == 'spark/dask - misc' }} - name: Run unit tests (misc) - run: | - cd unpacked_sdist - pytest featuretools/ -n auto --ignore=featuretools/tests/computational_backend --ignore=featuretools/tests/entityset_tests --ignore=featuretools/tests/primitive_tests ${{ env.coverage_args }} - - if: ${{ matrix.libraries == 'spark/dask - computational' }} - name: Run unit tests (computational backend) - run: | - cd unpacked_sdist - pytest featuretools/tests/computational_backend/ -n auto ${{ env.coverage_args }} - - if: ${{ matrix.libraries == 'spark/dask - entityset_1' }} - name: Run unit tests (entityset batch 1) - run: | - cd unpacked_sdist - pytest featuretools/tests/entityset_tests -n auto --ignore=featuretools/tests/entityset_tests/test_es.py --ignore=featuretools/tests/entityset_tests/test_ww_es.py ${{ env.coverage_args }} - - if: ${{ matrix.libraries == 'spark/dask - entityset_2' }} - name: Run unit tests (entityset batch 2) - run: | - cd unpacked_sdist - pytest featuretools/tests/entityset_tests/test_es.py featuretools/tests/entityset_tests/test_ww_es.py ${{ env.coverage_args }} - - if: ${{ matrix.libraries == 'spark/dask - primitives' }} - name: Run unit tests (primitives) + - name: Run unit tests run: | cd unpacked_sdist - pytest featuretools/tests/primitive_tests -n auto ${{ env.coverage_args }} + pytest featuretools/ -n auto ${{ env.coverage_args }} - if: ${{ env.coverage_args }} name: Upload coverage to Codecov uses: codecov/codecov-action@v3 @@ -109,7 +73,7 @@ jobs: $ProgressPreference = 
"silentlyContinue" Invoke-WebRequest -Uri $Uri -Outfile "$env:USERPROFILE/$File" $hashFromFile = Get-FileHash "$env:USERPROFILE/$File" -Algorithm SHA256 - $hashFromUrl = "ff53a36b7024f8398cbfd043020f1f662cd4c5c2095c0007ddb4348aa5459375" + $hashFromUrl = "21b56b75861573ec8ab146d555b20e1ed4462a06aa286d7e92a1cd31acc64dba" if ($hashFromFile.Hash -ne "$hashFromUrl") { Throw "$File hashes do not match" } diff --git a/.github/workflows/tests_with_minimum_deps.yaml b/.github/workflows/tests_with_minimum_deps.yaml index b23da4a69f..75deea3762 100644 --- a/.github/workflows/tests_with_minimum_deps.yaml +++ b/.github/workflows/tests_with_minimum_deps.yaml @@ -7,13 +7,13 @@ on: - main workflow_dispatch: jobs: - py38_tests_minimum_dependencies: + py39_tests_minimum_dependencies: name: Tests - 3.9 Minimum Dependencies runs-on: ubuntu-latest strategy: fail-fast: false matrix: - libraries: ["core", "dask", "spark - misc", "spark - computational", "spark - entityset_1", "spark - entityset_2", "spark - primitives"] + python_version: ["3.9"] steps: - name: Checkout repository uses: actions/checkout@v3 @@ -33,59 +33,14 @@ jobs: - name: Install featuretools with no dependencies run: | python -m pip install -e . --no-dependencies - - if: ${{ startsWith(matrix.libraries, 'spark') }} - name: Install numpy for spark - run: | - NUMPY_VERSION=$(cat featuretools/tests/requirement_files/minimum_spark_requirements.txt | grep numpy) - python -m pip uninstall numpy -y - python -m pip install $NUMPY_VERSION --no-build-isolation - - if: ${{ matrix.libraries == 'core' }} - name: Install numpy for core - run: | - NUMPY_VERSION=$(cat featuretools/tests/requirement_files/minimum_core_requirements.txt | grep numpy) - python -m pip uninstall numpy -y - python -m pip install $NUMPY_VERSION --no-build-isolation - - if: ${{ matrix.libraries == 'dask' }} - name: Install numpy for dask - run: | - NUMPY_VERSION=$(cat featuretools/tests/requirement_files/minimum_dask_requirements.txt | grep numpy) - python -m pip uninstall numpy -y - python -m pip install $NUMPY_VERSION --no-build-isolation - name: Install featuretools - minimum tests dependencies run: | python -m pip install -r featuretools/tests/requirement_files/minimum_test_requirements.txt - - if: ${{ startsWith(matrix.libraries, 'spark') }} - name: Install featuretools - minimum spark, core dependencies - run: | - sudo apt install -y openjdk-11-jre-headless - JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" - python -m pip install -r featuretools/tests/requirement_files/minimum_spark_requirements.txt - - if: ${{ matrix.libraries == 'core' }} - name: Install featuretools - minimum core dependencies + - name: Install featuretools - minimum core dependencies run: | python -m pip install -r featuretools/tests/requirement_files/minimum_core_requirements.txt - - if: ${{ matrix.libraries == 'dask' }} - name: Install featuretools - minimum dask dependencies + - name: Install featuretools - minimum Dask dependencies run: | python -m pip install -r featuretools/tests/requirement_files/minimum_dask_requirements.txt - - if: ${{ matrix.libraries == 'core' }} - name: Run unit tests without code coverage - run: python -m pytest -x -n auto featuretools/tests/ - - if: ${{ matrix.libraries == 'dask' }} - name: Run dask unit tests without code coverage - run: python -m pytest -x -n auto featuretools/tests/ - - if: ${{ matrix.libraries == 'spark - misc' }} - name: Run unit tests (misc) - run: pytest featuretools/ -n auto --ignore=featuretools/tests/computational_backend 
--ignore=featuretools/tests/entityset_tests --ignore=featuretools/tests/primitive_tests - - if: ${{ matrix.libraries == 'spark - computational' }} - name: Run unit tests (computational backend) - run: pytest featuretools/tests/computational_backend/ -n auto - - if: ${{ matrix.libraries == 'spark - entityset_1' }} - name: Run unit tests (entityset batch 1) - run: pytest featuretools/tests/entityset_tests -n auto --ignore=featuretools/tests/entityset_tests/test_es.py --ignore=featuretools/tests/entityset_tests/test_ww_es.py - - if: ${{ matrix.libraries == 'spark - entityset_2' }} - name: Run unit tests (entityset batch 2) - run: pytest featuretools/tests/entityset_tests/test_es.py featuretools/tests/entityset_tests/test_ww_es.py - - if: ${{ matrix.libraries == 'spark - primitives' }} - name: Run unit tests (primitives) - run: pytest featuretools/tests/primitive_tests -n auto + - name: Run unit tests without code coverage + run: python -m pytest -x -n auto featuretools/tests/ \ No newline at end of file diff --git a/.github/workflows/tests_with_woodwork_main_branch.yaml b/.github/workflows/tests_with_woodwork_main_branch.yaml index 1a3f17d642..f6cacfe810 100644 --- a/.github/workflows/tests_with_woodwork_main_branch.yaml +++ b/.github/workflows/tests_with_woodwork_main_branch.yaml @@ -10,7 +10,6 @@ jobs: fail-fast: true matrix: python_version: ["3.9", "3.10", "3.11"] - libraries: ["core", "spark - misc", "spark - computational", "spark - entityset_1", "spark - entityset_2", "spark - primitives"] steps: - uses: actions/setup-python@v4 @@ -25,13 +24,6 @@ jobs: pip config --site set global.progress_bar off python -m pip install -U pip sudo apt update && sudo apt install -y graphviz - - if: ${{ startsWith(matrix.libraries, 'spark')}} - name: Install Woodwork & Featuretools with spark pkg - spark requirements - run: | - sudo apt install -y openjdk-11-jre-headless - JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" - PYSPARK_SUBMIT_ARGS="--master local[2] pyspark-shell" - python -m pip install -e unpacked_sdist/[spark] - name: Install Woodwork & Featuretools - test requirements run: | python -m pip install -e unpacked_sdist/[test] @@ -39,26 +31,10 @@ jobs: python -m pip install https://github.com/alteryx/woodwork/archive/main.zip - name: Log test run info run: | - echo "Run unit tests without code coverage for ${{ matrix.python_version }} and ${{ matrix.libraries }}" + echo "Run unit tests without code coverage for ${{ matrix.python_version }}" echo "Testing with woodwork version:" `python -c "import woodwork; print(woodwork.__version__)"` - - if: ${{ matrix.libraries == 'core' }} - name: Run unit tests without code coverage + - name: Run unit tests without code coverage run: pytest featuretools/ -n auto - - if: ${{ matrix.libraries == 'spark - misc' }} - name: Run unit tests (misc) - run: pytest featuretools/ -n auto --ignore=featuretools/tests/computational_backend --ignore=featuretools/tests/entityset_tests --ignore=featuretools/tests/primitive_tests - - if: ${{ matrix.libraries == 'spark - computational' }} - name: Run unit tests (computational backend) - run: pytest featuretools/tests/computational_backend/ -n auto - - if: ${{ matrix.libraries == 'spark - entityset_1' }} - name: Run unit tests (entityset batch 1) - run: pytest featuretools/tests/entityset_tests -n auto --ignore=featuretools/tests/entityset_tests/test_es.py --ignore=featuretools/tests/entityset_tests/test_ww_es.py - - if: ${{ matrix.libraries == 'spark - entityset_2' }} - name: Run unit tests (entityset batch 2) - run: pytest 
featuretools/tests/entityset_tests/test_es.py featuretools/tests/entityset_tests/test_ww_es.py - - if: ${{ matrix.libraries == 'spark - primitives' }} - name: Run unit tests (primitives) - run: pytest featuretools/tests/primitive_tests -n auto slack_alert_failure: name: Send Slack alert if failure diff --git a/Makefile b/Makefile index 323ef82282..83f1848d15 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ installdeps-test: upgradepip .PHONY: checkdeps checkdeps: - $(eval allow_list='holidays|scipy|numpy|pandas|tqdm|cloudpickle|distributed|dask|psutil|pyspark|woodwork') + $(eval allow_list='holidays|scipy|numpy|pandas|tqdm|cloudpickle|distributed|dask|psutil|woodwork') pip freeze | grep -v "alteryx/featuretools.git" | grep -E $(allow_list) > $(OUTPUT_PATH) .PHONY: upgradepip diff --git a/README.md b/README.md index 0bf3eb8bbd..cfa7d6050c 100644 --- a/README.md +++ b/README.md @@ -47,41 +47,30 @@ conda install -c conda-forge featuretools ### Add-ons -You can install add-ons individually or all at once by running +You can install add-ons individually or all at once by running: ``` python -m pip install "featuretools[complete]" ``` -**Update checker** - Receive automatic notifications of new Featuretools releases - -``` -python -m pip install "featuretools[updater]" -``` - -**Premium Primitives** - Use Premium Primitives, including Natural Language Processing primitives: +**Premium Primitives** - Use Premium Primitives from the premium-primitives repo ``` python -m pip install "featuretools[premium]" ``` -**TSFresh Primitives** - Use 60+ primitives from [tsfresh](https://tsfresh.readthedocs.io/en/latest/) within Featuretools +**NLP Primitives** - Use Natural Language Primitives from the nlp-primitives repo ``` -python -m pip install "featuretools[tsfresh]" +python -m pip install "featuretools[nlp]" ``` -**Dask Support** - Use Dask Dataframes to create EntitySets or run DFS with njobs > 1 +**Dask Support** - Use Dask to run DFS with njobs > 1 ``` python -m pip install "featuretools[dask]" ``` -**SQL** - Automatic EntitySet generation from relational data stored in a SQL database: - -``` -python -m pip install "featuretools[sql]" -``` ## Example Below is an example of using Deep Feature Synthesis (DFS) to perform automated feature engineering. In this example, we apply DFS to a multi-table dataset consisting of timestamped customer transactions. diff --git a/contributing.md b/contributing.md index e746f6260f..b4dcff926c 100644 --- a/contributing.md +++ b/contributing.md @@ -40,41 +40,23 @@ Before starting major work, you should touch base with the maintainers of Featur git checkout -b issue####-branch_name ``` -* You will need to install Spark, Scala, GraphViz, and Pandoc to run all unit tests & build docs: - - > If you do not install Spark/Scala, you can still run the unit tests (the Spark tests will be skipped). +* You will need to install GraphViz, and Pandoc to run all unit tests & build docs: > Pandoc is only needed to build the documentation locally. 
**macOS (Intel)** (use [Homebrew](https://brew.sh/)): ```console - brew tap AdoptOpenJDK/openjdk - brew install --cask adoptopenjdk11 - brew install scala apache-spark graphviz pandoc - echo 'export JAVA_HOME=$(/usr/libexec/java_home)' >> ~/.zshrc - echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc + brew install graphviz pandoc ``` **macOS (M1)** (use [Homebrew](https://brew.sh/)): ```console - brew install openjdk@11 scala apache-spark graphviz pandoc - echo 'export PATH="/opt/homebrew/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc - echo 'export CPPFLAGS="-I/opt/homebrew/opt/openjdk@11/include:$CPPFLAGS"' >> ~/.zprofile - sudo ln -sfn /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk /Library/Java/JavaVirtualMachines/openjdk-11.jdk + brew install graphviz pandoc ``` **Ubuntu**: ```console - sudo apt install openjdk-11-jre openjdk-11-jdk scala graphviz pandoc -y - echo "export SPARK_HOME=/opt/spark" >> ~/.profile - echo "export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin" >> ~/.profile - echo "export PYSPARK_PYTHON=/usr/bin/python3" >> ~/.profile - ``` - - **Amazon Linux**: - ```console - sudo amazon-linux-extras install java-openjdk11 scala -y - amazon-linux-extras enable java-openjdk11 + sudo apt install graphviz pandoc -y ``` #### 2. Implement your Pull Request diff --git a/docs/source/getting_started/using_entitysets.ipynb b/docs/source/getting_started/using_entitysets.ipynb index 2fa9202770..e93701823a 100644 --- a/docs/source/getting_started/using_entitysets.ipynb +++ b/docs/source/getting_started/using_entitysets.ipynb @@ -325,15 +325,13 @@ { "cell_type": "raw", "metadata": { - "raw_mimetype": "text/restructuredtext" + "raw_mimetype": "text/restructuredtext", + "vscode": { + "languageId": "raw" + } }, "source": [ - "As we can see, the features from DFS use the relational structure of our `EntitySet`. Therefore it is important to think carefully about the dataframes that we create.\n", - "\n", - "Dask and Spark EntitySets\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "\n", - "EntitySets can also be created using Dask dataframes or Spark dataframes. For more information refer to :doc:`../guides/using_dask_entitysets` and :doc:`../guides/using_spark_entitysets`." + "As we can see, the features from DFS use the relational structure of our `EntitySet`. Therefore it is important to think carefully about the dataframes that we create." ] } ], diff --git a/docs/source/getting_started/woodwork_types.ipynb b/docs/source/getting_started/woodwork_types.ipynb index 0d01de8fae..105d9c37df 100644 --- a/docs/source/getting_started/woodwork_types.ipynb +++ b/docs/source/getting_started/woodwork_types.ipynb @@ -21,7 +21,7 @@ "## Physical Types \n", "Physical types define how the data in a Woodwork DataFrame is stored on disk or in memory. You might also see the physical type for a column referred to as the column’s `dtype`.\n", "\n", - "Knowing a Woodwork DataFrame's physical types is important because Pandas, Dask, and Spark rely on these types when performing DataFrame operations. Each Woodwork `LogicalType` class has a single physical type associated with it.\n", + "Knowing a Woodwork DataFrame's physical types is important because Pandas relies on these types when performing DataFrame operations. Each Woodwork `LogicalType` class has a single physical type associated with it.\n", "\n", "## Logical Types\n", "Logical types add additional information about how data should be interpreted or parsed beyond what can be contained in a physical type. 
In fact, multiple logical types have the same physical type, each imparting a different meaning that's not contained in the physical type alone.\n", diff --git a/docs/source/guides/guides_index.rst b/docs/source/guides/guides_index.rst index edfb27630d..b3d4d726ac 100644 --- a/docs/source/guides/guides_index.rst +++ b/docs/source/guides/guides_index.rst @@ -9,8 +9,6 @@ Guides on more advanced Featuretools functionality tuning_dfs specifying_primitive_options performance - using_dask_entitysets - using_spark_entitysets deployment advanced_custom_primitives feature_descriptions diff --git a/docs/source/guides/performance.ipynb b/docs/source/guides/performance.ipynb index b134dcd0f6..57490b0374 100644 --- a/docs/source/guides/performance.ipynb +++ b/docs/source/guides/performance.ipynb @@ -44,22 +44,7 @@ "\n", "## Parallel Feature Computation\n", "\n", - "Computational performance can often be improved by parallelizing the feature calculation process. There are several different approaches that can be used to perform parallel feature computation with Featuretools. An overview of the most commonly used approaches is provided below.\n", - "\n", - "### Computation with Dask and Spark EntitySets (BETA)" - ] - }, - { - "cell_type": "raw", - "id": "abd2207f", - "metadata": { - "raw_mimetype": "text/restructuredtext" - }, - "source": [ - ".. note::\n", - " Support for Dask EntitySets and Spark EntitySets is still in Beta. While the key functionality has been implemented, development is ongoing to add the remaining functionality.\n", - "\n", - " All planned improvements to the Featuretools/Dask and Featuretools/Spark integration are documented on Github (`Dask issues `_, `Spark issues `_). If you see an open issue that is important for your application, please let us know by upvoting or commenting on the issue. If you encounter any errors using Dask or Spark dataframes, or find missing functionality that does not yet have an open issue, please create a `new issue on Github `_." + "Computational performance can often be improved by parallelizing the feature calculation process. There are several different approaches that can be used to perform parallel feature computation with Featuretools. An overview of the most commonly used approaches is provided below." ] }, { @@ -67,9 +52,6 @@ "id": "b47e770f", "metadata": {}, "source": [ - "Dask or Spark can be used with Featuretools to perform parallel feature computation with virtually no changes to the workflow required. Featuretools supports creating an `EntitySet` directly from Dask or Spark dataframes instead of using pandas dataframes, enabling the parallel and distributed computation capabilities of Dask or Spark to be used. By creating an `EntitySet` directly from Dask or Spark dataframes, Featuretools can be used to generate a larger-than-memory feature matrix, something that may be difficult with other approaches. When computing a feature matrix from an `EntitySet` created from Dask or Spark dataframes, the resulting feature matrix will be returned as a Dask or Spark dataframe depending on which type was used.\n", - "\n", - "These methods do have some limitations in terms of the primitives that are available and the optional parameters that can be used when calculating the feature matrix. 
For more information on generating a feature matrix with this approach, refer to the guides [Using Dask Entitysets (BETA)](using_dask_entitysets.ipynb) and [Using Spark Entitysets (BETA)](using_spark_entitysets.ipynb).\n", "\n", "### Simple Parallel Feature Computation\n", "If using a pandas `EntitySet`, Featuretools can optionally compute features on multiple cores. The simplest way to control the amount of parallelism is to specify the `n_jobs` parameter:\n", @@ -167,17 +149,6 @@ "As an alternative to Featuretools' parallelization, the data can be partitioned and the feature calculations run on multiple cores or a cluster using Dask or Apache Spark with PySpark. This approach may be necessary with a large pandas `EntitySet` because the current parallel implementation sends the entire `EntitySet` to each worker which may exhaust the worker memory. Dask and Spark allow Featuretools to scale to multiple cores on a single machine or multiple machines on a cluster." ] }, - { - "cell_type": "raw", - "id": "94c74ef7", - "metadata": { - "raw_mimetype": "text/restructuredtext" - }, - "source": [ - ".. note::\n", - " Partitioning data is not necessary when using a Dask or Spark ``EntitySet``, as the Dask or Spark dataframes that make up the ``EntitySet`` are already partitioned. Partitioning is only needed when working with pandas dataframes." - ] - }, { "cell_type": "markdown", "id": "795cc323", @@ -189,7 +160,7 @@ "\n", "An additional example of partitioning data to distribute on multiple cores or a cluster using Dask can be seen in the [Featuretools on Dask notebook](https://github.com/Featuretools/Automated-Manual-Comparison/blob/main/Loan%20Repayment/notebooks/Featuretools%20on%20Dask.ipynb). This approach is detailed in the [Parallelizing Feature Engineering with Dask article](https://medium.com/feature-labs-engineering/scaling-featuretools-with-dask-ce46f9774c7d) on the Feature Labs engineering blog. Dask allows for simple scaling to multiple cores on a single computer or multiple machines on a cluster.\n", "\n", - "For a similar partition and distribute implementation using Apache Spark with PySpark, refer to the [Feature Engineering on Spark notebook](https://github.com/Featuretools/predict-customer-churn/blob/main/churn/4.%20Feature%20Engineering%20on%20Spark.ipynb). This implementation shows how to carry out feature engineering on a cluster of EC2 instances using Spark as the distributed framework. A write-up of this approach is described in the [Featuretools on Spark article](https://blog.featurelabs.com/featuretools-on-spark-2/) on the Feature Labs engineering blog." + "For a similar partition and distribute implementation using Apache Spark with PySpark, refer to the [Feature Engineering on Spark notebook](https://github.com/Featuretools/predict-customer-churn/blob/main/churn/4.%20Feature%20Engineering%20on%20Spark.ipynb). This implementation shows how to carry out feature engineering on a cluster of EC2 instances using Spark as the distributed framework." ] } ], diff --git a/docs/source/guides/tuning_dfs.ipynb b/docs/source/guides/tuning_dfs.ipynb index 00e2e917de..7bae5d3c06 100644 --- a/docs/source/guides/tuning_dfs.ipynb +++ b/docs/source/guides/tuning_dfs.ipynb @@ -66,9 +66,7 @@ "\n", "By default, where clauses are built using the ``interesting_values`` of a column.\n", "\n", - "Interesting values can be automatically determined and added for each DataFrame in a pandas EntitySet by calling `es.add_interesting_values()`. 
\n", - "\n", - "Note that Dask and Spark EntitySets cannot have interesting values determined automatically for their DataFrames. For those EntitySets, or when interesting values are already known for columns, the `dataframe_name` and `values` parameters can be used to set interesting values for individual columns in a DataFrame in an EntitySet." + "Interesting values can be automatically determined and added for each DataFrame in a pandas EntitySet by calling `es.add_interesting_values()`." ] }, { @@ -231,7 +229,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/docs/source/guides/using_dask_entitysets.ipynb b/docs/source/guides/using_dask_entitysets.ipynb deleted file mode 100644 index e178507c4d..0000000000 --- a/docs/source/guides/using_dask_entitysets.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "99052370", - "metadata": {}, - "source": [ - "# Using Dask EntitySets (BETA)" - ] - }, - { - "cell_type": "raw", - "id": "84275fe9", - "metadata": { - "raw_mimetype": "text/restructuredtext" - }, - "source": [ - ".. note::\n", - " Support for Dask EntitySets is still in Beta. While the key functionality has been implemented, development is ongoing to add the remaining functionality.\n", - "\n", - " All planned improvements to the Featuretools/Dask integration are `documented on Github `_. If you see an open issue that is important for your application, please let us know by upvoting or commenting on the issue. If you encounter any errors using Dask dataframes in EntitySets, or find missing functionality that does not yet have an open issue, please create a `new issue on Github `_." - ] - }, - { - "cell_type": "markdown", - "id": "49c496cb", - "metadata": {}, - "source": [ - "Creating a feature matrix from a very large dataset can be problematic if the underlying pandas dataframes that make up the EntitySet cannot easily fit in memory. To help get around this issue, Featuretools supports creating `EntitySet` objects from Dask dataframes. A Dask `EntitySet` can then be passed to `featuretools.dfs` or `featuretools.calculate_feature_matrix` to create a feature matrix, which will be returned as a Dask dataframe. In addition to working on larger than memory datasets, this approach also allows users to take advantage of the parallel and distributed processing capabilities offered by Dask.\n", - "\n", - "This guide will provide an overview of how to create a Dask `EntitySet` and then generate a feature matrix from it. If you are already familiar with creating a feature matrix starting from pandas DataFrames, this process will seem quite familiar, as there are no differences in the process. There are, however, some limitations when using Dask dataframes, and those limitations are reviewed in more detail below.\n", - "\n", - "## Creating EntitySets\n", - "\n", - "For this example, we will create a very small pandas DataFrame and then convert this into a Dask DataFrame to use in the remainder of the process. Normally when using Dask, you would just read your data directly into a Dask DataFrame without the intermediate step of using pandas." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96a8f65a", - "metadata": {}, - "outputs": [], - "source": [ - "import dask.dataframe as dd\n", - "import pandas as pd\n", - "\n", - "import featuretools as ft\n", - "\n", - "id = [0, 1, 2, 3, 4]\n", - "values = [12, -35, 14, 103, -51]\n", - "df = pd.DataFrame({\"id\": id, \"values\": values})\n", - "dask_df = dd.from_pandas(df, npartitions=2)\n", - "\n", - "dask_df" - ] - }, - { - "cell_type": "markdown", - "id": "e0c3d410", - "metadata": {}, - "source": [ - "Now that we have our Dask DataFrame, we can start to create the `EntitySet`. Inferring Woodwork logical types for the columns in a Dask dataframe can be computationally expensive. To avoid this expense, logical type inference can be skipped by supplying a dictionary of logical types using the `logical_types` parameter when calling `es.add_dataframe()`. Logical types can be specified as Woodwork LogicalType classes, or their equivalent string representation. For more information refer to the [Woodwork Typing in Featuretools](../getting_started/woodwork_types.ipynb) guide.\n", - "\n", - "Aside from supplying the logical types, the rest of the process of creating an `EntitySet` is the same as if we were using pandas DataFrames." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffe671d9", - "metadata": {}, - "outputs": [], - "source": [ - "from woodwork.logical_types import Double, Integer\n", - "\n", - "es = ft.EntitySet(id=\"dask_es\")\n", - "es = es.add_dataframe(\n", - " dataframe_name=\"dask_input_df\",\n", - " dataframe=dask_df,\n", - " index=\"id\",\n", - " logical_types={\"id\": Integer, \"values\": Double},\n", - ")\n", - "\n", - "es" - ] - }, - { - "cell_type": "markdown", - "id": "b2175c84", - "metadata": {}, - "source": [ - "Notice that when we print our `EntitySet`, the number of rows for the DataFrame named `dask_input_df` is returned as a Dask `Delayed` object. This is because obtaining the length of a Dask DataFrame may require an expensive compute operation to sum up the lengths of all the individual partitions that make up the DataFrame and that operation is not performed by default.\n", - "\n", - "\n", - "## Running DFS\n", - "We can pass the `EntitySet` we created above to `featuretools.dfs` in order to create a feature matrix. If the `EntitySet` we pass to `dfs` is made of Dask DataFrames, the feature matrix we get back will be a Dask DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a90e3640", - "metadata": {}, - "outputs": [], - "source": [ - "feature_matrix, features = ft.dfs(\n", - " entityset=es,\n", - " target_dataframe_name=\"dask_input_df\",\n", - " trans_primitives=[\"negate\"],\n", - " max_depth=1,\n", - ")\n", - "feature_matrix" - ] - }, - { - "cell_type": "markdown", - "id": "03e03d97", - "metadata": {}, - "source": [ - "This feature matrix can be saved to disk or computed and brought into memory, using the appropriate Dask DataFrame methods." 
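
The next cell shows the compute-into-memory option; for the save-to-disk option mentioned above, one common Dask approach (not prescribed by this guide) is writing the lazy result out as partitioned Parquet. A sketch, assuming a Parquet engine such as pyarrow is available and using an illustrative output path:

```python
# Write the lazy Dask feature matrix to disk as one Parquet file per partition.
# "feature_matrix_parquet/" is an illustrative output directory.
feature_matrix.to_parquet("feature_matrix_parquet/")
```
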
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19457b84", - "metadata": {}, - "outputs": [], - "source": [ - "fm_computed = feature_matrix.compute()\n", - "fm_computed" - ] - }, - { - "cell_type": "markdown", - "id": "af511e72", - "metadata": {}, - "source": [ - "While this is a simple example to illustrate the process of using Dask DataFrames with Featuretools, this process will also work with an `EntitySet` containing multiple dataframes, as well as with aggregation primitives.\n", - "\n", - "## Limitations\n", - "\n", - "The key functionality of Featuretools is available for use with a Dask `EntitySet`, and work is ongoing to add the remaining functionality that is available when using a pandas `EntitySet`. There are, however, some limitations to be aware of when creating a Dask `Entityset` and then using it to generate a feature matrix. The most significant limitations are reviewed in more detail in this section." - ] - }, - { - "cell_type": "raw", - "id": "a2212141", - "metadata": { - "raw_mimetype": "text/restructuredtext" - }, - "source": [ - ".. note::\n", - " If the limitations of using a Dask ``EntitySet`` are problematic for your problem, you may still be able to compute a larger-than-memory feature matrix by partitioning your data as described in :doc:`performance`." - ] - }, - { - "cell_type": "markdown", - "id": "7f99e3d0", - "metadata": {}, - "source": [ - "### Supported Primitives\n", - "\n", - "When creating a feature matrix from a Dask `EntitySet`, only certain primitives can be used. Primitives that rely on the order of the entire DataFrame or require an entire column for computation are currently not supported when using a Dask `EntitySet`. Multivariable and time-dependent aggregation primitives also are not currently supported.\n", - "\n", - "To obtain a list of the primitives that can be used with a Dask `EntitySet`, you can call `featuretools.list_primitives()`. This will return a table of all primitives. Any primitive that can be used with a Dask `EntitySet` will have a value of `True` in the `dask_compatible` column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7410cef", - "metadata": {}, - "outputs": [], - "source": [ - "primitives_df = ft.list_primitives()\n", - "dask_compatible_df = primitives_df[primitives_df[\"dask_compatible\"] == True]\n", - "dask_compatible_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc1c6d6b", - "metadata": {}, - "outputs": [], - "source": [ - "dask_compatible_df.tail()" - ] - }, - { - "cell_type": "markdown", - "id": "07aaee73", - "metadata": {}, - "source": [ - "### DataFrame Limitations\n", - "\n", - "Featuretools stores the DataFrames that make up an EntitySet as Woodwork DataFrames which include additional typing information about the columns that are in the DataFrame. When adding a DataFrame to an `EntitySet`, Woodwork will attempt to infer the logical types for any columns that do not have a logical type defined. This inference process can be quite expensive for Dask DataFrames. In order to skip type inference and speed up the process of adding a Dask DataFrame to an `EntitySet`, users can specify the logical type to use for each column in the DataFrame. A list of available logical types can be obtained by running ``featuretools.list_logical_types()``. 
To learn more about the limitations of a Dask dataframe with Woodwork typing, see the [Woodwork guide on Dask dataframes](https://woodwork.alteryx.com/en/stable/guides/using_woodwork_with_dask_and_spark.html#Dask-DataFrame-Example).\n", - "\n", - "By default, Woodwork checks that pandas DataFrames have unique index values. Because performing this same check with Dask would require an expensive compute operation, this check is not performed when adding a Dask DataFrame to an `EntitySet`. When using Dask DataFrames, users must ensure that the supplied index values are unique.\n", - "\n", - "When using a pandas DataFrames, the ordering of the underlying DataFrame rows is maintained by Featuretools. For a Dask DataFrame, the ordering of the DataFrame rows is not guaranteed, and Featuretools does not attempt to maintain row order. If ordering is important, close attention must be paid to any output to avoid issues.\n", - "\n", - "### EntitySet Limitations\n", - "\n", - "When creating a Featuretools `EntitySet` that will be made of Dask DataFrames, all of the DataFrames used to create the `EntitySet` must be of the same type, either all Dask DataFrames or all pandas DataFrames. Featuretools does not support creating an `EntitySet` containing a mix of Dask and pandas DataFrames.\n", - "\n", - "Additionally, ``EntitySet.add_interesting_values()`` cannot be used in Dask EntitySets to find interesting values; however, it can be used set a column's interesting values with the `values` parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a3c6b9b8", - "metadata": {}, - "outputs": [], - "source": [ - "values_dict = {\"values\": [12, 103]}\n", - "es.add_interesting_values(dataframe_name=\"dask_input_df\", values=values_dict)\n", - "\n", - "es[\"dask_input_df\"].ww.columns[\"values\"].metadata" - ] - }, - { - "cell_type": "markdown", - "id": "35d1b5c0", - "metadata": {}, - "source": [ - "\n", - "### DFS Limitations\n", - "\n", - "There are a few key limitations when generating a feature matrix from a Dask `EntitySet`.\n", - "\n", - "If a `cutoff_time` parameter is passed to `featuretools.dfs()` it should be a single cutoff time value, or a pandas DataFrame. The current implementation will still work if a Dask DataFrame is supplied for cutoff times, but a `.compute()` call will be made on the DataFrame to convert it into a pandas DataFrame. This conversion will result in a warning, and the process could take a considerable amount of time to complete depending on the size of the supplied DataFrame.\n", - "\n", - "Additionally, Featuretools does not currently support the use of the `approximate` or `training_window` parameters when working with Dask EntitySets, but should in future releases.\n", - "\n", - "Finally, if the output feature matrix contains a boolean column with `NaN` values included, the column type may have a different datatype than the same feature matrix generated from a pandas `EntitySet`. If feature matrix column data types are critical, the feature matrix should be inspected to make sure the types are of the expected types, and recast as necessary.\n", - "\n", - "### Other Limitations\n", - "\n", - "In some instances, generating a feature matrix with a large number of features has resulted in memory issues on Dask workers. The underlying reason for this is that the partition size of the feature matrix grows too large for Dask to handle as the number of feature columns grows large. 
This issue is most prevalent when the feature matrix contains a large number of columns compared to the DataFrames in the EntitySet. Possible solutions to this problem include reducing the partition size used when creating the DataFrames or increasing the memory available on Dask workers.\n", - "\n", - "Currently `featuretools.encode_features()` does not work with a Dask DataFrame as input. This will hopefully be resolved in a future release of Featuretools.\n", - "\n", - "The utility function `featuretools.make_temporal_cutoffs()` will not work properly with Dask inputs for `instance_ids` or `cutoffs`. However, as noted above, if a `cutoff_time` DataFrame is supplied to `dfs`, the supplied DataFrame should be a pandas DataFrame, and this can be generated by supplying pandas inputs to `make_temporal_cutoffs()`.\n", - "\n", - "The use of `featuretools.remove_low_information_features()` cannot currently be used with a Dask feature matrix.\n", - "\n", - "When manually defining a `Feature`, the `use_previous` parameter cannot be used if this feature will be applied to calculate a feature matrix from a Dask `EntitySet`.\n", - "\n", - "### Dask `string[pyarrow]`\n", - "Featuretools may have issues with the new string storage model used by Dask. To workaround this, add `dask.config.set({'dataframe.convert-string': False})`, prior to running dask operations." - ] - } - ], - "metadata": { - "celltoolbar": "Raw Cell Format", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/guides/using_spark_entitysets.ipynb b/docs/source/guides/using_spark_entitysets.ipynb deleted file mode 100644 index fb357ab735..0000000000 --- a/docs/source/guides/using_spark_entitysets.ipynb +++ /dev/null @@ -1,277 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "87378ca2", - "metadata": {}, - "source": [ - "# Using Spark EntitySets (BETA)" - ] - }, - { - "cell_type": "raw", - "id": "ac77ea82", - "metadata": { - "raw_mimetype": "text/restructuredtext" - }, - "source": [ - ".. note::\n", - " Support for Spark EntitySets is still in Beta. While the key functionality has been implemented, development is ongoing to add the remaining functionality.\n", - " \n", - " All planned improvements to the Featuretools/Spark integration are `documented on Github `_. If you see an open issue that is important for your application, please let us know by upvoting or commenting on the issue. If you encounter any errors using Spark dataframes in EntitySets, or find missing functionality that does not yet have an open issue, please create a `new issue on Github `_." - ] - }, - { - "cell_type": "markdown", - "id": "01778198", - "metadata": {}, - "source": [ - "Creating a feature matrix from a very large dataset can be problematic if the underlying pandas dataframes that make up the EntitySet cannot easily fit in memory. To help get around this issue, Featuretools supports creating ``EntitySet`` objects from Spark dataframes. A Spark ``EntitySet`` can then be passed to ``featuretools.dfs`` or ``featuretools.calculate_feature_matrix`` to create a feature matrix, which will be returned as a Spark dataframe. 
In addition to working on larger than memory datasets, this approach also allows users to take advantage of the parallel and distributed processing capabilities offered by Spark and Spark.\n", - "\n", - "This guide will provide an overview of how to create a Spark ``EntitySet`` and then generate a feature matrix from it. If you are already familiar with creating a feature matrix starting from pandas dataframes, this process will seem quite familiar, as there are no differences in the process. There are, however, some limitations when using Spark dataframes, and those limitations are reviewed in more detail below.\n", - "\n", - "## Creating EntitySets\n", - "\n", - "Spark ``EntitySets`` require PySpark. Both can be installed directly with ``pip install featuretools[spark]``. Java is also required for PySpark and may need to be installed, see [the Spark documentation](https://spark.apache.org/docs/latest/index.html) for more details. We will create a very small Spark dataframe for this example. Spark dataframes can also be created from pandas dataframes, Spark dataframes, or read in directly from a file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20acbac9", - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "import pyspark.sql as sql\n", - "\n", - "spark = (\n", - " sql.SparkSession.builder.master(\"local[2]\")\n", - " .config(\n", - " \"spark.driver.extraJavaOptions\", \"-Dio.netty.tryReflectionSetAccessible=True\"\n", - " )\n", - " .config(\"spark.sql.shuffle.partitions\", \"2\")\n", - " .config(\"spark.driver.bindAddress\", \"127.0.0.1\")\n", - " .getOrCreate()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af6545db", - "metadata": {}, - "outputs": [], - "source": [ - "import pyspark.pandas as ps\n", - "\n", - "import featuretools as ft\n", - "\n", - "ps.set_option(\"compute.default_index_type\", \"distributed\")\n", - "\n", - "id = [0, 1, 2, 3, 4]\n", - "values = [12, -35, 14, 103, -51]\n", - "spark_df = ps.DataFrame({\"id\": id, \"values\": values})\n", - "spark_df" - ] - }, - { - "cell_type": "markdown", - "id": "3c27b229", - "metadata": {}, - "source": [ - "Now that we have our Spark dataframe, we can start to create the ``EntitySet``. Inferring Woodwork logical types for the columns in a Spark dataframe can be computationally expensive. To avoid this expense, logical type inference can be skipped by supplying a dictionary of logical types using the `logical_types` parameter when calling `es.add_dataframe()`. Logical types can be specified as Woodwork LogicalType classes, or their equivalent string representation. For more information on using Woodwork types refer to the [Woodwork Typing in Featuretools](../getting_started/woodwork_types.ipynb) guide.\n", - "\n", - "Aside from supplying the logical types, the rest of the process of creating an `EntitySet` is the same as if we were using pandas DataFrames." 
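
The next cell passes the Woodwork `LogicalType` classes; the string representation mentioned in the text would look like the sketch below, reusing the `spark_df` DataFrame and `es` EntitySet from this guide (assuming Woodwork's standard string aliases for these types):

```python
# Same call as the next cell, but with logical types given as strings
# instead of the Integer / Double LogicalType classes.
es = es.add_dataframe(
    dataframe_name="spark_input_df",
    dataframe=spark_df,
    index="id",
    logical_types={"id": "Integer", "values": "Double"},
)
```
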
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5ca0be3", - "metadata": {}, - "outputs": [], - "source": [ - "from woodwork.logical_types import Double, Integer\n", - "\n", - "es = ft.EntitySet(id=\"spark_es\")\n", - "es = es.add_dataframe(\n", - " dataframe_name=\"spark_input_df\",\n", - " dataframe=spark_df,\n", - " index=\"id\",\n", - " logical_types={\"id\": Integer, \"values\": Double},\n", - ")\n", - "\n", - "es" - ] - }, - { - "cell_type": "markdown", - "id": "9d1b8525", - "metadata": {}, - "source": [ - "## Running DFS\n", - "\n", - "We can pass the ``EntitySet`` we created above to ``featuretools.dfs`` in order to create a feature matrix. If the ``EntitySet`` we pass to ``dfs`` is made of Spark dataframes, the feature matrix we get back will be a Spark dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc48d985", - "metadata": {}, - "outputs": [], - "source": [ - "feature_matrix, features = ft.dfs(\n", - " entityset=es,\n", - " target_dataframe_name=\"spark_input_df\",\n", - " trans_primitives=[\"negate\"],\n", - " max_depth=1,\n", - ")\n", - "feature_matrix" - ] - }, - { - "cell_type": "markdown", - "id": "a4f8ccfd", - "metadata": {}, - "source": [ - "This feature matrix can be saved to disk or converted to a pandas dataframe and brought into memory, using the appropriate Spark dataframe methods.\n", - "\n", - "While this is a simple example to illustrate the process of using Spark dataframes with Featuretools, this process will also work with an ``EntitySet`` containing multiple dataframes, as well as with aggregation primitives.\n", - "\n", - "## Limitations\n", - "\n", - "The key functionality of Featuretools is available for use with a Spark ``EntitySet``, and work is ongoing to add the remaining functionality that is available when using a pandas ``EntitySet``. There are, however, some limitations to be aware of when creating a Spark ``Entityset`` and then using it to generate a feature matrix. The most significant limitations are reviewed in more detail in this section." - ] - }, - { - "cell_type": "raw", - "id": "71efc4dc", - "metadata": { - "raw_mimetype": "text/restructuredtext" - }, - "source": [ - ".. note::\n", - " If the limitations of using a Spark ``EntitySet`` are problematic for your problem, you may still be able to compute a larger-than-memory feature matrix by partitioning your data as described in :doc:`performance`." - ] - }, - { - "cell_type": "markdown", - "id": "854c0156", - "metadata": {}, - "source": [ - "### Supported Primitives\n", - "\n", - "When creating a feature matrix from a Spark ``EntitySet``, only certain primitives can be used. Primitives that rely on the order of the entire dataframe or require an entire column for computation are currently not supported when using a Spark ``EntitySet``. Multivariable and time-dependent aggregation primitives also are not currently supported.\n", - "\n", - "To obtain a list of the primitives that can be used with a Spark ``EntitySet``, you can call ``featuretools.list_primitives()``. This will return a table of all primitives. Any primitive that can be used with a Spark ``EntitySet`` will have a value of ``True`` in the ``spark_compatible`` column." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bed6a8c8", - "metadata": {}, - "outputs": [], - "source": [ - "primitives_df = ft.list_primitives()\n", - "spark_compatible_df = primitives_df[primitives_df[\"spark_compatible\"] == True]\n", - "spark_compatible_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e5baee7", - "metadata": {}, - "outputs": [], - "source": [ - "spark_compatible_df.tail()" - ] - }, - { - "cell_type": "markdown", - "id": "8abc5442", - "metadata": {}, - "source": [ - "### DataFrame Limitations\n", - "\n", - "Featuretools stores the DataFrames that make up an EntitySet as Woodwork DataFrames, which include additional typing information about the columns that are in the DataFrame. When adding a DataFrame to an `EntitySet`, Woodwork will attempt to infer the logical types for any columns that do not have a logical type defined. This inference process can be quite expensive for Spark DataFrames. In order to skip type inference and speed up the process of adding a Spark DataFrame to an `EntitySet`, users can specify the logical type to use for each column in the DataFrame. A list of available logical types can be obtained by running ``featuretools.list_logical_types()``. To learn more about the limitations of a Spark dataframe with Woodwork typing, see the [Woodwork guide on Spark dataframes](https://woodwork.alteryx.com/en/stable/guides/using_woodwork_with_dask_and_spark.html#Spark-DataFrame-Example).\n", - "\n", - "By default, Woodwork checks that pandas dataframes have unique index values. Because performing this same check with Spark could be computationally expensive, this check is not performed when adding a Spark dataframe to an `EntitySet`. When using Spark dataframes, users must ensure that the supplied index values are unique.\n", - "\n", - "When using a pandas DataFrames, the ordering of the underlying DataFrame rows is maintained by Featuretools. For a Spark DataFrame, the ordering of the DataFrame rows is not guaranteed, and Featuretools does not attempt to maintain row order in a Spark DataFrame. If ordering is important, close attention must be paid to any output to avoid issues.\n", - "\n", - "### EntitySet Limitations\n", - "\n", - "When creating a Featuretools ``EntitySet`` that will be made of Spark dataframes, all of the dataframes used to create the ``EntitySet`` must be of the same type, either all Spark dataframe, all Dask dataframes, or all pandas dataframes. Featuretools does not support creating an ``EntitySet`` containing a mix of Spark, Dask, and pandas dataframes.\n", - "\n", - "Additionally, ``EntitySet.add_interesting_values()`` cannot be used in Spark EntitySets to find interesting values; however, it can be used set a column's interesting values with the `values` parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e13e5ef", - "metadata": {}, - "outputs": [], - "source": [ - "values_dict = {\"values\": [12, 103]}\n", - "es.add_interesting_values(dataframe_name=\"spark_input_df\", values=values_dict)\n", - "\n", - "es[\"spark_input_df\"].ww.columns[\"values\"].metadata" - ] - }, - { - "cell_type": "markdown", - "id": "80f6b033", - "metadata": {}, - "source": [ - "\n", - "### DFS Limitations\n", - "\n", - "There are a few key limitations when generating a feature matrix from a Spark ``EntitySet``.\n", - "\n", - "If a ``cutoff_time`` parameter is passed to ``featuretools.dfs()`` it should be a single cutoff time value, or a pandas dataframe. 
The current implementation will still work if a Spark dataframe is supplied for cutoff times, but a ``.to_pandas()`` call will be made on the dataframe to convert it into a pandas dataframe. This conversion will result in a warning, and the process could take a considerable amount of time to complete depending on the size of the supplied dataframe.\n", - "\n", - "Additionally, Featuretools does not currently support the use of the ``approximate`` or ``training_window`` parameters when working with Spark EntitySets, but should in future releases.\n", - "\n", - "Finally, if the output feature matrix contains a boolean column with ``NaN`` values included, the column type may have a different datatype than the same feature matrix generated from a pandas ``EntitySet``. If feature matrix column data types are critical, the feature matrix should be inspected to make sure the types are of the proper types, and recast as necessary.\n", - "\n", - "### Other Limitations\n", - "\n", - "Currently ``featuretools.encode_features()`` does not work with a Spark dataframe as input. This will hopefully be resolved in a future release of Featuretools.\n", - "\n", - "The utility function ``featuretools.make_temporal_cutoffs()`` will not work properly with Spark inputs for ``instance_ids`` or ``cutoffs``. However, as noted above, if a ``cutoff_time`` dataframe is supplied to ``dfs``, the supplied dataframe should be a pandas dataframe, and this can be generated by supplying pandas inputs to ``make_temporal_cutoffs()``.\n", - "\n", - "The use of ``featuretools.remove_low_information_features()`` cannot currently be used with a Spark feature matrix.\n", - "\n", - "When manually defining a ``Feature``, the ``use_previous`` parameter cannot be used if this feature will be applied to calculate a feature matrix from a Spark ``EntitySet``." 
- ] - } - ], - "metadata": { - "celltoolbar": "Raw Cell Format", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/install.md b/docs/source/install.md index 9ef8e1d076..a3f50a9118 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -20,10 +20,6 @@ $ conda install -c conda-forge featuretools Featuretools allows users to install add-ons individually or all at once: -```{hint} -Be sure to install [Scala and Spark](#scala-and-spark) if you want to use Spark -``` - ````{tab} PyPI ```{tab} All Add-ons ```console @@ -33,66 +29,34 @@ $ python -m pip install "featuretools[complete]" ```console $ python -m pip install "featuretools[dask]" ``` -```{tab} Spark -```console -$ python -m pip install "featuretools[spark]" -``` -```{tab} TSFresh Primitives -```console -$ python -m pip install "featuretools[tsfresh]" -``` -```{tab} AutoNormalize -```console -$ python -m pip install "featuretools[autonormalize]" -``` -```{tab} Update Checker -```console -$ python -m pip install "featuretools[updater]" -``` -```{tab} SQL +```{tab} NLP Primitives ```console -$ python -m pip install "featuretools[sql]" +$ python -m pip install "featuretools[nlp]" ``` -```{tab} scikit-learn Transformer +```{tab} Premium Primitives ```console -$ python -m pip install "featuretools[sklearn]" +$ python -m pip install "featuretools[premium]" ``` + ```` ````{tab} Conda ```{tab} All Add-ons ```console -$ conda install -c conda-forge nlp-primitives featuretools-tsfresh-primitives pyspark alteryx-open-src-update-checker +$ conda install -c conda-forge nlp-primitives dask distributed ``` -```{tab} TSFresh Primitives +```{tab} NLP Primitives ```console -$ conda install -c conda-forge featuretools-tsfresh-primitives +$ conda install -c conda-forge nlp-primitives ``` ```{tab} Dask ```console $ conda install -c conda-forge dask distributed ``` -```{tab} Spark -```console -$ conda install -c conda-forge pyspark -``` -```{tab} SQL -```console -$ conda install -c conda-forge featuretools_sql -``` -```{tab} Update Checker -```console -$ conda install -c conda-forge alteryx-open-src-update-checker -``` ```` - **NLP Primitives**: Use Natural Language Processing Primitives in Featuretools -- **TSFresh Primitives**: Use 60+ primitives from [tsfresh](https://tsfresh.readthedocs.io/en/latest/) in Featuretools -- **Dask**: Use Woodwork with Dask DataFrames and run `calculate_feature_matrix` in parallel with `n_jobs` -- **Spark**: Use Woodwork with Spark DataFrames -- **AutoNormalize**: Automated creation of normalized `EntitySet` from denormalized data -- **Update Checker**: Receive automatic notifications of new Featuretools releases -- **SQL**: Automated `EntitySet` creation from relational data stored in a SQL database -- **scikit-learn Transformer**: Featuretools' DFS as a scikit-learn transformer +- **Premium Primitives**: Use primitives from Premium Primitives in Featuretools +- **Dask**: Use to run `calculate_feature_matrix` in parallel with `n_jobs` ## Installing Graphviz @@ -142,54 +106,12 @@ If you installed graphviz for **Windows** with `pip`, install graphviz.exe from To install Featuretools from source, clone the repository from 
[GitHub](https://github.com/alteryx/featuretools), and install the dependencies. -```{hint} -Be sure to install [Scala and Spark](#scala-and-spark) if you want to run all unit tests -``` - ```bash git clone https://github.com/alteryx/featuretools.git cd featuretools python -m pip install . ``` -## Scala and Spark - -````{tab} macOS (Intel) -:new-set: -```console -$ brew tap AdoptOpenJDK/openjdk -$ brew install --cask adoptopenjdk11 -$ brew install scala apache-spark -$ echo 'export JAVA_HOME=$(/usr/libexec/java_home)' >> ~/.zshrc -$ echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc -``` -```` - -````{tab} macOS (M1) -```console -$ brew install openjdk@11 scala apache-spark graphviz -$ echo 'export PATH="/opt/homebrew/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc -$ echo 'export CPPFLAGS="-I/opt/homebrew/opt/openjdk@11/include:$CPPFLAGS"' >> ~/.zprofile -$ sudo ln -sfn /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk /Library/Java/JavaVirtualMachines/openjdk-11.jdk -``` -```` - -````{tab} Ubuntu -```console -$ sudo apt install openjdk-11-jre openjdk-11-jdk scala -y -$ echo "export SPARK_HOME=/opt/spark" >> ~/.profile -$ echo "export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin" >> ~/.profile -$ echo "export PYSPARK_PYTHON=/usr/bin/python3" >> ~/.profile -``` -```` - -````{tab} Amazon Linux -```console -$ sudo amazon-linux-extras install java-openjdk11 scala -y -$ amazon-linux-extras enable java-openjdk11 -``` -```` - ## Docker It is also possible to run Featuretools inside a Docker container. diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 20c648195d..ddaf5aecc6 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -8,14 +8,21 @@ Future Release * Enhancements * Fixes * Changes - * Temporarily restrict Dask version :pr:`2694` + * Temporarily restrict Dask version (:pr:`2694`) + * Remove support for creating ``EntitySets`` from Dask or Pyspark dataframes (:pr:`2705`) * Documentation Changes * Testing Changes - * Fix serialization test to work with pytest 8.1.1 :pr:`2694` + * Fix serialization test to work with pytest 8.1.1 (:pr:`2694`) Thanks to the following people for contributing to this release: :user:`thehomebrewnerd` +Breaking Changes +++++++++++++++++ +* With this release of Featuretools, EntitySets can no longer be created from Dask or Pyspark dataframes. The behavior when using pandas + dataframes to create EntitySets remains unchanged. + + v1.30.0 Feb 26, 2024 ==================== * Changes diff --git a/docs/source/resources/frequently_asked_questions.ipynb b/docs/source/resources/frequently_asked_questions.ipynb index 7ecfdc0ad2..1772a386aa 100644 --- a/docs/source/resources/frequently_asked_questions.ipynb +++ b/docs/source/resources/frequently_asked_questions.ipynb @@ -455,21 +455,6 @@ "feature_matrix[[\"COUNT(sessions WHERE product_id_device = 5 and tablet)\"]]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Can I create an `EntitySet` using Dask or Spark dataframes? (BETA)\n", - "\n", - "Support for Dask EntitySets and Spark EntitySets is still in Beta - if you encounter any errors using either of these approaches, please let us know by creating a [new issue on Github](https://github.com/alteryx/featuretools/issues).\n", - "\n", - "Yes! Featuretools supports creating an `EntitySet` from Dask dataframes or from Spark dataframes. 
You can simply follow the same process you would when creating an `EntitySet` from pandas dataframes.\n", - "\n", - "There are some limitations to be aware of when using Dask or Spark dataframes. When creating a `DataFrame`, type inference can significantly slow down the runtime compared to pandas DataFrames, so users are encouraged to specify logical types for all columns during creation. Also, other quality checks are not performed, such as checking for unique index values. An `EntitySet` must be created entirely of one type of DataFrame (Dask, Spark, or pandas) - you cannot mix pandas DataFrames, Dask DataFrames, and Spark DataFrames with each other in the same `EntitySet`.\n", - "\n", - "For more information on creating an `EntitySet` from Dask dataframes or from Spark dataframes, see the [Using Dask EntitySets](../guides/using_dask_entitysets.rst) and the [Using Spark EntitySets](../guides/using_spark_entitysets.rst) guides." - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -1637,7 +1622,7 @@ "source": [ "### How do I get a list of all Aggregation and Transform primitives?\n", "\n", - "You can do `featuretools.list_primitives()` to get all the primitive in Featuretools. It will return a DataFrame with the names, type, and description of the primitives, and if the primitive can be used with entitysets created from Dask dataframes." + "You can call `featuretools.list_primitives()` to get all the primitives in Featuretools. It will return a DataFrame with the name, type, and description of each primitive." ] }, { @@ -1659,19 +1644,6 @@ "df_primitives.tail()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### What primitives can I use when creating a feature matrix from a Dask `EntitySet`? (BETA)\n", - "\n", - "Support for Dask EntitySets is still in Beta - if you encounter any errors using this approach, please let us know by creating a [new issue on Github](https://github.com/alteryx/featuretools/issues).\n", - "\n", - "When creating a feature matrix from a Dask `EntitySet`, only certain primitives can be used. Computation of certain features is quite expensive in a distributed environment, and as a result only a subset of Featuretools primitives are currently supported when using a Dask `EntitySet`.\n", - "\n", - "The table returned by `featuretools.list_primitives()` will contain a column labeled `dask_compatible`. Any primitive that has a value of `True` in this column can be used safely when computing a feature matrix from a Dask `EntitySet`."
- ] - }, { "cell_type": "markdown", "metadata": {}, @@ -2031,10 +2003,9 @@ "\n", "/usr/local/Caskroom/miniconda/base/envs/featuretools/lib/python3.8/site-packages/woodwork/table_accessor.py in _check_index(dataframe, index)\n", " 903 # User specifies an index that is in the dataframe but not unique\n", - " 904 # Does not check for Dask as Dask does not support is_unique\n", - "--> 905 raise IndexError('Index column must be unique')\n", + "--> 904 raise IndexError('Index column must be unique')\n", + " 905 \n", " 906 \n", - " 907 \n", "\n", "IndexError: Index column must be unique\n", "```" diff --git a/docs/source/resources/transition_to_ft_v1.0.ipynb b/docs/source/resources/transition_to_ft_v1.0.ipynb index 26097d4233..e4a275da83 100644 --- a/docs/source/resources/transition_to_ft_v1.0.ipynb +++ b/docs/source/resources/transition_to_ft_v1.0.ipynb @@ -86,7 +86,7 @@ "\n", "### Adding dataframes to an EntitySet\n", "\n", - "When adding dataframes to an EntitySet, users can pass in a Woodwork dataframe or a regular dataframe without Woodwork typing information. As before, Featuretools supports creating EntitySets from pandas, Dask and Spark dataframes. If users supply a dataframe that has Woodwork typing information initialized, Featuretools will simply use this typing information directly. If users supply a dataframe without Woodwork initialized, Featuretools will initialize Woodwork on the dataframe, performing type inference for any column that does not have typing information specified.\n", + "When adding dataframes to an EntitySet, users can pass in a Woodwork dataframe or a regular dataframe without Woodwork typing information. If users supply a dataframe that has Woodwork typing information initialized, Featuretools will simply use this typing information directly. If users supply a dataframe without Woodwork initialized, Featuretools will initialize Woodwork on the dataframe, performing type inference for any column that does not have typing information specified.\n", "\n", "Below are some examples to illustrate this process. First we will create two small dataframes to use for the example." 
] diff --git a/featuretools/computational_backends/calculate_feature_matrix.py b/featuretools/computational_backends/calculate_feature_matrix.py index c07aea7c51..d91943d37e 100644 --- a/featuretools/computational_backends/calculate_feature_matrix.py +++ b/featuretools/computational_backends/calculate_feature_matrix.py @@ -36,16 +36,10 @@ from featuretools.feature_base import AggregationFeature, FeatureBase from featuretools.utils import Trie from featuretools.utils.gen_utils import ( - Library, - import_or_none, import_or_raise, - is_instance, make_tqdm_iterator, ) -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - logger = logging.getLogger("featuretools.computational_backend") PBAR_FORMAT = "Elapsed: {elapsed} | Progress: {l_bar}{bar}" @@ -173,14 +167,6 @@ def calculate_feature_matrix( else: raise TypeError("No dataframes or valid EntitySet provided") - if entityset.dataframe_type == Library.DASK: - if approximate: - msg = "Using approximate is not supported with Dask dataframes" - raise ValueError(msg) - if training_window: - msg = "Using training_window is not supported with Dask dataframes" - raise ValueError(msg) - target_dataframe = entityset[features[0].dataframe_name] cutoff_time = _validate_cutoff_time(cutoff_time, target_dataframe) @@ -217,11 +203,6 @@ def calculate_feature_matrix( ) instance_ids = df[index_col] - if is_instance(instance_ids, dd, "Series"): - instance_ids = instance_ids.compute() - elif is_instance(instance_ids, ps, "Series"): - instance_ids = instance_ids.to_pandas() - # convert list or range object into series if not isinstance(instance_ids, pd.Series): instance_ids = pd.Series(instance_ids) @@ -351,23 +332,22 @@ def calculate_feature_matrix( ) # ensure rows are sorted by input order - if isinstance(feature_matrix, pd.DataFrame): - if isinstance(cutoff_time, pd.DataFrame): - feature_matrix = feature_matrix.ww.reindex( - pd.MultiIndex.from_frame( - cutoff_time[["instance_id", "time"]], - names=feature_matrix.index.names, - ), - ) - else: - # Maintain index dtype - index_dtype = feature_matrix.index.get_level_values(0).dtype - feature_matrix = feature_matrix.ww.reindex( - cutoff_time[1].astype(index_dtype), - level=0, - ) - if not cutoff_time_in_index: - feature_matrix.ww.reset_index(level="time", drop=True, inplace=True) + if isinstance(cutoff_time, pd.DataFrame): + feature_matrix = feature_matrix.ww.reindex( + pd.MultiIndex.from_frame( + cutoff_time[["instance_id", "time"]], + names=feature_matrix.index.names, + ), + ) + else: + # Maintain index dtype + index_dtype = feature_matrix.index.get_level_values(0).dtype + feature_matrix = feature_matrix.ww.reindex( + cutoff_time[1].astype(index_dtype), + level=0, + ) + if not cutoff_time_in_index: + feature_matrix.ww.reset_index(level="time", drop=True, inplace=True) if save_progress and os.path.exists(os.path.join(save_progress, "temp")): shutil.rmtree(os.path.join(save_progress, "temp")) @@ -447,9 +427,8 @@ def update_progress_callback(done): progress_callback=update_progress_callback, include_cutoff_time=include_cutoff_time, ) - if isinstance(_feature_matrix, pd.DataFrame): - time_index = pd.Index([time_last] * len(ids), name="time") - _feature_matrix = _feature_matrix.set_index(time_index, append=True) + time_index = pd.Index([time_last] * len(ids), name="time") + _feature_matrix = _feature_matrix.set_index(time_index, append=True) feature_matrix.append(_feature_matrix) else: @@ -541,10 +520,7 @@ def update_progress_callback(done): include_cutoff_time=include_cutoff_time, ) - if 
is_instance(_feature_matrix, (dd, ps), "DataFrame"): - id_name = _feature_matrix.columns[-1] - else: - id_name = _feature_matrix.index.name + id_name = _feature_matrix.index.name # if approximate, merge feature matrix with group frame to get original # cutoff times and passed columns @@ -577,43 +553,16 @@ def update_progress_callback(done): }, inplace=True, ) - if isinstance(_feature_matrix, pd.DataFrame): - time_index = pd.Index([time_last] * num_rows, name="time") - _feature_matrix = _feature_matrix.set_index( - time_index, - append=True, - ) - if len(pass_columns) > 0: - pass_through.set_index([id_name, "time"], inplace=True) - for col in pass_columns: - _feature_matrix[col] = pass_through[col] - elif is_instance(_feature_matrix, dd, "DataFrame") and ( - len(pass_columns) > 0 - ): - _feature_matrix["time"] = time_last - for col in pass_columns: - pass_df = dd.from_pandas( - pass_through[[id_name, "time", col]], - npartitions=_feature_matrix.npartitions, - ) - _feature_matrix = _feature_matrix.merge( - pass_df, - how="outer", - ) - _feature_matrix = _feature_matrix.drop(columns=["time"]) - elif is_instance(_feature_matrix, ps, "DataFrame") and ( - len(pass_columns) > 0 - ): - _feature_matrix["time"] = time_last + + time_index = pd.Index([time_last] * num_rows, name="time") + _feature_matrix = _feature_matrix.set_index( + time_index, + append=True, + ) + if len(pass_columns) > 0: + pass_through.set_index([id_name, "time"], inplace=True) for col in pass_columns: - pass_df = ps.from_pandas( - pass_through[[id_name, "time", col]], - ) - _feature_matrix = _feature_matrix.merge( - pass_df, - how="outer", - ) - _feature_matrix = _feature_matrix.drop(columns=["time"]) + _feature_matrix[col] = pass_through[col] feature_matrix.append(_feature_matrix) ww_init_kwargs = get_ww_types_from_features( @@ -968,26 +917,15 @@ def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs): for fm in feature_matrix: updated_cols = set() for col in cols_to_check: - # Only convert types for pandas if null values are present - # Always convert for Dask/Spark to avoid pulling data into memory for null check - is_pandas_df_with_null = ( - isinstance(fm, pd.DataFrame) and fm[col].isnull().any() - ) - is_dask_df = is_instance(fm, dd, "DataFrame") - is_spark_df = is_instance(fm, ps, "DataFrame") - if is_pandas_df_with_null or is_dask_df or is_spark_df: + # Only convert types if null values are present + if fm[col].isnull().any(): current_type = ww_init_kwargs["logical_types"][col].type_string ww_init_kwargs["logical_types"][col] = replacement_type[current_type] updated_cols.add(col) cols_to_check = cols_to_check - updated_cols fm.ww.init(**ww_init_kwargs) - if any(is_instance(fm, dd, "DataFrame") for fm in feature_matrix): - feature_matrix = dd.concat(feature_matrix) - elif any(is_instance(fm, ps, "DataFrame") for fm in feature_matrix): - feature_matrix = ps.concat(feature_matrix) - else: - feature_matrix = pd.concat(feature_matrix) + feature_matrix = pd.concat(feature_matrix) feature_matrix.ww.init(**ww_init_kwargs) return feature_matrix diff --git a/featuretools/computational_backends/feature_set_calculator.py b/featuretools/computational_backends/feature_set_calculator.py index 2b509ccb25..02913d5343 100644 --- a/featuretools/computational_backends/feature_set_calculator.py +++ b/featuretools/computational_backends/feature_set_calculator.py @@ -15,15 +15,7 @@ TransformFeature, ) from featuretools.utils import Trie -from featuretools.utils.gen_utils import ( - Library, - get_relationship_column_id, - import_or_none, - 
is_instance, -) - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") +from featuretools.utils.gen_utils import get_relationship_column_id class FeatureSetCalculator(object): @@ -134,43 +126,38 @@ def progress_callback(*args): # df_trie. df = df_trie.value - # Fill in empty rows with default values. This only works for pandas dataframes - # and is not currently supported for Dask dataframes. - if isinstance(df, pd.DataFrame): - index_dtype = df.index.dtype.name - if df.empty: - return self.generate_default_df(instance_ids=instance_ids) - - missing_ids = [ - i for i in instance_ids if i not in df[target_dataframe.ww.index] - ] - if missing_ids: - default_df = self.generate_default_df( - instance_ids=missing_ids, - extra_columns=df.columns, - ) + # Fill in empty rows with default values. + index_dtype = df.index.dtype.name + if df.empty: + return self.generate_default_df(instance_ids=instance_ids) + + missing_ids = [ + i for i in instance_ids if i not in df[target_dataframe.ww.index] + ] + if missing_ids: + default_df = self.generate_default_df( + instance_ids=missing_ids, + extra_columns=df.columns, + ) - df = pd.concat([df, default_df], sort=True) + df = pd.concat([df, default_df], sort=True) - df.index.name = self.entityset[self.feature_set.target_df_name].ww.index + df.index.name = self.entityset[self.feature_set.target_df_name].ww.index - # Order by instance_ids - unique_instance_ids = pd.unique(instance_ids) - unique_instance_ids = unique_instance_ids.astype(instance_ids.dtype) - df = df.reindex(unique_instance_ids) + # Order by instance_ids + unique_instance_ids = pd.unique(instance_ids) + unique_instance_ids = unique_instance_ids.astype(instance_ids.dtype) + df = df.reindex(unique_instance_ids) - # Keep categorical index if original index was categorical - if index_dtype == "category": - df.index = df.index.astype("category") + # Keep categorical index if original index was categorical + if index_dtype == "category": + df.index = df.index.astype("category") column_list = [] for feat in self.feature_set.target_features: column_list.extend(feat.get_feature_names()) - if is_instance(df, (dd, ps), "DataFrame"): - column_list.extend([target_dataframe.ww.index]) - return df[column_list] def _calculate_features_for_dataframe( @@ -286,9 +273,6 @@ def _calculate_features_for_dataframe( # Pass filtered values, even if we are using a full df. 
if need_full_dataframe: - if is_instance(filter_values, dd, "Series"): - msg = "Cannot use primitives that require full dataframe with Dask EntitySets" - raise ValueError(msg) filtered_df = df[df[filter_column].isin(filter_values)] else: filtered_df = df @@ -438,13 +422,11 @@ def _add_ancestor_relationship_columns( ) # ensure index is maintained - # TODO: Review for dask dataframes - if isinstance(df, pd.DataFrame): - df.set_index( - relationship.child_dataframe.ww.index, - drop=False, - inplace=True, - ) + df.set_index( + relationship.child_dataframe.ww.index, + drop=False, + inplace=True, + ) return df, new_relationship_columns @@ -502,7 +484,7 @@ def _calculate_transform_features( _df_trie, progress_callback, ): - frame_empty = frame.empty if isinstance(frame, pd.DataFrame) else False + frame_empty = frame.empty feature_values = [] for f in features: # handle when no data @@ -646,29 +628,22 @@ def _calculate_direct_features( # merge the identity feature from the parent dataframe into the child merge_df = parent_df[list(col_map.keys())].rename(columns=col_map) - if is_instance(merge_df, (dd, ps), "DataFrame"): - new_df = child_df.merge( - merge_df, - left_on=merge_col, - right_on=merge_col, - how="left", + + if index_as_feature is not None: + merge_df.set_index( + index_as_feature.get_name(), + inplace=True, + drop=False, ) else: - if index_as_feature is not None: - merge_df.set_index( - index_as_feature.get_name(), - inplace=True, - drop=False, - ) - else: - merge_df.set_index(merge_col, inplace=True) + merge_df.set_index(merge_col, inplace=True) - new_df = child_df.merge( - merge_df, - left_on=merge_col, - right_index=True, - how="left", - ) + new_df = child_df.merge( + merge_df, + left_on=merge_col, + right_index=True, + how="left", + ) progress_callback(len(features) / float(self.num_features)) @@ -678,7 +653,6 @@ def _calculate_agg_features(self, features, frame, df_trie, progress_callback): test_feature = features[0] child_dataframe = test_feature.base_features[0].dataframe base_frame = df_trie.get_node(test_feature.relationship_path).value - parent_merge_col = test_feature.relationship_path[0][1]._parent_column_name # Sometimes approximate features get computed in a previous filter frame # and put in the current one dynamically, # so there may be existing features here @@ -694,17 +668,13 @@ def _calculate_agg_features(self, features, frame, df_trie, progress_callback): return frame # handle where - base_frame_empty = ( - base_frame.empty if isinstance(base_frame, pd.DataFrame) else False - ) + base_frame_empty = base_frame.empty where = test_feature.where if where is not None and not base_frame_empty: base_frame = base_frame.loc[base_frame[where.get_name()]] # when no child data, just add all the features to frame with nan - base_frame_empty = ( - base_frame.empty if isinstance(base_frame, pd.DataFrame) else False - ) + base_frame_empty = base_frame.empty if base_frame_empty: feature_values = [] for f in features: @@ -750,12 +720,7 @@ def last_n(df): column_id = f.base_features[0].get_name() if column_id not in to_agg: to_agg[column_id] = [] - if is_instance(base_frame, dd, "DataFrame"): - func = f.get_function(agg_type=Library.DASK) - elif is_instance(base_frame, ps, "DataFrame"): - func = f.get_function(agg_type=Library.SPARK) - else: - func = f.get_function() + func = f.get_function() # for some reason, using the string count is significantly # faster than any method a primitive can return @@ -775,11 +740,6 @@ def last_n(df): func.__name__ = funcname - if dd and 
isinstance(func, dd.Aggregation): - # TODO: handle aggregation being applied to same column twice - # (see above partial wrapping of functions) - funcname = func.__name__ - to_agg[column_id].append(func) # this is used below to rename columns that pandas names for us agg_rename["{}-{}".format(column_id, funcname)] = f.get_name() @@ -818,14 +778,11 @@ def last_n(df): # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) - if is_instance(base_frame, (dd, ps), "DataFrame"): - to_merge = base_frame.groupby(groupby_col).agg(to_agg) - else: - to_merge = base_frame.groupby( - base_frame[groupby_col], - observed=True, - sort=False, - ).agg(to_agg) + to_merge = base_frame.groupby( + base_frame[groupby_col], + observed=True, + sort=False, + ).agg(to_agg) # rename columns to the correct feature names to_merge.columns = [agg_rename["-".join(x)] for x in to_merge.columns] to_merge = to_merge[list(agg_rename.values())] @@ -841,21 +798,13 @@ def last_n(df): ) to_merge.index = to_merge.index.astype(object).astype(categories) - if is_instance(frame, (dd, ps), "DataFrame"): - frame = frame.merge( - to_merge, - left_on=parent_merge_col, - right_index=True, - how="left", - ) - else: - frame = pd.merge( - left=frame, - right=to_merge, - left_index=True, - right_index=True, - how="left", - ) + frame = pd.merge( + left=frame, + right=to_merge, + left_index=True, + right_index=True, + how="left", + ) # determine number of features that were just merged progress_callback(len(to_merge.columns) / float(self.num_features)) @@ -942,18 +891,7 @@ def update_feature_columns(feature_data, data): data.update(new_cols) return data - # Handle pandas input - if isinstance(data, pd.DataFrame): - return pd.concat([data, pd.DataFrame(new_cols, index=data.index)], axis=1) - - # Handle dask/spark input - for name, col in new_cols.items(): - col.name = name - if is_instance(data, dd, "DataFrame"): - data = dd.concat([data, col], axis=1) - else: - data = ps.concat([data, col], axis=1) - return data + return pd.concat([data, pd.DataFrame(new_cols, index=data.index)], axis=1) def strip_values_if_series(values): diff --git a/featuretools/computational_backends/utils.py b/featuretools/computational_backends/utils.py index 773320188e..2efc359036 100644 --- a/featuretools/computational_backends/utils.py +++ b/featuretools/computational_backends/utils.py @@ -13,10 +13,9 @@ from featuretools.entityset.relationship import RelationshipPath from featuretools.feature_base import AggregationFeature, DirectFeature from featuretools.utils import Trie -from featuretools.utils.gen_utils import Library, import_or_none, is_instance +from featuretools.utils.gen_utils import import_or_none from featuretools.utils.wrangle import _check_time_type, _check_timedelta -dd = import_or_none("dask.dataframe") logger = logging.getLogger("featuretools.computational_backend") @@ -220,10 +219,7 @@ def get_client_cluster(): return Client, LocalCluster -if dd: - CutoffTimeType = typing.Union[dd.DataFrame, pd.DataFrame, str, datetime] -else: - CutoffTimeType = typing.Union[pd.DataFrame, str, datetime] +CutoffTimeType = typing.Union[pd.DataFrame, str, datetime] def _validate_cutoff_time( @@ -234,14 +230,6 @@ def _validate_cutoff_time( Verify that the cutoff time is a single value or a pandas dataframe with the proper columns containing no duplicate rows """ - if is_instance(cutoff_time, dd, "DataFrame"): - msg = ( - "cutoff_time should be a Pandas DataFrame: " - "computing cutoff_time, 
this may take a while" - ) - warnings.warn(msg) - cutoff_time = cutoff_time.compute() - if isinstance(cutoff_time, pd.DataFrame): cutoff_time = cutoff_time.reset_index(drop=True) @@ -397,15 +385,6 @@ def get_ww_types_from_features( semantic_tags[column] = cutoff_schema.semantic_tags[column] origins[column] = "base" - if entityset.dataframe_type in (Library.DASK, Library.SPARK): - target_dataframe_name = features[0].dataframe_name - table_schema = entityset[target_dataframe_name].ww.schema - index_col = table_schema.index - logical_types[index_col] = table_schema.logical_types[index_col] - semantic_tags[index_col] = table_schema.semantic_tags[index_col] - semantic_tags[index_col] -= {"index"} - origins[index_col] = "base" - ww_init = { "logical_types": logical_types, "semantic_tags": semantic_tags, diff --git a/featuretools/entityset/deserialize.py b/featuretools/entityset/deserialize.py index d88dff0b7b..fbcd6c6ef8 100644 --- a/featuretools/entityset/deserialize.py +++ b/featuretools/entityset/deserialize.py @@ -8,14 +8,10 @@ from woodwork.deserialize import read_woodwork_table from featuretools.entityset.relationship import Relationship -from featuretools.utils.gen_utils import Library, import_or_none from featuretools.utils.s3_utils import get_transport_params, use_smartopen_es from featuretools.utils.schema_utils import check_schema_version from featuretools.utils.wrangle import _is_local_tar, _is_s3, _is_url -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - def description_to_entityset(description, **kwargs): """Deserialize entityset from data description. @@ -45,7 +41,7 @@ def description_to_entityset(description, **kwargs): kwargs["filename"] = df["name"] + ".parquet" dataframe = read_woodwork_table(data_path, validate=False, **kwargs) else: - dataframe = empty_dataframe(df, description["data_type"]) + dataframe = empty_dataframe(df) entityset.add_dataframe(dataframe) @@ -56,7 +52,7 @@ def description_to_entityset(description, **kwargs): return entityset -def empty_dataframe(description, data_type=Library.PANDAS): +def empty_dataframe(description): """Deserialize empty dataframe from dataframe description. 
Args: @@ -105,10 +101,6 @@ def empty_dataframe(description, data_type=Library.PANDAS): category_dtypes[col_name] = cat_object dataframe = pd.DataFrame(columns=columns).astype(category_dtypes) - if data_type == Library.DASK: - dataframe = dd.from_pandas(dataframe, npartitions=1) - elif data_type == Library.SPARK: - dataframe = ps.from_pandas(dataframe) dataframe.ww.init( name=description.get("name"), diff --git a/featuretools/entityset/entityset.py b/featuretools/entityset/entityset.py index 68c8d49bcb..9984725264 100644 --- a/featuretools/entityset/entityset.py +++ b/featuretools/entityset/entityset.py @@ -11,7 +11,6 @@ from featuretools.entityset import deserialize, serialize from featuretools.entityset.relationship import Relationship, RelationshipPath from featuretools.feature_base.feature_base import _ES_REF -from featuretools.utils.gen_utils import Library, import_or_none, is_instance from featuretools.utils.plot_utils import ( check_graphviz, get_graphviz_format, @@ -19,9 +18,6 @@ ) from featuretools.utils.wrangle import _check_timedelta -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - pd.options.mode.chained_assignment = None # default='warn' logger = logging.getLogger("featuretools.entityset") @@ -183,21 +179,6 @@ def __deepcopy__(self, memo): def dataframes(self): return list(self.dataframe_dict.values()) - @property - def dataframe_type(self): - """String specifying the library used for the dataframes. Null if no dataframes""" - df_type = None - - if self.dataframes: - if isinstance(self.dataframes[0], pd.DataFrame): - df_type = Library.PANDAS - elif is_instance(self.dataframes[0], dd, "DataFrame"): - df_type = Library.DASK - elif is_instance(self.dataframes[0], ps, "DataFrame"): - df_type = Library.SPARK - - return df_type - @property def metadata(self): """Returns the metadata for this EntitySet. The metadata will be recomputed if it does not exist.""" @@ -271,8 +252,6 @@ def to_csv( compression (str) : Name of the compression to use. Possible values are: {'gzip', 'bz2', 'zip', 'xz', None}. profile_name (str) : Name of AWS profile to use, False to use an anonymous profile, or None. """ - if self.dataframe_type == Library.SPARK: - compression = str(compression) serialize.write_data_description( self, path, @@ -397,7 +376,7 @@ def add_relationship( # default to object dtypes for categorical columns, but # indexes/foreign keys default to ints. In this case, we convert # the empty column's type to int - if isinstance(child_df, pd.DataFrame) and ( + if ( child_df.empty and child_df[child_column].dtype == object and parent_df.ww.columns[parent_column].is_numeric @@ -698,15 +677,6 @@ def add_dataframe( "Cannot add dataframe to EntitySet without a name. " "Please provide a value for the dataframe_name parameter.", ) - # Warn when performing inference on Dask or Spark DataFrames - if not set(dataframe.columns).issubset(set(logical_types.keys())) and ( - is_instance(dataframe, dd, "DataFrame") - or is_instance(dataframe, ps, "DataFrame") - ): - warnings.warn( - "Performing type inference on Dask or Spark DataFrames may be computationally intensive. 
" - "Specify logical types for each column to speed up EntitySet initialization.", - ) index_was_created, index, dataframe = _get_or_create_index( index, @@ -986,9 +956,6 @@ def normalize_dataframe( ti_cols = [c if c != old_ti_name else secondary_time_index for c in ti_cols] make_secondary_time_index = {secondary_time_index: ti_cols} - if is_instance(new_dataframe, ps, "DataFrame"): - already_sorted = False - # will initialize Woodwork on this DataFrame logical_types = {} semantic_tags = {} @@ -1052,17 +1019,11 @@ def concat(self, other, inplace=False): else: combined_es = copy.deepcopy(self) - lib = pd - if self.dataframe_type == Library.SPARK: - lib = ps - elif self.dataframe_type == Library.DASK: - lib = dd - has_last_time_index = [] for df in self.dataframes: self_df = df other_df = other[df.ww.name] - combined_df = lib.concat([self_df, other_df]) + combined_df = pd.concat([self_df, other_df]) # If both DataFrames have made indexes, there will likely # be overlap in the index column, so we use the other values if self_df.ww.metadata.get("created_index") or other_df.ww.metadata.get( @@ -1161,21 +1122,11 @@ def add_last_time_indexes(self, updated_dataframes=None): if es_lti_dict[dataframe.ww.name] is None: if dataframe.ww.time_index is not None: lti = dataframe[dataframe.ww.time_index].copy() - if is_instance(dataframe, dd, "DataFrame"): - # The current Dask implementation doesn't set the index of the dataframe - # to the dataframe's index, so we have to do it manually here - lti.index = dataframe[dataframe.ww.index].copy() else: lti = dataframe.ww[dataframe.ww.index].copy() - if is_instance(dataframe, dd, "DataFrame"): - lti.index = dataframe[dataframe.ww.index].copy() - lti = lti.apply(lambda x: None) - elif is_instance(dataframe, ps, "DataFrame"): - lti = ps.Series(pd.Series(index=lti.to_list(), name=lti.name)) - else: - # Cannot have a category dtype with nans when calculating last time index - lti = lti.astype("object") - lti[:] = None + # Cannot have a category dtype with nans when calculating last time index + lti = lti.astype("object") + lti[:] = None es_lti_dict[dataframe.ww.name] = lti @@ -1201,99 +1152,51 @@ def add_last_time_indexes(self, updated_dataframes=None): # updated last time from all children for child_df in child_dataframes: - # TODO: Figure out if Dask code related to indexes is important for Spark if es_lti_dict[child_df.ww.name] is None: continue link_col = child_cols[dataframe.ww.name][child_df.ww.name].name - lti_is_dask = is_instance( - es_lti_dict[child_df.ww.name], - dd, - "Series", - ) - lti_is_spark = is_instance( - es_lti_dict[child_df.ww.name], - ps, - "Series", + lti_df = pd.DataFrame( + { + "last_time": es_lti_dict[child_df.ww.name], + dataframe.ww.index: child_df[link_col], + }, ) - if lti_is_dask or lti_is_spark: - to_join = child_df[link_col] - if lti_is_dask: - to_join.index = child_df[child_df.ww.index] - - lti_df = ( - es_lti_dict[child_df.ww.name] - .to_frame(name="last_time") - .join(to_join.to_frame(name=dataframe.ww.index)) - ) - - if lti_is_dask: - new_index = lti_df.index.copy() - new_index.name = None - lti_df.index = new_index - lti_df = lti_df.groupby(lti_df[dataframe.ww.index]).agg("max") + # sort by time and keep only the most recent + lti_df.sort_values( + ["last_time", dataframe.ww.index], + kind="mergesort", + inplace=True, + ) - lti_df = ( - es_lti_dict[dataframe.ww.name] - .to_frame(name="last_time_old") - .join(lti_df) - ) + lti_df.drop_duplicates( + dataframe.ww.index, + keep="last", + inplace=True, + ) + 
lti_df.set_index(dataframe.ww.index, inplace=True) + lti_df = lti_df.reindex(es_lti_dict[dataframe.ww.name].index) + lti_df["last_time_old"] = es_lti_dict[dataframe.ww.name] + if lti_df.empty: + # Pandas errors out if it tries to do fillna and then max on an empty dataframe + lti_df = pd.Series([], dtype="object") else: - lti_df = pd.DataFrame( - { - "last_time": es_lti_dict[child_df.ww.name], - dataframe.ww.index: child_df[link_col], - }, + lti_df["last_time"] = lti_df["last_time"].astype( + "datetime64[ns]", ) - - # sort by time and keep only the most recent - lti_df.sort_values( - ["last_time", dataframe.ww.index], - kind="mergesort", - inplace=True, + lti_df["last_time_old"] = lti_df["last_time_old"].astype( + "datetime64[ns]", ) - - lti_df.drop_duplicates( - dataframe.ww.index, - keep="last", - inplace=True, + lti_df = lti_df.fillna( + pd.to_datetime("1800-01-01 00:00"), + ).max(axis=1) + lti_df = lti_df.replace( + pd.to_datetime("1800-01-01 00:00"), + pd.NaT, ) - lti_df.set_index(dataframe.ww.index, inplace=True) - lti_df = lti_df.reindex(es_lti_dict[dataframe.ww.name].index) - lti_df["last_time_old"] = es_lti_dict[dataframe.ww.name] - if not (lti_is_dask or lti_is_spark) and lti_df.empty: - # Pandas errors out if it tries to do fillna and then max on an empty dataframe - lti_df = pd.Series([], dtype="object") - else: - if lti_is_spark: - # TODO: Figure out a workaround for fillna and replace - if lti_df["last_time_old"].dtype != "datetime64[ns]": - lti_df["last_time_old"] = ps.to_datetime( - lti_df["last_time_old"], - ) - if lti_df["last_time"].dtype != "datetime64[ns]": - lti_df["last_time"] = ps.to_datetime( - lti_df["last_time"], - ) - lti_df = lti_df.max(axis=1) - else: - lti_df["last_time"] = lti_df["last_time"].astype( - "datetime64[ns]", - ) - lti_df["last_time_old"] = lti_df["last_time_old"].astype( - "datetime64[ns]", - ) - lti_df = lti_df.fillna( - pd.to_datetime("1800-01-01 00:00"), - ).max(axis=1) - lti_df = lti_df.replace( - pd.to_datetime("1800-01-01 00:00"), - pd.NaT, - ) - es_lti_dict[dataframe.ww.name] = lti_df es_lti_dict[dataframe.ww.name].name = "last_time" @@ -1304,16 +1207,13 @@ def add_last_time_indexes(self, updated_dataframes=None): for df in self.dataframes: lti = es_lti_dict[df.ww.name] if lti is not None: - lti_ltype = None if self.time_type == "numeric": if lti.dtype == "datetime64[ns]": # Woodwork cannot convert from datetime to numeric lti = lti.apply(lambda x: x.value) lti = init_series(lti, logical_type="Double") - lti_ltype = "Double" else: lti = init_series(lti, logical_type="Datetime") - lti_ltype = "Datetime" lti.name = LTI_COLUMN_NAME @@ -1328,30 +1228,10 @@ def add_last_time_indexes(self, updated_dataframes=None): ) # Add the new column to the DataFrame - if is_instance(df, dd, "DataFrame"): - new_df = df.merge(lti.reset_index(), on=df.ww.index) - new_df.ww.init_with_partial_schema( - schema=df.ww.schema, - logical_types={LTI_COLUMN_NAME: lti_ltype}, - ) - - new_idx = new_df[new_df.ww.index] - new_idx.name = None - new_df.index = new_idx - dfs_to_update[df.ww.name] = new_df - elif is_instance(df, ps, "DataFrame"): - new_df = df.merge(lti, left_on=df.ww.index, right_index=True) - new_df.ww.init_with_partial_schema( - schema=df.ww.schema, - logical_types={LTI_COLUMN_NAME: lti_ltype}, - ) - - dfs_to_update[df.ww.name] = new_df - else: - df.ww[LTI_COLUMN_NAME] = lti - if "last_time_index" not in df.ww.semantic_tags[LTI_COLUMN_NAME]: - df.ww.add_semantic_tags({LTI_COLUMN_NAME: "last_time_index"}) - df.ww.metadata["last_time_index"] = LTI_COLUMN_NAME 
+ df.ww[LTI_COLUMN_NAME] = lti + if "last_time_index" not in df.ww.semantic_tags[LTI_COLUMN_NAME]: + df.ww.add_semantic_tags({LTI_COLUMN_NAME: "last_time_index"}) + df.ww.metadata["last_time_index"] = LTI_COLUMN_NAME for df in dfs_to_update.values(): df.ww.add_semantic_tags({LTI_COLUMN_NAME: "last_time_index"}) @@ -1403,12 +1283,6 @@ def add_interesting_values( If not specified, interesting values will be set for all eligible columns. If values are specified, max_values and verbose parameters will be ignored. - Notes: - - Finding interesting values is not supported with Dask or Spark EntitySets. - To set interesting values for Dask or Spark EntitySets, values must be - specified with the ``values`` parameter. - Returns: None @@ -1503,16 +1377,13 @@ def plot(self, to_file=None): column_typing_info.append(col_string) columns_string = "\l".join(column_typing_info) # noqa: W605 - if is_instance(df, dd, "DataFrame"): # dataframe is a dask dataframe - label = "{%s |%s\l}" % (df.ww.name, columns_string) # noqa: W605 - else: - nrows = df.shape[0] - label = "{%s (%d row%s)|%s\l}" % ( # noqa: W605 - df.ww.name, - nrows, - "s" * (nrows > 1), - columns_string, - ) + nrows = df.shape[0] + label = "{%s (%d row%s)|%s\l}" % ( # noqa: W605 + df.ww.name, + nrows, + "s" * (nrows > 1), + columns_string, + ) graph.node(df.ww.name, shape="record", label=label) # Draw relationships @@ -1548,10 +1419,8 @@ def _handle_time( """ schema = self[dataframe_name].ww.schema - if is_instance(df, ps, "DataFrame") and isinstance(time_last, np.datetime64): - time_last = pd.to_datetime(time_last) if schema.time_index: - df_empty = df.empty if isinstance(df, pd.DataFrame) else False + df_empty = df.empty if time_last is not None and not df_empty: if include_cutoff_time: df = df[df[schema.time_index] <= time_last] @@ -1581,16 +1450,9 @@ def _handle_time( secondary_time_indexes = schema.metadata.get("secondary_time_index") or {} for secondary_time_index, columns in secondary_time_indexes.items(): # should we use ignore time last here? - df_empty = df.empty if isinstance(df, pd.DataFrame) else False - if time_last is not None and not df_empty: + if time_last is not None and not df.empty: mask = df[secondary_time_index] >= time_last - if is_instance(df, dd, "DataFrame"): - for col in columns: - df[col] = df[col].mask(mask, np.nan) - elif is_instance(df, ps, "DataFrame"): - df.loc[mask, columns] = None - else: - df.loc[mask, columns] = np.nan + df.loc[mask, columns] = np.nan return df @@ -1643,27 +1505,8 @@ def query_by_values( df = dataframe.head(0) else: - if is_instance(instance_vals, (dd, ps), "Series"): - df = dataframe.merge( - instance_vals.to_frame(), - how="inner", - on=column_name, - ) - elif isinstance(instance_vals, pd.Series) and is_instance( - dataframe, - ps, - "DataFrame", - ): - df = dataframe.merge( - ps.DataFrame({column_name: instance_vals}), - how="inner", - on=column_name, - ) - else: - df = dataframe[dataframe[column_name].isin(instance_vals)] - - if isinstance(dataframe, pd.DataFrame): - df = df.set_index(dataframe.ww.index, drop=False) + df = dataframe[dataframe[column_name].isin(instance_vals)] + df = df.set_index(dataframe.ww.index, drop=False) # ensure filtered df has same categories as original # workaround for issue below @@ -1671,13 +1514,7 @@ def query_by_values( # # Pandas claims that bug is fixed but it still shows up in some # cases. More investigation needed. 
- # - # Note: Woodwork stores categorical columns with a `string` dtype for Spark - if dataframe.ww.columns[column_name].is_categorical and not is_instance( - df, - ps, - "DataFrame", - ): + if dataframe.ww.columns[column_name].is_categorical: categories = pd.api.types.CategoricalDtype( categories=dataframe[column_name].cat.categories, ) @@ -1830,31 +1667,15 @@ def _add_references_to_metadata(self, dataframe): _ES_REF[self.id] = self def _normalize_values(self, dataframe): - def replace(x, is_spark=False): + def replace(x): if not isinstance(x, (list, tuple, np.ndarray)) and pd.isna(x): - if is_spark: - return [np.nan, np.nan] - else: - return (np.nan, np.nan) + return (np.nan, np.nan) else: return x for column, logical_type in dataframe.ww.logical_types.items(): if isinstance(logical_type, LatLong): - series = dataframe[column] - if ps and isinstance(series, ps.Series): - if len(series): - dataframe[column] = dataframe[column].apply( - replace, - args=(True,), - ) - elif is_instance(dataframe, dd, "DataFrame"): - dataframe[column] = dataframe[column].apply( - replace, - meta=(column, logical_type.primary_dtype), - ) - else: - dataframe[column] = dataframe[column].apply(replace) + dataframe[column] = dataframe[column].apply(replace) return dataframe @@ -1873,8 +1694,6 @@ def _vals_to_series(instance_vals, column_id): # convert iterable to pd.Series if isinstance(instance_vals, pd.DataFrame): out_vals = instance_vals[column_id] - elif is_instance(instance_vals, (pd, dd, ps), "Series"): - out_vals = instance_vals.rename(column_id) else: out_vals = pd.Series(instance_vals) @@ -1924,9 +1743,5 @@ def _get_or_create_index(index, make_index, df): def _create_index(df, index): - if is_instance(df, dd, "DataFrame") or is_instance(df, ps, "DataFrame"): - df[index] = 1 - df[index] = df[index].cumsum() - 1 - else: - df.insert(0, index, range(len(df))) + df.insert(0, index, range(len(df))) return df diff --git a/featuretools/entityset/serialize.py b/featuretools/entityset/serialize.py index df275049d7..2815e34cc6 100644 --- a/featuretools/entityset/serialize.py +++ b/featuretools/entityset/serialize.py @@ -6,13 +6,10 @@ from woodwork.serializers.serializer_base import typing_info_to_dict -from featuretools.utils.gen_utils import import_or_none from featuretools.utils.s3_utils import get_transport_params, use_smartopen_es from featuretools.utils.wrangle import _is_s3, _is_url from featuretools.version import ENTITYSET_SCHEMA_VERSION -ps = import_or_none("pyspark.pandas") - FORMATS = ["csv", "pickle", "parquet"] @@ -34,15 +31,12 @@ def entityset_to_description(entityset, format=None): relationship.to_dictionary() for relationship in entityset.relationships ] - data_type = entityset.dataframe_type - data_description = { "schema_version": ENTITYSET_SCHEMA_VERSION, "id": entityset.id, "dataframes": dataframes, "relationships": relationships, "format": format, - "data_type": data_type, } return data_description diff --git a/featuretools/feature_base/feature_base.py b/featuretools/feature_base/feature_base.py index 0ef232df7e..cd63d01d69 100644 --- a/featuretools/feature_base/feature_base.py +++ b/featuretools/feature_base/feature_base.py @@ -10,12 +10,8 @@ PrimitiveBase, TransformPrimitive, ) -from featuretools.utils.gen_utils import Library, import_or_none, is_instance from featuretools.utils.wrangle import _check_time_against_column, _check_timedelta -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - _ES_REF = {} @@ -52,12 +48,6 @@ def __init__( if not isinstance(primitive, 
PrimitiveBase): primitive = primitive() - # default library is PANDAS - if is_instance(dataframe, dd, "DataFrame"): - primitive.series_library = Library.DASK - elif is_instance(dataframe, ps, "DataFrame"): - primitive.series_library = Library.SPARK - self.primitive = primitive self.relationship_path = relationship_path diff --git a/featuretools/tests/integration_data/__init__.py b/featuretools/feature_discovery/__init__.py similarity index 100% rename from featuretools/tests/integration_data/__init__.py rename to featuretools/feature_discovery/__init__.py diff --git a/featuretools/primitives/base/primitive_base.py b/featuretools/primitives/base/primitive_base.py index 350aa3e01b..49ab0bc6ac 100644 --- a/featuretools/primitives/base/primitive_base.py +++ b/featuretools/primitives/base/primitive_base.py @@ -6,7 +6,6 @@ from featuretools import config from featuretools.utils.description_utils import convert_to_nth -from featuretools.utils.gen_utils import Library class PrimitiveBase(object): @@ -40,15 +39,12 @@ class PrimitiveBase(object): stack_on_self = True # (bool) If True will only make one feature per unique set of base features commutative = False - #: (list): Additional compatible libraries - compatibility = [Library.PANDAS] #: (str, list[str]): description template of the primitive. Input column # descriptions are passed as positional arguments to the template. Slice # number (if present) in "nth" form is passed to the template via the # `nth_slice` keyword argument. Multi-output primitives can use a list to # differentiate between the base description and a slice description. description_template = None - series_library = Library.PANDAS def __init__(self): pass diff --git a/featuretools/primitives/standard/aggregation/all_primitive.py b/featuretools/primitives/standard/aggregation/all_primitive.py index 6643e91b65..dfd4b22e4e 100644 --- a/featuretools/primitives/standard/aggregation/all_primitive.py +++ b/featuretools/primitives/standard/aggregation/all_primitive.py @@ -3,9 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") class All(AggregationPrimitive): @@ -28,18 +25,7 @@ class All(AggregationPrimitive): ] return_type = ColumnSchema(logical_type=Boolean) stack_on_self = False - compatibility = [Library.PANDAS, Library.DASK] description_template = "whether all of {} are true" - def get_function(self, agg_type=Library.PANDAS): - if agg_type == Library.DASK: - - def chunk(s): - return s.agg(np.all) - - def agg(s): - return s.agg(np.all) - - return dd.Aggregation(self.name, chunk=chunk, agg=agg) - + def get_function(self): return np.all diff --git a/featuretools/primitives/standard/aggregation/any_primitive.py b/featuretools/primitives/standard/aggregation/any_primitive.py index 3c4bb321be..878f0fdc17 100644 --- a/featuretools/primitives/standard/aggregation/any_primitive.py +++ b/featuretools/primitives/standard/aggregation/any_primitive.py @@ -3,9 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") class Any(AggregationPrimitive): @@ -28,18 +25,7 @@ class Any(AggregationPrimitive): ] return_type = ColumnSchema(logical_type=Boolean) stack_on_self = False - 
compatibility = [Library.PANDAS, Library.DASK] description_template = "whether any of {} are true" - def get_function(self, agg_type=Library.PANDAS): - if agg_type == Library.DASK: - - def chunk(s): - return s.agg(np.any) - - def agg(s): - return s.agg(np.any) - - return dd.Aggregation(self.name, chunk=chunk, agg=agg) - + def get_function(self): return np.any diff --git a/featuretools/primitives/standard/aggregation/avg_time_between.py b/featuretools/primitives/standard/aggregation/avg_time_between.py index 4520f463a0..9e09635511 100644 --- a/featuretools/primitives/standard/aggregation/avg_time_between.py +++ b/featuretools/primitives/standard/aggregation/avg_time_between.py @@ -7,7 +7,6 @@ from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive from featuretools.utils import convert_time_units -from featuretools.utils.gen_utils import Library class AvgTimeBetween(AggregationPrimitive): @@ -44,7 +43,7 @@ class AvgTimeBetween(AggregationPrimitive): def __init__(self, unit="seconds"): self.unit = unit.lower() - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def pd_avg_time_between(x): """Assumes time scales are closer to order of seconds than to nanoseconds diff --git a/featuretools/primitives/standard/aggregation/count.py b/featuretools/primitives/standard/aggregation/count.py index 782d950678..c3c283c291 100644 --- a/featuretools/primitives/standard/aggregation/count.py +++ b/featuretools/primitives/standard/aggregation/count.py @@ -3,7 +3,6 @@ from woodwork.logical_types import IntegerNullable from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Count(AggregationPrimitive): @@ -20,13 +19,9 @@ class Count(AggregationPrimitive): return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"}) stack_on_self = False default_value = 0 - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the number" - def get_function(self, agg_type=Library.PANDAS): - if agg_type in [Library.DASK, Library.SPARK]: - return "count" - + def get_function(self): return pd.Series.count def generate_name( diff --git a/featuretools/primitives/standard/aggregation/entropy.py b/featuretools/primitives/standard/aggregation/entropy.py index 70e7b090be..ad3b1efcb9 100644 --- a/featuretools/primitives/standard/aggregation/entropy.py +++ b/featuretools/primitives/standard/aggregation/entropy.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Entropy(AggregationPrimitive): @@ -36,7 +35,7 @@ def __init__(self, dropna=False, base=None): self.dropna = dropna self.base = base - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def pd_entropy(s): distribution = s.value_counts(normalize=True, dropna=self.dropna) if distribution.dtype == "Float64": diff --git a/featuretools/primitives/standard/aggregation/first.py b/featuretools/primitives/standard/aggregation/first.py index 7cd8fee07a..eb643f7745 100644 --- a/featuretools/primitives/standard/aggregation/first.py +++ b/featuretools/primitives/standard/aggregation/first.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class 
First(AggregationPrimitive): @@ -19,7 +18,7 @@ class First(AggregationPrimitive): stack_on_self = False description_template = "the first instance of {}" - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def pd_first(x): return x.iloc[0] diff --git a/featuretools/primitives/standard/aggregation/last.py b/featuretools/primitives/standard/aggregation/last.py index 45728cb3fe..e8b379e928 100644 --- a/featuretools/primitives/standard/aggregation/last.py +++ b/featuretools/primitives/standard/aggregation/last.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Last(AggregationPrimitive): @@ -19,7 +18,7 @@ class Last(AggregationPrimitive): stack_on_self = False description_template = "the last instance of {}" - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def pd_last(x): return x.iloc[-1] diff --git a/featuretools/primitives/standard/aggregation/max_primitive.py b/featuretools/primitives/standard/aggregation/max_primitive.py index 55153d52ae..9081bfac44 100644 --- a/featuretools/primitives/standard/aggregation/max_primitive.py +++ b/featuretools/primitives/standard/aggregation/max_primitive.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Max(AggregationPrimitive): @@ -18,11 +17,7 @@ class Max(AggregationPrimitive): input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) stack_on_self = False - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the maximum of {}" - def get_function(self, agg_type=Library.PANDAS): - if agg_type in [Library.DASK, Library.SPARK]: - return "max" - + def get_function(self): return np.max diff --git a/featuretools/primitives/standard/aggregation/mean.py b/featuretools/primitives/standard/aggregation/mean.py index 4b4b9a2f07..f4df3e83e1 100644 --- a/featuretools/primitives/standard/aggregation/mean.py +++ b/featuretools/primitives/standard/aggregation/mean.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Mean(AggregationPrimitive): @@ -27,16 +26,12 @@ class Mean(AggregationPrimitive): name = "mean" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the average of {}" def __init__(self, skipna=True): self.skipna = skipna - def get_function(self, agg_type=Library.PANDAS): - if agg_type in [Library.DASK, Library.SPARK]: - return "mean" - + def get_function(self): if self.skipna: # np.mean of series is functionally nanmean return np.mean diff --git a/featuretools/primitives/standard/aggregation/median.py b/featuretools/primitives/standard/aggregation/median.py index 1a62996d7b..9961587256 100644 --- a/featuretools/primitives/standard/aggregation/median.py +++ b/featuretools/primitives/standard/aggregation/median.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import 
Library class Median(AggregationPrimitive): @@ -24,5 +23,5 @@ class Median(AggregationPrimitive): return_type = ColumnSchema(semantic_tags={"numeric"}) description_template = "the median of {}" - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): return pd.Series.median diff --git a/featuretools/primitives/standard/aggregation/min_primitive.py b/featuretools/primitives/standard/aggregation/min_primitive.py index c35c122849..b68957e5f0 100644 --- a/featuretools/primitives/standard/aggregation/min_primitive.py +++ b/featuretools/primitives/standard/aggregation/min_primitive.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Min(AggregationPrimitive): @@ -18,11 +17,7 @@ class Min(AggregationPrimitive): input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) stack_on_self = False - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the minimum of {}" - def get_function(self, agg_type=Library.PANDAS): - if agg_type in [Library.DASK, Library.SPARK]: - return "min" - + def get_function(self): return np.min diff --git a/featuretools/primitives/standard/aggregation/mode.py b/featuretools/primitives/standard/aggregation/mode.py index ee6de30939..4cbf966a47 100644 --- a/featuretools/primitives/standard/aggregation/mode.py +++ b/featuretools/primitives/standard/aggregation/mode.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Mode(AggregationPrimitive): @@ -24,7 +23,7 @@ class Mode(AggregationPrimitive): return_type = None description_template = "the most frequently occurring value of {}" - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def pd_mode(s): return s.mode().get(0, np.nan) diff --git a/featuretools/primitives/standard/aggregation/n_most_common.py b/featuretools/primitives/standard/aggregation/n_most_common.py index 3481cb511b..fff30795bc 100644 --- a/featuretools/primitives/standard/aggregation/n_most_common.py +++ b/featuretools/primitives/standard/aggregation/n_most_common.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class NMostCommon(AggregationPrimitive): @@ -38,7 +37,7 @@ def __init__(self, n=3): *["the {nth_slice} most common value of {}"] * (n - 1), ] - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def n_most_common(x): # Counts of 0 remain in value_counts output if dtype is category # so we need to remove them diff --git a/featuretools/primitives/standard/aggregation/num_true.py b/featuretools/primitives/standard/aggregation/num_true.py index e0e7f5a147..6491e2666a 100644 --- a/featuretools/primitives/standard/aggregation/num_true.py +++ b/featuretools/primitives/standard/aggregation/num_true.py @@ -3,9 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable, IntegerNullable from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") class NumTrue(AggregationPrimitive): @@ -30,21 +27,7 @@ class 
NumTrue(AggregationPrimitive): default_value = 0 stack_on = [] stack_on_exclude = [] - compatibility = [Library.PANDAS, Library.DASK] description_template = "the number of times {} is true" - def get_function(self, agg_type=Library.PANDAS): - if agg_type == Library.DASK: - - def chunk(s): - chunk_sum = s.agg(np.sum) - if chunk_sum.dtype == "bool": - chunk_sum = chunk_sum.astype("int64") - return chunk_sum - - def agg(s): - return s.agg(np.sum) - - return dd.Aggregation(self.name, chunk=chunk, agg=agg) - + def get_function(self): return np.sum diff --git a/featuretools/primitives/standard/aggregation/num_unique.py b/featuretools/primitives/standard/aggregation/num_unique.py index f2864f1337..d01a6094c0 100644 --- a/featuretools/primitives/standard/aggregation/num_unique.py +++ b/featuretools/primitives/standard/aggregation/num_unique.py @@ -3,9 +3,6 @@ from woodwork.logical_types import IntegerNullable from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") class NumUnique(AggregationPrimitive): @@ -32,34 +29,12 @@ class NumUnique(AggregationPrimitive): input_types = [ColumnSchema(semantic_tags={"category"})] return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"}) stack_on_self = False - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the number of unique elements in {}" def __init__(self, use_string_for_pd_calc=True): self.use_string_for_pd_calc = use_string_for_pd_calc - def get_function(self, agg_type=Library.PANDAS): - if agg_type == Library.DASK: - - def chunk(s): - def inner_chunk(x): - x = x[:].dropna() - return set(x.unique()) - - return s.agg(inner_chunk) - - def agg(s): - def inner_agg(x): - x = x[:].dropna() - return set().union(*x.values) - - return s.agg(inner_agg) - - def finalize(s): - return s.apply(lambda x: len(x)) - - return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize) - - if self.use_string_for_pd_calc or agg_type == Library.SPARK: + def get_function(self): + if self.use_string_for_pd_calc: return "nunique" return pd.Series.nunique diff --git a/featuretools/primitives/standard/aggregation/percent_true.py b/featuretools/primitives/standard/aggregation/percent_true.py index a24acc3f28..806a3ed868 100644 --- a/featuretools/primitives/standard/aggregation/percent_true.py +++ b/featuretools/primitives/standard/aggregation/percent_true.py @@ -3,9 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable, Double from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") class PercentTrue(AggregationPrimitive): @@ -32,32 +29,9 @@ class PercentTrue(AggregationPrimitive): stack_on = [] stack_on_exclude = [] default_value = pd.NA - compatibility = [Library.PANDAS, Library.DASK] description_template = "the percentage of true values in {}" - def get_function(self, agg_type=Library.PANDAS): - if agg_type == Library.DASK: - - def chunk(s): - def format_chunk(x): - return x[:].fillna(False) - - chunk_sum = s.agg(lambda x: format_chunk(x).sum()) - chunk_len = s.agg(lambda x: len(format_chunk(x))) - if chunk_sum.dtype == "bool": - chunk_sum = chunk_sum.astype("int64") - if chunk_len.dtype == "bool": - chunk_len = chunk_len.astype("int64") - return (chunk_sum, chunk_len) - - def agg(val, length): - return (val.sum(), 
length.sum()) - - def finalize(total, length): - return total / length - - return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize) - + def get_function(self): def percent_true(s): return s.fillna(False).mean() diff --git a/featuretools/primitives/standard/aggregation/skew.py b/featuretools/primitives/standard/aggregation/skew.py index 294244e1a4..03f561ce0a 100644 --- a/featuretools/primitives/standard/aggregation/skew.py +++ b/featuretools/primitives/standard/aggregation/skew.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Skew(AggregationPrimitive): @@ -26,5 +25,5 @@ class Skew(AggregationPrimitive): stack_on_self = False description_template = "the skewness of {}" - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): return pd.Series.skew diff --git a/featuretools/primitives/standard/aggregation/std.py b/featuretools/primitives/standard/aggregation/std.py index 22bd4883d5..c064286424 100644 --- a/featuretools/primitives/standard/aggregation/std.py +++ b/featuretools/primitives/standard/aggregation/std.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive -from featuretools.utils.gen_utils import Library class Std(AggregationPrimitive): @@ -18,11 +17,7 @@ class Std(AggregationPrimitive): input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) stack_on_self = False - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the standard deviation of {}" - def get_function(self, agg_type=Library.PANDAS): - if agg_type in [Library.DASK, Library.SPARK]: - return "std" - + def get_function(self): return np.std diff --git a/featuretools/primitives/standard/aggregation/sum_primitive.py b/featuretools/primitives/standard/aggregation/sum_primitive.py index 45943f85f8..d0413453ea 100644 --- a/featuretools/primitives/standard/aggregation/sum_primitive.py +++ b/featuretools/primitives/standard/aggregation/sum_primitive.py @@ -3,7 +3,6 @@ from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive from featuretools.primitives.standard.aggregation.count import Count -from featuretools.utils.gen_utils import Library class Sum(AggregationPrimitive): @@ -21,11 +20,7 @@ class Sum(AggregationPrimitive): stack_on_self = False stack_on_exclude = [Count] default_value = 0 - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the sum of {}" - def get_function(self, agg_type=Library.PANDAS): - if agg_type in [Library.DASK, Library.SPARK]: - return "sum" - + def get_function(self): return np.sum diff --git a/featuretools/primitives/standard/aggregation/time_since_first.py b/featuretools/primitives/standard/aggregation/time_since_first.py index a946a08bc2..dc8eb989d1 100644 --- a/featuretools/primitives/standard/aggregation/time_since_first.py +++ b/featuretools/primitives/standard/aggregation/time_since_first.py @@ -3,7 +3,6 @@ from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive from featuretools.utils import convert_time_units -from featuretools.utils.gen_utils import Library class TimeSinceFirst(AggregationPrimitive): @@ -49,7 +48,7 @@ class TimeSinceFirst(AggregationPrimitive): def __init__(self, unit="seconds"): self.unit = 
unit.lower() - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def time_since_first(values, time=None): time_since = time - values.iloc[0] return convert_time_units(time_since.total_seconds(), self.unit) diff --git a/featuretools/primitives/standard/aggregation/time_since_last.py b/featuretools/primitives/standard/aggregation/time_since_last.py index 598a9a0eac..37674071cf 100644 --- a/featuretools/primitives/standard/aggregation/time_since_last.py +++ b/featuretools/primitives/standard/aggregation/time_since_last.py @@ -3,7 +3,6 @@ from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive from featuretools.utils import convert_time_units -from featuretools.utils.gen_utils import Library class TimeSinceLast(AggregationPrimitive): @@ -49,7 +48,7 @@ class TimeSinceLast(AggregationPrimitive): def __init__(self, unit="seconds"): self.unit = unit.lower() - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def time_since_last(values, time=None): time_since = time - values.iloc[-1] return convert_time_units(time_since.total_seconds(), self.unit) diff --git a/featuretools/primitives/standard/aggregation/trend.py b/featuretools/primitives/standard/aggregation/trend.py index 0f225d8a96..98e99e1265 100644 --- a/featuretools/primitives/standard/aggregation/trend.py +++ b/featuretools/primitives/standard/aggregation/trend.py @@ -4,7 +4,6 @@ from featuretools.primitives.base.aggregation_primitive_base import AggregationPrimitive from featuretools.utils import calculate_trend -from featuretools.utils.gen_utils import Library class Trend(AggregationPrimitive): @@ -35,7 +34,7 @@ class Trend(AggregationPrimitive): return_type = ColumnSchema(semantic_tags={"numeric"}) description_template = "the linear trend of {} over time" - def get_function(self, agg_type=Library.PANDAS): + def get_function(self): def pd_trend(y, x): return calculate_trend(pd.Series(data=y.values, index=x.values)) diff --git a/featuretools/primitives/standard/transform/binary/add_numeric.py b/featuretools/primitives/standard/transform/binary/add_numeric.py index 75a94bfd55..c94913abe0 100644 --- a/featuretools/primitives/standard/transform/binary/add_numeric.py +++ b/featuretools/primitives/standard/transform/binary/add_numeric.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class AddNumeric(TransformPrimitive): @@ -26,7 +25,7 @@ class AddNumeric(TransformPrimitive): ] return_type = ColumnSchema(semantic_tags={"numeric"}) commutative = True - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the sum of {} and {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/add_numeric_scalar.py b/featuretools/primitives/standard/transform/binary/add_numeric_scalar.py index a462aa0a41..107028b388 100644 --- a/featuretools/primitives/standard/transform/binary/add_numeric_scalar.py +++ b/featuretools/primitives/standard/transform/binary/add_numeric_scalar.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class AddNumericScalar(TransformPrimitive): @@ -20,7 +19,6 @@ class AddNumericScalar(TransformPrimitive): name = "add_numeric_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] 
return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=0): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/and_primitive.py b/featuretools/primitives/standard/transform/binary/and_primitive.py index 4ad16688c5..02be859bc2 100644 --- a/featuretools/primitives/standard/transform/binary/and_primitive.py +++ b/featuretools/primitives/standard/transform/binary/and_primitive.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class And(TransformPrimitive): @@ -38,7 +37,7 @@ class And(TransformPrimitive): ] return_type = ColumnSchema(logical_type=BooleanNullable) commutative = True - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} and {} are true" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/divide_by_feature.py b/featuretools/primitives/standard/transform/binary/divide_by_feature.py index 8d830e0928..1e8ad6cf9f 100644 --- a/featuretools/primitives/standard/transform/binary/divide_by_feature.py +++ b/featuretools/primitives/standard/transform/binary/divide_by_feature.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class DivideByFeature(TransformPrimitive): @@ -21,7 +20,6 @@ class DivideByFeature(TransformPrimitive): name = "divide_by_feature" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=1): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/divide_numeric.py b/featuretools/primitives/standard/transform/binary/divide_numeric.py index 17667ed2eb..6693db544d 100644 --- a/featuretools/primitives/standard/transform/binary/divide_numeric.py +++ b/featuretools/primitives/standard/transform/binary/divide_numeric.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class DivideNumeric(TransformPrimitive): @@ -29,7 +28,7 @@ class DivideNumeric(TransformPrimitive): ColumnSchema(semantic_tags={"numeric"}), ] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the result of {} divided by {}" def __init__(self, commutative=False): diff --git a/featuretools/primitives/standard/transform/binary/divide_numeric_scalar.py b/featuretools/primitives/standard/transform/binary/divide_numeric_scalar.py index 5b4f9e3bcd..6001714c7f 100644 --- a/featuretools/primitives/standard/transform/binary/divide_numeric_scalar.py +++ b/featuretools/primitives/standard/transform/binary/divide_numeric_scalar.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class DivideNumericScalar(TransformPrimitive): @@ -20,7 +19,6 @@ class DivideNumericScalar(TransformPrimitive): name = "divide_numeric_scalar" input_types = 
[ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=1): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/equal.py b/featuretools/primitives/standard/transform/binary/equal.py index d26fb4ae72..b7e7017cad 100644 --- a/featuretools/primitives/standard/transform/binary/equal.py +++ b/featuretools/primitives/standard/transform/binary/equal.py @@ -3,7 +3,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Equal(TransformPrimitive): @@ -24,7 +23,7 @@ class Equal(TransformPrimitive): input_types = [ColumnSchema(), ColumnSchema()] return_type = ColumnSchema(logical_type=BooleanNullable) commutative = True - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} equals {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/equal_scalar.py b/featuretools/primitives/standard/transform/binary/equal_scalar.py index cbf6f032fb..3e8467dc7e 100644 --- a/featuretools/primitives/standard/transform/binary/equal_scalar.py +++ b/featuretools/primitives/standard/transform/binary/equal_scalar.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class EqualScalar(TransformPrimitive): @@ -21,7 +20,6 @@ class EqualScalar(TransformPrimitive): name = "equal_scalar" input_types = [ColumnSchema()] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=None): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/greater_than.py b/featuretools/primitives/standard/transform/binary/greater_than.py index 6b0bdb6569..2390531a10 100644 --- a/featuretools/primitives/standard/transform/binary/greater_than.py +++ b/featuretools/primitives/standard/transform/binary/greater_than.py @@ -4,7 +4,6 @@ from woodwork.logical_types import BooleanNullable, Datetime, Ordinal from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class GreaterThan(TransformPrimitive): @@ -31,7 +30,6 @@ class GreaterThan(TransformPrimitive): [ColumnSchema(logical_type=Ordinal), ColumnSchema(logical_type=Ordinal)], ] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK] description_template = "whether {} is greater than {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/greater_than_equal_to.py b/featuretools/primitives/standard/transform/binary/greater_than_equal_to.py index 0fe79e43ba..266f0903b6 100644 --- a/featuretools/primitives/standard/transform/binary/greater_than_equal_to.py +++ b/featuretools/primitives/standard/transform/binary/greater_than_equal_to.py @@ -4,7 +4,6 @@ from woodwork.logical_types import BooleanNullable, Datetime, Ordinal from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class GreaterThanEqualTo(TransformPrimitive): @@ -31,7 +30,7 @@ class GreaterThanEqualTo(TransformPrimitive): [ColumnSchema(logical_type=Ordinal), 
ColumnSchema(logical_type=Ordinal)], ] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is greater than or equal to {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/greater_than_equal_to_scalar.py b/featuretools/primitives/standard/transform/binary/greater_than_equal_to_scalar.py index 7841b66f0f..6fb593d2be 100644 --- a/featuretools/primitives/standard/transform/binary/greater_than_equal_to_scalar.py +++ b/featuretools/primitives/standard/transform/binary/greater_than_equal_to_scalar.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class GreaterThanEqualToScalar(TransformPrimitive): @@ -22,7 +21,6 @@ class GreaterThanEqualToScalar(TransformPrimitive): name = "greater_than_equal_to_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=0): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/greater_than_scalar.py b/featuretools/primitives/standard/transform/binary/greater_than_scalar.py index 472e0620b8..cf63fd15f0 100644 --- a/featuretools/primitives/standard/transform/binary/greater_than_scalar.py +++ b/featuretools/primitives/standard/transform/binary/greater_than_scalar.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class GreaterThanScalar(TransformPrimitive): @@ -22,7 +21,6 @@ class GreaterThanScalar(TransformPrimitive): name = "greater_than_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=0): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/less_than.py b/featuretools/primitives/standard/transform/binary/less_than.py index ba627258aa..b4d70f2f03 100644 --- a/featuretools/primitives/standard/transform/binary/less_than.py +++ b/featuretools/primitives/standard/transform/binary/less_than.py @@ -4,7 +4,6 @@ from woodwork.logical_types import BooleanNullable, Datetime, Ordinal from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class LessThan(TransformPrimitive): @@ -31,7 +30,7 @@ class LessThan(TransformPrimitive): [ColumnSchema(logical_type=Ordinal), ColumnSchema(logical_type=Ordinal)], ] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is less than {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/less_than_equal_to.py b/featuretools/primitives/standard/transform/binary/less_than_equal_to.py index 4210a6a411..0d8f474c50 100644 --- a/featuretools/primitives/standard/transform/binary/less_than_equal_to.py +++ b/featuretools/primitives/standard/transform/binary/less_than_equal_to.py @@ -4,7 +4,6 @@ from woodwork.logical_types import BooleanNullable, Datetime, Ordinal from featuretools.primitives.base.transform_primitive_base import 
TransformPrimitive -from featuretools.utils.gen_utils import Library class LessThanEqualTo(TransformPrimitive): @@ -31,7 +30,7 @@ class LessThanEqualTo(TransformPrimitive): [ColumnSchema(logical_type=Ordinal), ColumnSchema(logical_type=Ordinal)], ] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is less than or equal to {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/less_than_equal_to_scalar.py b/featuretools/primitives/standard/transform/binary/less_than_equal_to_scalar.py index 38c33435bb..cfe154416d 100644 --- a/featuretools/primitives/standard/transform/binary/less_than_equal_to_scalar.py +++ b/featuretools/primitives/standard/transform/binary/less_than_equal_to_scalar.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class LessThanEqualToScalar(TransformPrimitive): @@ -22,7 +21,6 @@ class LessThanEqualToScalar(TransformPrimitive): name = "less_than_equal_to_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=0): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/less_than_scalar.py b/featuretools/primitives/standard/transform/binary/less_than_scalar.py index 580d495286..d39ecbddb4 100644 --- a/featuretools/primitives/standard/transform/binary/less_than_scalar.py +++ b/featuretools/primitives/standard/transform/binary/less_than_scalar.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class LessThanScalar(TransformPrimitive): @@ -22,7 +21,6 @@ class LessThanScalar(TransformPrimitive): name = "less_than_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=0): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/modulo_by_feature.py b/featuretools/primitives/standard/transform/binary/modulo_by_feature.py index dcdd76c911..789bcfdd47 100644 --- a/featuretools/primitives/standard/transform/binary/modulo_by_feature.py +++ b/featuretools/primitives/standard/transform/binary/modulo_by_feature.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class ModuloByFeature(TransformPrimitive): @@ -21,7 +20,6 @@ class ModuloByFeature(TransformPrimitive): name = "modulo_by_feature" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=1): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/modulo_numeric.py b/featuretools/primitives/standard/transform/binary/modulo_numeric.py index 8ca2b69263..4a14f21a3c 100644 --- a/featuretools/primitives/standard/transform/binary/modulo_numeric.py +++ b/featuretools/primitives/standard/transform/binary/modulo_numeric.py @@ 
-2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class ModuloNumeric(TransformPrimitive): @@ -25,7 +24,7 @@ class ModuloNumeric(TransformPrimitive): ColumnSchema(semantic_tags={"numeric"}), ] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the remainder after dividing {} by {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/modulo_numeric_scalar.py b/featuretools/primitives/standard/transform/binary/modulo_numeric_scalar.py index 047c515f3c..77db2a6a89 100644 --- a/featuretools/primitives/standard/transform/binary/modulo_numeric_scalar.py +++ b/featuretools/primitives/standard/transform/binary/modulo_numeric_scalar.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class ModuloNumericScalar(TransformPrimitive): @@ -21,7 +20,6 @@ class ModuloNumericScalar(TransformPrimitive): name = "modulo_numeric_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=1): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/multiply_boolean.py b/featuretools/primitives/standard/transform/binary/multiply_boolean.py index 0d297b5774..8ace5780f0 100644 --- a/featuretools/primitives/standard/transform/binary/multiply_boolean.py +++ b/featuretools/primitives/standard/transform/binary/multiply_boolean.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class MultiplyBoolean(TransformPrimitive): @@ -38,7 +37,6 @@ class MultiplyBoolean(TransformPrimitive): ] return_type = ColumnSchema(logical_type=BooleanNullable) commutative = True - compatibility = [Library.PANDAS, Library.DASK] description_template = "the product of {} and {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/multiply_numeric.py b/featuretools/primitives/standard/transform/binary/multiply_numeric.py index 176eed3bc4..9713aa8c0f 100644 --- a/featuretools/primitives/standard/transform/binary/multiply_numeric.py +++ b/featuretools/primitives/standard/transform/binary/multiply_numeric.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class MultiplyNumeric(TransformPrimitive): @@ -26,7 +25,7 @@ class MultiplyNumeric(TransformPrimitive): ] return_type = ColumnSchema(semantic_tags={"numeric"}) commutative = True - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the product of {} and {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/multiply_numeric_boolean.py b/featuretools/primitives/standard/transform/binary/multiply_numeric_boolean.py index cd65031118..be22a4b7bd 100644 --- a/featuretools/primitives/standard/transform/binary/multiply_numeric_boolean.py +++ 
b/featuretools/primitives/standard/transform/binary/multiply_numeric_boolean.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class MultiplyNumericBoolean(TransformPrimitive): @@ -45,7 +44,6 @@ class MultiplyNumericBoolean(TransformPrimitive): ], ] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK] commutative = True description_template = "the product of {} and {}" diff --git a/featuretools/primitives/standard/transform/binary/multiply_numeric_scalar.py b/featuretools/primitives/standard/transform/binary/multiply_numeric_scalar.py index 0500fc5877..c7b240a250 100644 --- a/featuretools/primitives/standard/transform/binary/multiply_numeric_scalar.py +++ b/featuretools/primitives/standard/transform/binary/multiply_numeric_scalar.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class MultiplyNumericScalar(TransformPrimitive): @@ -20,7 +19,6 @@ class MultiplyNumericScalar(TransformPrimitive): name = "multiply_numeric_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=1): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/not_equal.py b/featuretools/primitives/standard/transform/binary/not_equal.py index a889c8fce1..91e8161c6b 100644 --- a/featuretools/primitives/standard/transform/binary/not_equal.py +++ b/featuretools/primitives/standard/transform/binary/not_equal.py @@ -3,7 +3,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class NotEqual(TransformPrimitive): @@ -24,7 +23,6 @@ class NotEqual(TransformPrimitive): input_types = [ColumnSchema(), ColumnSchema()] return_type = ColumnSchema(logical_type=BooleanNullable) commutative = True - compatibility = [Library.PANDAS, Library.DASK] description_template = "whether {} does not equal {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/not_equal_scalar.py b/featuretools/primitives/standard/transform/binary/not_equal_scalar.py index a7e65d62bf..4af29cdac2 100644 --- a/featuretools/primitives/standard/transform/binary/not_equal_scalar.py +++ b/featuretools/primitives/standard/transform/binary/not_equal_scalar.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class NotEqualScalar(TransformPrimitive): @@ -21,7 +20,6 @@ class NotEqualScalar(TransformPrimitive): name = "not_equal_scalar" input_types = [ColumnSchema()] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=None): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/or_primitive.py b/featuretools/primitives/standard/transform/binary/or_primitive.py index e426a1544e..76f6e75ff0 100644 --- a/featuretools/primitives/standard/transform/binary/or_primitive.py +++ 
b/featuretools/primitives/standard/transform/binary/or_primitive.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Or(TransformPrimitive): @@ -38,7 +37,7 @@ class Or(TransformPrimitive): ] return_type = ColumnSchema(logical_type=BooleanNullable) commutative = True - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is true or {} is true" def get_function(self): diff --git a/featuretools/primitives/standard/transform/binary/scalar_subtract_numeric_feature.py b/featuretools/primitives/standard/transform/binary/scalar_subtract_numeric_feature.py index 13f93a2946..2df3323eef 100644 --- a/featuretools/primitives/standard/transform/binary/scalar_subtract_numeric_feature.py +++ b/featuretools/primitives/standard/transform/binary/scalar_subtract_numeric_feature.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class ScalarSubtractNumericFeature(TransformPrimitive): @@ -21,7 +20,6 @@ class ScalarSubtractNumericFeature(TransformPrimitive): name = "scalar_subtract_numeric_feature" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=0): self.value = value diff --git a/featuretools/primitives/standard/transform/binary/subtract_numeric.py b/featuretools/primitives/standard/transform/binary/subtract_numeric.py index 14135375c4..ef8007d67a 100644 --- a/featuretools/primitives/standard/transform/binary/subtract_numeric.py +++ b/featuretools/primitives/standard/transform/binary/subtract_numeric.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class SubtractNumeric(TransformPrimitive): @@ -34,7 +33,6 @@ class SubtractNumeric(TransformPrimitive): ColumnSchema(semantic_tags={"numeric"}), ] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK] description_template = "the result of {} minus {}" commutative = True diff --git a/featuretools/primitives/standard/transform/binary/subtract_numeric_scalar.py b/featuretools/primitives/standard/transform/binary/subtract_numeric_scalar.py index 01c9e52333..556e31339f 100644 --- a/featuretools/primitives/standard/transform/binary/subtract_numeric_scalar.py +++ b/featuretools/primitives/standard/transform/binary/subtract_numeric_scalar.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base.transform_primitive_base import TransformPrimitive -from featuretools.utils.gen_utils import Library class SubtractNumericScalar(TransformPrimitive): @@ -20,7 +19,6 @@ class SubtractNumericScalar(TransformPrimitive): name = "subtract_numeric_scalar" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, value=0): self.value = value diff --git a/featuretools/primitives/standard/transform/datetime/age.py b/featuretools/primitives/standard/transform/datetime/age.py index 3958c5f0f1..3a64397cc4 100644 --- 
a/featuretools/primitives/standard/transform/datetime/age.py +++ b/featuretools/primitives/standard/transform/datetime/age.py @@ -2,7 +2,6 @@ from woodwork.logical_types import AgeFractional, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Age(TransformPrimitive): @@ -30,7 +29,6 @@ class Age(TransformPrimitive): input_types = [ColumnSchema(logical_type=Datetime, semantic_tags={"date_of_birth"})] return_type = ColumnSchema(logical_type=AgeFractional, semantic_tags={"numeric"}) uses_calc_time = True - compatibility = [Library.PANDAS, Library.DASK] description_template = "the age from {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/day.py b/featuretools/primitives/standard/transform/datetime/day.py index ec4ed56ab5..a0c760b944 100644 --- a/featuretools/primitives/standard/transform/datetime/day.py +++ b/featuretools/primitives/standard/transform/datetime/day.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Day(TransformPrimitive): @@ -24,7 +23,7 @@ class Day(TransformPrimitive): logical_type=Ordinal(order=list(range(1, 32))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the day of the month of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/day_of_year.py b/featuretools/primitives/standard/transform/datetime/day_of_year.py index c34bc28a60..1b7a57fef2 100644 --- a/featuretools/primitives/standard/transform/datetime/day_of_year.py +++ b/featuretools/primitives/standard/transform/datetime/day_of_year.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class DayOfYear(TransformPrimitive): @@ -28,7 +27,7 @@ class DayOfYear(TransformPrimitive): logical_type=Ordinal(order=list(range(1, 367))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the day of year from {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/days_in_month.py b/featuretools/primitives/standard/transform/datetime/days_in_month.py index a5593761eb..9e4c015a9f 100644 --- a/featuretools/primitives/standard/transform/datetime/days_in_month.py +++ b/featuretools/primitives/standard/transform/datetime/days_in_month.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class DaysInMonth(TransformPrimitive): @@ -24,7 +23,7 @@ class DaysInMonth(TransformPrimitive): logical_type=Ordinal(order=list(range(1, 32))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the days in the month of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/hour.py b/featuretools/primitives/standard/transform/datetime/hour.py index 45036f8600..db17546c4a 100644 --- a/featuretools/primitives/standard/transform/datetime/hour.py +++ b/featuretools/primitives/standard/transform/datetime/hour.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import 
TransformPrimitive -from featuretools.utils.gen_utils import Library class Hour(TransformPrimitive): @@ -24,7 +23,7 @@ class Hour(TransformPrimitive): logical_type=Ordinal(order=list(range(24))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the hour value of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_leap_year.py b/featuretools/primitives/standard/transform/datetime/is_leap_year.py index b350fcefc9..31018d861c 100644 --- a/featuretools/primitives/standard/transform/datetime/is_leap_year.py +++ b/featuretools/primitives/standard/transform/datetime/is_leap_year.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsLeapYear(TransformPrimitive): @@ -21,7 +20,7 @@ class IsLeapYear(TransformPrimitive): name = "is_leap_year" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether the year of {} is a leap year" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_lunch_time.py b/featuretools/primitives/standard/transform/datetime/is_lunch_time.py index 6f3f151042..8b82c75820 100644 --- a/featuretools/primitives/standard/transform/datetime/is_lunch_time.py +++ b/featuretools/primitives/standard/transform/datetime/is_lunch_time.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsLunchTime(TransformPrimitive): @@ -29,7 +28,7 @@ class IsLunchTime(TransformPrimitive): name = "is_lunch_time" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} falls during lunch time" def __init__(self, lunch_hour=12): diff --git a/featuretools/primitives/standard/transform/datetime/is_month_end.py b/featuretools/primitives/standard/transform/datetime/is_month_end.py index d270f11257..9d796a0dd7 100644 --- a/featuretools/primitives/standard/transform/datetime/is_month_end.py +++ b/featuretools/primitives/standard/transform/datetime/is_month_end.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsMonthEnd(TransformPrimitive): @@ -21,7 +20,7 @@ class IsMonthEnd(TransformPrimitive): name = "is_month_end" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is at the end of a month" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_month_start.py b/featuretools/primitives/standard/transform/datetime/is_month_start.py index 29ace5dffe..840e963026 100644 --- a/featuretools/primitives/standard/transform/datetime/is_month_start.py +++ b/featuretools/primitives/standard/transform/datetime/is_month_start.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive 
-from featuretools.utils.gen_utils import Library class IsMonthStart(TransformPrimitive): @@ -21,7 +20,7 @@ class IsMonthStart(TransformPrimitive): name = "is_month_start" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is at the start of a month" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_quarter_end.py b/featuretools/primitives/standard/transform/datetime/is_quarter_end.py index 6820fd78e2..1692111502 100644 --- a/featuretools/primitives/standard/transform/datetime/is_quarter_end.py +++ b/featuretools/primitives/standard/transform/datetime/is_quarter_end.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsQuarterEnd(TransformPrimitive): @@ -20,7 +19,7 @@ class IsQuarterEnd(TransformPrimitive): name = "is_quarter_end" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is a quarter end" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_quarter_start.py b/featuretools/primitives/standard/transform/datetime/is_quarter_start.py index 0412e1c0bc..d029fad79e 100644 --- a/featuretools/primitives/standard/transform/datetime/is_quarter_start.py +++ b/featuretools/primitives/standard/transform/datetime/is_quarter_start.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsQuarterStart(TransformPrimitive): @@ -20,7 +19,7 @@ class IsQuarterStart(TransformPrimitive): name = "is_quarter_start" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} is a quarter start" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_weekend.py b/featuretools/primitives/standard/transform/datetime/is_weekend.py index dc9473b8fa..fd2cfdaaae 100644 --- a/featuretools/primitives/standard/transform/datetime/is_weekend.py +++ b/featuretools/primitives/standard/transform/datetime/is_weekend.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsWeekend(TransformPrimitive): @@ -21,7 +20,7 @@ class IsWeekend(TransformPrimitive): name = "is_weekend" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} occurred on a weekend" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_working_hours.py b/featuretools/primitives/standard/transform/datetime/is_working_hours.py index f96c45bdd7..038d4ce225 100644 --- a/featuretools/primitives/standard/transform/datetime/is_working_hours.py +++ b/featuretools/primitives/standard/transform/datetime/is_working_hours.py @@ -2,7 +2,6 @@ from woodwork.logical_types import 
BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsWorkingHours(TransformPrimitive): @@ -30,7 +29,7 @@ class IsWorkingHours(TransformPrimitive): name = "is_working_hours" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} falls during working hours" def __init__(self, start_hour=8, end_hour=18): diff --git a/featuretools/primitives/standard/transform/datetime/is_year_end.py b/featuretools/primitives/standard/transform/datetime/is_year_end.py index 786e2b72ad..02e2949687 100644 --- a/featuretools/primitives/standard/transform/datetime/is_year_end.py +++ b/featuretools/primitives/standard/transform/datetime/is_year_end.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsYearEnd(TransformPrimitive): @@ -23,7 +22,7 @@ class IsYearEnd(TransformPrimitive): name = "is_year_end" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} occurred on the end of a year" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/is_year_start.py b/featuretools/primitives/standard/transform/datetime/is_year_start.py index 537fbd4fd1..a070cd3900 100644 --- a/featuretools/primitives/standard/transform/datetime/is_year_start.py +++ b/featuretools/primitives/standard/transform/datetime/is_year_start.py @@ -2,7 +2,6 @@ from woodwork.logical_types import BooleanNullable, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsYearStart(TransformPrimitive): @@ -23,7 +22,7 @@ class IsYearStart(TransformPrimitive): name = "is_year_start" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "whether {} occurred on the start of a year" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/minute.py b/featuretools/primitives/standard/transform/datetime/minute.py index bc1cf3d017..5e56b6c658 100644 --- a/featuretools/primitives/standard/transform/datetime/minute.py +++ b/featuretools/primitives/standard/transform/datetime/minute.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Minute(TransformPrimitive): @@ -24,7 +23,7 @@ class Minute(TransformPrimitive): logical_type=Ordinal(order=list(range(60))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the minutes value of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/month.py b/featuretools/primitives/standard/transform/datetime/month.py index 6f6dd63b15..ee87e2b680 100644 --- a/featuretools/primitives/standard/transform/datetime/month.py +++ b/featuretools/primitives/standard/transform/datetime/month.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from 
featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Month(TransformPrimitive): @@ -24,7 +23,7 @@ class Month(TransformPrimitive): logical_type=Ordinal(order=list(range(1, 13))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the month of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/part_of_day.py b/featuretools/primitives/standard/transform/datetime/part_of_day.py index d78349ed20..765cf6da03 100644 --- a/featuretools/primitives/standard/transform/datetime/part_of_day.py +++ b/featuretools/primitives/standard/transform/datetime/part_of_day.py @@ -4,7 +4,6 @@ from woodwork.logical_types import Categorical, Datetime from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class PartOfDay(TransformPrimitive): @@ -35,7 +34,7 @@ class PartOfDay(TransformPrimitive): name = "part_of_day" input_types = [ColumnSchema(logical_type=Datetime)] return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the part of day {} falls in" @staticmethod diff --git a/featuretools/primitives/standard/transform/datetime/quarter.py b/featuretools/primitives/standard/transform/datetime/quarter.py index cab70f56a9..c88b029aca 100644 --- a/featuretools/primitives/standard/transform/datetime/quarter.py +++ b/featuretools/primitives/standard/transform/datetime/quarter.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Quarter(TransformPrimitive): @@ -24,7 +23,7 @@ class Quarter(TransformPrimitive): logical_type=Ordinal(order=list(range(1, 5))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the quarter that describes {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/second.py b/featuretools/primitives/standard/transform/datetime/second.py index 4c7dba77ce..7ee231547a 100644 --- a/featuretools/primitives/standard/transform/datetime/second.py +++ b/featuretools/primitives/standard/transform/datetime/second.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Second(TransformPrimitive): @@ -24,7 +23,7 @@ class Second(TransformPrimitive): logical_type=Ordinal(order=list(range(60))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the seconds value of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/time_since.py b/featuretools/primitives/standard/transform/datetime/time_since.py index 59e20ee7bf..38bd0fbf57 100644 --- a/featuretools/primitives/standard/transform/datetime/time_since.py +++ b/featuretools/primitives/standard/transform/datetime/time_since.py @@ -3,7 +3,6 @@ from featuretools.primitives.base import TransformPrimitive from featuretools.utils import convert_time_units -from featuretools.utils.gen_utils import Library class TimeSince(TransformPrimitive): @@ -42,7 +41,6 @@ class TimeSince(TransformPrimitive): input_types = [ColumnSchema(logical_type=Datetime)] return_type = 
ColumnSchema(semantic_tags={"numeric"}) uses_calc_time = True - compatibility = [Library.PANDAS, Library.DASK] description_template = "the time from {} to the cutoff time" def __init__(self, unit="seconds"): diff --git a/featuretools/primitives/standard/transform/datetime/week.py b/featuretools/primitives/standard/transform/datetime/week.py index 8b05ff5b6b..ef8d376c96 100644 --- a/featuretools/primitives/standard/transform/datetime/week.py +++ b/featuretools/primitives/standard/transform/datetime/week.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Week(TransformPrimitive): @@ -29,7 +28,7 @@ class Week(TransformPrimitive): logical_type=Ordinal(order=list(range(1, 54))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the week of the year of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/weekday.py b/featuretools/primitives/standard/transform/datetime/weekday.py index 1dd64d8417..9b99409805 100644 --- a/featuretools/primitives/standard/transform/datetime/weekday.py +++ b/featuretools/primitives/standard/transform/datetime/weekday.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Weekday(TransformPrimitive): @@ -28,7 +27,7 @@ class Weekday(TransformPrimitive): logical_type=Ordinal(order=list(range(7))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the day of the week of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/datetime/year.py b/featuretools/primitives/standard/transform/datetime/year.py index 77a29a0021..d0e189708c 100644 --- a/featuretools/primitives/standard/transform/datetime/year.py +++ b/featuretools/primitives/standard/transform/datetime/year.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Datetime, Ordinal from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Year(TransformPrimitive): @@ -24,7 +23,7 @@ class Year(TransformPrimitive): logical_type=Ordinal(order=list(range(1, 3000))), semantic_tags={"category"}, ) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the year of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/is_in.py b/featuretools/primitives/standard/transform/is_in.py index 0b44f0ccbd..6ef7c888ab 100644 --- a/featuretools/primitives/standard/transform/is_in.py +++ b/featuretools/primitives/standard/transform/is_in.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Boolean from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsIn(TransformPrimitive): @@ -18,7 +17,6 @@ class IsIn(TransformPrimitive): name = "isin" input_types = [ColumnSchema()] return_type = ColumnSchema(logical_type=Boolean) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def __init__(self, list_of_outputs=None): self.list_of_outputs = list_of_outputs diff --git a/featuretools/primitives/standard/transform/is_null.py b/featuretools/primitives/standard/transform/is_null.py index 26c0e3adbb..08f4fa9055 100644 --- a/featuretools/primitives/standard/transform/is_null.py +++ 
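Since pandas is now the only supported backend, a primitive's `get_function()` result can be exercised directly against a pandas Series with no per-library code path to worry about. A minimal usage sketch, assuming the standard primitives shown above (IsWeekend, Weekday) keep their current behavior:

    import pandas as pd

    from featuretools.primitives import IsWeekend, Weekday

    dates = pd.Series(pd.to_datetime(["2024-03-02", "2024-03-04"]))

    # get_function() returns a plain callable that operates on a pandas Series.
    is_weekend = IsWeekend().get_function()
    weekday = Weekday().get_function()

    print(is_weekend(dates).tolist())  # [True, False]
    print(weekday(dates).tolist())     # [5, 0] (Monday is 0 in pandas)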
b/featuretools/primitives/standard/transform/is_null.py @@ -2,7 +2,6 @@ from woodwork.logical_types import Boolean from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class IsNull(TransformPrimitive): @@ -17,7 +16,6 @@ class IsNull(TransformPrimitive): name = "is_null" input_types = [ColumnSchema()] return_type = ColumnSchema(logical_type=Boolean) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "whether {} is null" def get_function(self): diff --git a/featuretools/primitives/standard/transform/natural_language/num_characters.py b/featuretools/primitives/standard/transform/natural_language/num_characters.py index d9cd55367e..9b99fde887 100644 --- a/featuretools/primitives/standard/transform/natural_language/num_characters.py +++ b/featuretools/primitives/standard/transform/natural_language/num_characters.py @@ -3,7 +3,6 @@ from woodwork.logical_types import IntegerNullable, NaturalLanguage from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class NumCharacters(TransformPrimitive): @@ -23,7 +22,7 @@ class NumCharacters(TransformPrimitive): name = "num_characters" input_types = [ColumnSchema(logical_type=NaturalLanguage)] return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the number of characters in {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/natural_language/num_words.py b/featuretools/primitives/standard/transform/natural_language/num_words.py index 686d069f38..09914430bd 100644 --- a/featuretools/primitives/standard/transform/natural_language/num_words.py +++ b/featuretools/primitives/standard/transform/natural_language/num_words.py @@ -10,7 +10,6 @@ from featuretools.primitives.standard.transform.natural_language.constants import ( DELIMITERS, ) -from featuretools.utils.gen_utils import Library class NumWords(TransformPrimitive): @@ -29,7 +28,7 @@ class NumWords(TransformPrimitive): name = "num_words" input_types = [ColumnSchema(logical_type=NaturalLanguage)] return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the number of words in {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/not_primitive.py b/featuretools/primitives/standard/transform/not_primitive.py index 60abbabcd9..ee3c39dace 100644 --- a/featuretools/primitives/standard/transform/not_primitive.py +++ b/featuretools/primitives/standard/transform/not_primitive.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Boolean, BooleanNullable from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Not(TransformPrimitive): @@ -21,7 +20,6 @@ class Not(TransformPrimitive): [ColumnSchema(logical_type=BooleanNullable)], ] return_type = ColumnSchema(logical_type=BooleanNullable) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the negation of {}" def generate_name(self, base_feature_names): diff --git a/featuretools/primitives/standard/transform/numeric/absolute.py b/featuretools/primitives/standard/transform/numeric/absolute.py index 2d525f486b..4c62b7a208 100644 --- a/featuretools/primitives/standard/transform/numeric/absolute.py +++ 
b/featuretools/primitives/standard/transform/numeric/absolute.py @@ -2,7 +2,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Absolute(TransformPrimitive): @@ -17,7 +16,7 @@ class Absolute(TransformPrimitive): name = "absolute" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the absolute value of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/numeric/cosine.py b/featuretools/primitives/standard/transform/numeric/cosine.py index c0ba27bc63..94f56374ff 100644 --- a/featuretools/primitives/standard/transform/numeric/cosine.py +++ b/featuretools/primitives/standard/transform/numeric/cosine.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Double from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Cosine(TransformPrimitive): @@ -18,7 +17,7 @@ class Cosine(TransformPrimitive): name = "cosine" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the cosine of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/numeric/natural_logarithm.py b/featuretools/primitives/standard/transform/numeric/natural_logarithm.py index 88c9656086..a14e4d6d73 100644 --- a/featuretools/primitives/standard/transform/numeric/natural_logarithm.py +++ b/featuretools/primitives/standard/transform/numeric/natural_logarithm.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Double from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class NaturalLogarithm(TransformPrimitive): @@ -20,7 +19,7 @@ class NaturalLogarithm(TransformPrimitive): name = "natural_logarithm" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the natural logarithm of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/numeric/negate.py b/featuretools/primitives/standard/transform/numeric/negate.py index c4b71a130e..e7cab058c0 100644 --- a/featuretools/primitives/standard/transform/numeric/negate.py +++ b/featuretools/primitives/standard/transform/numeric/negate.py @@ -1,7 +1,6 @@ from woodwork.column_schema import ColumnSchema from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Negate(TransformPrimitive): @@ -16,7 +15,6 @@ class Negate(TransformPrimitive): name = "negate" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the negation of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/numeric/sine.py b/featuretools/primitives/standard/transform/numeric/sine.py index c8b8df6e7a..13bd70139b 100644 --- a/featuretools/primitives/standard/transform/numeric/sine.py +++ b/featuretools/primitives/standard/transform/numeric/sine.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Double from 
featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Sine(TransformPrimitive): @@ -18,7 +17,7 @@ class Sine(TransformPrimitive): name = "sine" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the sine of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/numeric/square_root.py b/featuretools/primitives/standard/transform/numeric/square_root.py index 055c56a772..ca2c836da7 100644 --- a/featuretools/primitives/standard/transform/numeric/square_root.py +++ b/featuretools/primitives/standard/transform/numeric/square_root.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Double from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class SquareRoot(TransformPrimitive): @@ -18,7 +17,7 @@ class SquareRoot(TransformPrimitive): name = "square_root" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the square root of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/numeric/tangent.py b/featuretools/primitives/standard/transform/numeric/tangent.py index 1b37286667..f60448ab70 100644 --- a/featuretools/primitives/standard/transform/numeric/tangent.py +++ b/featuretools/primitives/standard/transform/numeric/tangent.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Double from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class Tangent(TransformPrimitive): @@ -18,7 +17,7 @@ class Tangent(TransformPrimitive): name = "tangent" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + description_template = "the tangent of {}" def get_function(self): diff --git a/featuretools/primitives/standard/transform/postal/one_digit_postal_code.py b/featuretools/primitives/standard/transform/postal/one_digit_postal_code.py index 97a44e411f..7cd74e3806 100644 --- a/featuretools/primitives/standard/transform/postal/one_digit_postal_code.py +++ b/featuretools/primitives/standard/transform/postal/one_digit_postal_code.py @@ -3,7 +3,6 @@ from woodwork.logical_types import Categorical, PostalCode from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class OneDigitPostalCode(TransformPrimitive): @@ -20,7 +19,6 @@ class OneDigitPostalCode(TransformPrimitive): name = "one_digit_postal_code" input_types = [ColumnSchema(logical_type=PostalCode)] - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"}) description_template = "The one digit postal code prefix of {}" diff --git a/featuretools/primitives/standard/transform/postal/two_digit_postal_code.py b/featuretools/primitives/standard/transform/postal/two_digit_postal_code.py index 619cfc146d..68f1176725 100644 --- a/featuretools/primitives/standard/transform/postal/two_digit_postal_code.py +++ b/featuretools/primitives/standard/transform/postal/two_digit_postal_code.py @@ -3,7 +3,6 @@ from woodwork.logical_types import 
Categorical, PostalCode from featuretools.primitives.base import TransformPrimitive -from featuretools.utils.gen_utils import Library class TwoDigitPostalCode(TransformPrimitive): @@ -20,7 +19,7 @@ class TwoDigitPostalCode(TransformPrimitive): name = "two_digit_postal_code" input_types = [ColumnSchema(logical_type=PostalCode)] - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] + return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"}) description_template = "The two digit postal code prefix of {}" diff --git a/featuretools/primitives/utils.py b/featuretools/primitives/utils.py index 2949ab3a86..bef52b66bd 100644 --- a/featuretools/primitives/utils.py +++ b/featuretools/primitives/utils.py @@ -15,7 +15,7 @@ PrimitiveBase, TransformPrimitive, ) -from featuretools.utils.gen_utils import Library, find_descendents +from featuretools.utils.gen_utils import find_descendents def _get_primitives(primitive_kind): @@ -32,16 +32,12 @@ def _get_primitives(primitive_kind): def get_aggregation_primitives(): - """Returns all aggregation primitives, regardless - of compatibility - """ + """Returns all aggregation primitives""" return _get_primitives(featuretools.primitives.AggregationPrimitive) def get_transform_primitives(): - """Returns all transform primitives, regardless - of compatibility - """ + """Returns all transform primitives""" return _get_primitives(featuretools.primitives.TransformPrimitive) @@ -57,9 +53,7 @@ def get_all_primitives(): def _get_natural_language_primitives(): - """Returns all Natural Language transform primitives, - regardless of compatibility - """ + """Returns all Natural Language transform primitives""" transform_primitives = get_transform_primitives() def _natural_language_in_input_type(primitive): @@ -87,18 +81,10 @@ def list_primitives(): trans_names, trans_primitives, valid_inputs, return_type = _get_names_primitives( get_transform_primitives, ) - trans_dask = [ - Library.DASK in primitive.compatibility for primitive in trans_primitives - ] - trans_spark = [ - Library.SPARK in primitive.compatibility for primitive in trans_primitives - ] transform_df = pd.DataFrame( { "name": trans_names, "description": _get_descriptions(trans_primitives), - "dask_compatible": trans_dask, - "spark_compatible": trans_spark, "valid_inputs": valid_inputs, "return_type": return_type, }, @@ -108,16 +94,10 @@ def list_primitives(): agg_names, agg_primitives, valid_inputs, return_type = _get_names_primitives( get_aggregation_primitives, ) - agg_dask = [Library.DASK in primitive.compatibility for primitive in agg_primitives] - agg_spark = [ - Library.SPARK in primitive.compatibility for primitive in agg_primitives - ] agg_df = pd.DataFrame( { "name": agg_names, "description": _get_descriptions(agg_primitives), - "dask_compatible": agg_dask, - "spark_compatible": agg_spark, "valid_inputs": valid_inputs, "return_type": return_type, }, @@ -127,8 +107,6 @@ def list_primitives(): columns = [ "name", "type", - "dask_compatible", - "spark_compatible", "description", "valid_inputs", "return_type", diff --git a/featuretools/synthesis/deep_feature_synthesis.py b/featuretools/synthesis/deep_feature_synthesis.py index 36c1944dde..9f5e30637f 100644 --- a/featuretools/synthesis/deep_feature_synthesis.py +++ b/featuretools/synthesis/deep_feature_synthesis.py @@ -32,7 +32,7 @@ generate_all_primitive_options, ignore_dataframe_for_primitive, ) -from featuretools.utils.gen_utils import Library, camel_and_title_to_snake +from featuretools.utils.gen_utils import 
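With the compatibility bookkeeping gone, `list_primitives()` drops its `dask_compatible` and `spark_compatible` columns; the remaining columns match the `columns` list in the hunk above. A quick way to confirm the new output shape (exact column order and the "aggregation"/"transform" type values are assumptions):

    import featuretools as ft

    prims = ft.list_primitives()

    # The dask_compatible / spark_compatible columns are gone.
    print(list(prims.columns))
    # ['name', 'type', 'description', 'valid_inputs', 'return_type']

    # Filtering by primitive kind still works the same way.
    print(prims[prims["type"] == "aggregation"].head())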
camel_and_title_to_snake logger = logging.getLogger("featuretools") @@ -196,19 +196,10 @@ def __init__( self.target_dataframe_name = target_dataframe_name self.es = entityset - for library in Library: - if library.value == self.es.dataframe_type: - df_library = library - break - aggregation_primitive_dict = primitives.get_aggregation_primitives() transform_primitive_dict = primitives.get_transform_primitives() if agg_primitives is None: - agg_primitives = [ - p - for p in primitives.get_default_aggregation_primitives() - if df_library in p.compatibility - ] + agg_primitives = primitives.get_default_aggregation_primitives() self.agg_primitives = sorted( [ check_primitive( @@ -222,11 +213,8 @@ def __init__( ) if trans_primitives is None: - trans_primitives = [ - p - for p in primitives.get_default_transform_primitives() - if df_library in p.compatibility - ] + trans_primitives = primitives.get_default_transform_primitives() + self.trans_primitives = sorted( [ check_primitive( @@ -275,12 +263,6 @@ def __init__( + self.where_primitives + self.groupby_trans_primitives ) - bad_primitives = [ - prim.name for prim in all_primitives if df_library not in prim.compatibility - ] - if bad_primitives: - msg = "Selected primitives are incompatible with {} EntitySets: {}" - raise ValueError(msg.format(df_library.value, ", ".join(bad_primitives))) ( self.primitive_options, @@ -1335,8 +1317,7 @@ def _direct_of_dataframe(feature, parent_dataframe): def get_feature_depth(feature, stop_at=None): """Helper method to allow caching of feature.get_depth() - Why here and not in FeatureBase? Putting t in FeatureBase was causing - some weird pickle errors in spark tests in 3.9 and this keeps the caching + Why here and not in FeatureBase? This keeps the caching local to DFS. """ hash_key = hash(f"{feature.get_name()}{feature.dataframe_name}{stop_at}") diff --git a/featuretools/synthesis/encode_features.py b/featuretools/synthesis/encode_features.py index 1b90ce0bc2..7fe246c889 100644 --- a/featuretools/synthesis/encode_features.py +++ b/featuretools/synthesis/encode_features.py @@ -82,10 +82,6 @@ def encode_features( drop_first=True) f_encoded """ - if not isinstance(feature_matrix, pd.DataFrame): - msg = "feature_matrix must be a Pandas DataFrame" - raise TypeError(msg) - if inplace: X = feature_matrix else: diff --git a/featuretools/synthesis/get_valid_primitives.py b/featuretools/synthesis/get_valid_primitives.py index 5aaa2bbd4c..03bd5da301 100644 --- a/featuretools/synthesis/get_valid_primitives.py +++ b/featuretools/synthesis/get_valid_primitives.py @@ -5,7 +5,6 @@ ) from featuretools.synthesis.deep_feature_synthesis import DeepFeatureSynthesis from featuretools.synthesis.utils import _categorize_features, get_unused_primitives -from featuretools.utils.gen_utils import Library def get_valid_primitives( @@ -47,11 +46,6 @@ def get_valid_primitives( available_aggs = get_aggregation_primitives() available_trans = get_transform_primitives() - for library in Library: - if library.value == entityset.dataframe_type: - df_library = library - break - if selected_primitives: for prim in selected_primitives: if not isinstance(prim, str): @@ -72,17 +66,10 @@ def get_valid_primitives( prim_list = trans_primitives else: raise ValueError(f"'{prim}' is not a recognized primitive name") - if df_library in prim.compatibility: - prim_list.append(prim) + prim_list.append(prim) else: - agg_primitives = [ - agg for agg in available_aggs.values() if df_library in agg.compatibility - ] - trans_primitives = [ - trans - for trans in 
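The DeepFeatureSynthesis constructor no longer resolves a df_library from the EntitySet, no longer filters the default primitive lists by compatibility, and drops the "Selected primitives are incompatible" ValueError. From the caller's side nothing changes for pandas EntitySets; a minimal sketch of the usual entry point, using the bundled mock-customer demo data as an assumed example dataset:

    import featuretools as ft
    from featuretools.primitives import Count, IsWeekend

    es = ft.demo.load_mock_customer(return_entityset=True)

    # Defaults are now simply get_default_aggregation_primitives() /
    # get_default_transform_primitives(); explicit lists still work as before.
    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_dataframe_name="customers",
        agg_primitives=[Count],
        trans_primitives=[IsWeekend],
        max_depth=2,
    )
    print(feature_matrix.head())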
available_trans.values() - if df_library in trans.compatibility - ] + agg_primitives = [agg for agg in available_aggs.values()] + trans_primitives = [trans for trans in available_trans.values()] dfs_object = DeepFeatureSynthesis( target_dataframe_name, diff --git a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py index 991d2a80d4..9ddc54f899 100644 --- a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py +++ b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py @@ -60,11 +60,7 @@ from featuretools.tests.testing_utils import ( backward_path, get_mock_client_cluster, - to_pandas, ) -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") def test_scatter_warning(caplog): @@ -77,10 +73,7 @@ def test_scatter_warning(caplog): assert warning_message in caplog.text -# TODO: final assert fails w/ Dask def test_calc_feature_matrix(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Distributed dataframe result not ordered") times = list( [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] @@ -167,32 +160,6 @@ def test_calc_feature_matrix(es): ) assert all(feature_matrix.index == cutoff_reordered["id"].values) - # fails with Dask and Spark entitysets, cutoff time not reordered; cannot verify out of order - # - can't tell if wrong/different all are false so can't check positional - - -def test_cfm_warns_dask_cutoff_time(es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - times = list( - [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] - + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] - + [datetime(2011, 4, 9, 10, 40, 0)] - + [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] - + [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] - + [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)], - ) - instances = range(17) - cutoff_time = pd.DataFrame({"time": times, es["log"].ww.index: instances}) - cutoff_time = dd.from_pandas(cutoff_time, npartitions=4) - - property_feature = Feature(es["log"].ww["value"]) > 10 - - match = ( - "cutoff_time should be a Pandas DataFrame: " - "computing cutoff_time, this may take a while" - ) - with pytest.warns(UserWarning, match=match): - calculate_feature_matrix([property_feature], es, cutoff_time=cutoff_time) def test_cfm_compose(es, lt): @@ -204,7 +171,6 @@ def test_cfm_compose(es, lt): cutoff_time=lt, verbose=True, ) - feature_matrix = to_pandas(feature_matrix, index="id", sort_index=True) assert ( feature_matrix[property_feature.get_name()] == feature_matrix["label_func"] @@ -212,9 +178,6 @@ def test_cfm_compose(es, lt): def test_cfm_compose_approximate(es, lt): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("dask does not support approximate") - property_feature = Feature(es["log"].ww["value"]) > 10 feature_matrix = calculate_feature_matrix( @@ -225,30 +188,12 @@ def test_cfm_compose_approximate(es, lt): verbose=True, ) assert type(feature_matrix) == pd.core.frame.DataFrame - feature_matrix = to_pandas(feature_matrix, index="id", sort_index=True) - - assert ( - feature_matrix[property_feature.get_name()] == feature_matrix["label_func"] - ).values.all() - - -def test_cfm_dask_compose(dask_es, lt): - property_feature = Feature(dask_es["log"].ww["value"]) > 10 - - feature_matrix = calculate_feature_matrix( - [property_feature], - 
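Both helpers changed above keep their public signatures; only the compatibility filtering (get_valid_primitives) and the "must be a pandas DataFrame" guard (encode_features) are removed, since the input is always pandas now. A hedged usage sketch, reusing the demo EntitySet from the previous snippet:

    import featuretools as ft

    es = ft.demo.load_mock_customer(return_entityset=True)

    # Every aggregation/transform primitive whose input types can be satisfied
    # by the target dataframe is returned; no more library filtering.
    agg_prims, trans_prims = ft.get_valid_primitives(
        entityset=es,
        target_dataframe_name="sessions",
    )
    print(len(agg_prims), len(trans_prims))

    # encode_features no longer type-checks its input; it assumes pandas.
    fm, features = ft.dfs(entityset=es, target_dataframe_name="sessions")
    fm_encoded, features_encoded = ft.encode_features(fm, features)
    print(fm_encoded.shape)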
dask_es, - cutoff_time=lt, - verbose=True, - ) - feature_matrix = feature_matrix.compute() assert ( feature_matrix[property_feature.get_name()] == feature_matrix["label_func"] ).values.all() -# tests approximate, skip for dask/spark def test_cfm_approximate_correct_ordering(): trips = { "trip_id": [i for i in range(1000)], @@ -312,10 +257,9 @@ def test_cfm_approximate_correct_ordering(): assert (pd.isnull(x) and pd.isnull(y)) or (x == y) -# uses approximate, skip for dask/spark entitysets -def test_cfm_no_cutoff_time_index(pd_es): +def test_cfm_no_cutoff_time_index(es): agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count, ) @@ -329,7 +273,7 @@ def test_cfm_no_cutoff_time_index(pd_es): ) feature_matrix = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, cutoff_time_in_index=False, approximate=Timedelta(12, "s"), cutoff_time=cutoff_time, @@ -347,7 +291,7 @@ def test_cfm_no_cutoff_time_index(pd_es): ) feature_matrix_2 = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, cutoff_time_in_index=False, approximate=Timedelta(10, "s"), cutoff_time=cutoff_time, @@ -358,11 +302,7 @@ def test_cfm_no_cutoff_time_index(pd_es): assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1] -# TODO: fails with dask entitysets -# TODO: fails with spark entitysets def test_cfm_duplicated_index_in_cutoff_time(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Distributed results not ordered, missing duplicates") times = [ datetime(2011, 4, 1), datetime(2011, 5, 1), @@ -383,10 +323,7 @@ def test_cfm_duplicated_index_in_cutoff_time(es): assert feature_matrix.shape[0] == cutoff_time.shape[0] -# TODO: fails with Dask, Spark def test_saveprogress(es, tmp_path): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("saveprogress fails with distributed entitysets") times = list( [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] @@ -438,7 +375,6 @@ def test_cutoff_time_correctly(es): es, cutoff_time=cutoff_time, ) - feature_matrix = to_pandas(feature_matrix, index="id", sort_index=True) labels = [10, 5, 0] assert (feature_matrix[property_feature.get_name()] == labels).values.all() @@ -478,18 +414,6 @@ def test_cutoff_time_binning(): binned_cutoff_times = bin_cutoff_times(cutoff_time, Timedelta(1, "mo")) -def test_training_window_fails_dask(dask_es): - property_feature = Feature( - dask_es["log"].ww["id"], - parent_dataframe_name="customers", - primitive=Count, - ) - - error_text = "Using training_window is not supported with Dask dataframes" - with pytest.raises(ValueError, match=error_text): - calculate_feature_matrix([property_feature], dask_es, training_window="2 hours") - - def test_cutoff_time_columns_order(es): property_feature = Feature( es["log"].ww["id"], @@ -516,7 +440,6 @@ def test_cutoff_time_columns_order(es): ) labels = [10, 5, 0] - feature_matrix = to_pandas(feature_matrix, index="id", sort_index=True) assert (feature_matrix[property_feature.get_name()] == labels).values.all() @@ -559,14 +482,14 @@ def test_cutoff_time_df_redundant_column_names(es): calculate_feature_matrix([property_feature], es, cutoff_time=cutoff_time) -def test_training_window(pd_es): +def test_training_window(es): property_feature = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="customers", primitive=Count, ) top_level_agg = Feature( - pd_es["customers"].ww["id"], + es["customers"].ww["id"], parent_dataframe_name="régions", 
primitive=Count, ) @@ -589,18 +512,18 @@ def test_training_window(pd_es): with pytest.warns(UserWarning, match=warn_text): feature_matrix = calculate_feature_matrix( [property_feature, dagg], - pd_es, + es, cutoff_time=cutoff_time, training_window="2 hours", ) - pd_es.add_last_time_indexes() + es.add_last_time_indexes() error_text = "Training window cannot be in observations" with pytest.raises(AssertionError, match=error_text): feature_matrix = calculate_feature_matrix( [property_feature], - pd_es, + es, cutoff_time=cutoff_time, training_window=Timedelta(2, "observations"), ) @@ -608,7 +531,7 @@ def test_training_window(pd_es): # Case1. include_cutoff_time = True feature_matrix = calculate_feature_matrix( [property_feature, dagg], - pd_es, + es, cutoff_time=cutoff_time, training_window="2 hours", include_cutoff_time=True, @@ -621,7 +544,7 @@ def test_training_window(pd_es): # Case2. include_cutoff_time = False feature_matrix = calculate_feature_matrix( [property_feature, dagg], - pd_es, + es, cutoff_time=cutoff_time, training_window="2 hours", include_cutoff_time=False, @@ -635,7 +558,7 @@ def test_training_window(pd_es): # Case3. include_cutoff_time = False with single cutoff time value feature_matrix = calculate_feature_matrix( [property_feature, dagg], - pd_es, + es, cutoff_time=pd.to_datetime("2011-04-09 10:40:00"), training_window="9 minutes", include_cutoff_time=False, @@ -648,7 +571,7 @@ def test_training_window(pd_es): # Case4. include_cutoff_time = True with single cutoff time value feature_matrix = calculate_feature_matrix( [property_feature, dagg], - pd_es, + es, cutoff_time=pd.to_datetime("2011-04-10 10:40:00"), training_window="2 days", include_cutoff_time=True, @@ -659,11 +582,11 @@ def test_training_window(pd_es): assert (feature_matrix[dagg.get_name()] == dagg_values).values.all() -def test_training_window_overlap(pd_es): - pd_es.add_last_time_indexes() +def test_training_window_overlap(es): + es.add_last_time_indexes() count_log = Feature( - Feature(pd_es["log"].ww["id"]), + Feature(es["log"].ww["id"]), parent_dataframe_name="customers", primitive=Count, ) @@ -678,7 +601,7 @@ def test_training_window_overlap(pd_es): # Case1. include_cutoff_time = True actual = calculate_feature_matrix( features=[count_log], - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, cutoff_time_in_index=True, training_window="10 minutes", @@ -690,7 +613,7 @@ def test_training_window_overlap(pd_es): # Case2. include_cutoff_time = False actual = calculate_feature_matrix( features=[count_log], - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, cutoff_time_in_index=True, training_window="10 minutes", @@ -724,7 +647,7 @@ def test_include_cutoff_time_without_training_window(es): cutoff_time_in_index=True, include_cutoff_time=True, ) - actual = to_pandas(actual)["COUNT(log)"] + actual = actual["COUNT(log)"] np.testing.assert_array_equal(actual.values, [1, 6]) # Case2. include_cutoff_time = False @@ -735,7 +658,7 @@ def test_include_cutoff_time_without_training_window(es): cutoff_time_in_index=True, include_cutoff_time=False, ) - actual = to_pandas(actual)["COUNT(log)"] + actual = actual["COUNT(log)"] np.testing.assert_array_equal(actual.values, [0, 5]) # Case3. include_cutoff_time = True with single cutoff time value @@ -747,7 +670,7 @@ def test_include_cutoff_time_without_training_window(es): cutoff_time_in_index=True, include_cutoff_time=True, ) - actual = to_pandas(actual)["COUNT(log)"] + actual = actual["COUNT(log)"] np.testing.assert_array_equal(actual.values, [6]) # Case4. 
include_cutoff_time = False with single cutoff time value @@ -759,13 +682,13 @@ def test_include_cutoff_time_without_training_window(es): cutoff_time_in_index=True, include_cutoff_time=False, ) - actual = to_pandas(actual)["COUNT(log)"] + actual = actual["COUNT(log)"] np.testing.assert_array_equal(actual.values, [5]) -def test_approximate_dfeat_of_agg_on_target_include_cutoff_time(pd_es): +def test_approximate_dfeat_of_agg_on_target_include_cutoff_time(es): agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count, ) @@ -777,7 +700,7 @@ def test_approximate_dfeat_of_agg_on_target_include_cutoff_time(pd_es): ) feature_matrix = calculate_feature_matrix( [dfeat, agg_feat2, agg_feat], - pd_es, + es, approximate=Timedelta(20, "s"), cutoff_time=cutoff_time, include_cutoff_time=False, @@ -791,7 +714,7 @@ def test_approximate_dfeat_of_agg_on_target_include_cutoff_time(pd_es): feature_matrix = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, approximate=Timedelta(20, "s"), cutoff_time=cutoff_time, include_cutoff_time=True, @@ -804,7 +727,7 @@ def test_approximate_dfeat_of_agg_on_target_include_cutoff_time(pd_es): assert feature_matrix[agg_feat.get_name()].tolist() == [5] -def test_training_window_recent_time_index(pd_es): +def test_training_window_recent_time_index(es): # customer with no sessions row = { "id": [3], @@ -824,7 +747,7 @@ def test_training_window_recent_time_index(pd_es): to_add_df.index = range(3, 4) # have to convert category to int in order to concat - old_df = pd_es["customers"] + old_df = es["customers"] old_df.index = old_df.index.astype("int") old_df["id"] = old_df["id"].astype(int) @@ -834,20 +757,20 @@ def test_training_window_recent_time_index(pd_es): df.index = df.index.astype("category") df["id"] = df["id"].astype("category") - pd_es.replace_dataframe( + es.replace_dataframe( dataframe_name="customers", df=df, recalculate_last_time_indexes=False, ) - pd_es.add_last_time_indexes() + es.add_last_time_indexes() property_feature = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="customers", primitive=Count, ) top_level_agg = Feature( - pd_es["customers"].ww["id"], + es["customers"].ww["id"], parent_dataframe_name="régions", primitive=Count, ) @@ -864,7 +787,7 @@ def test_training_window_recent_time_index(pd_es): # Case1. include_cutoff_time = True feature_matrix = calculate_feature_matrix( [property_feature, dagg], - pd_es, + es, cutoff_time=cutoff_time, training_window="2 hours", include_cutoff_time=True, @@ -879,7 +802,7 @@ def test_training_window_recent_time_index(pd_es): # Case2. 
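The training_window and include_cutoff_time tests above now run only against the pandas `es` fixture, but the semantics they assert are unchanged: `training_window` bounds how far back data is used relative to each cutoff time, and `include_cutoff_time` controls whether rows stamped exactly at the cutoff are counted. A hedged, self-contained sketch of the API being exercised (demo data, cutoff timestamps, and feature choice are illustrative):

    import pandas as pd
    import featuretools as ft
    from featuretools.primitives import Count

    es = ft.demo.load_mock_customer(return_entityset=True)
    es.add_last_time_indexes()  # set last time indexes so training_window is applied correctly

    count_transactions = ft.Feature(
        es["transactions"].ww["transaction_id"],
        parent_dataframe_name="customers",
        primitive=Count,
    )

    cutoff_time = pd.DataFrame(
        {
            "customer_id": [1, 2],
            "time": pd.to_datetime(["2014-01-01 05:00", "2014-01-01 06:00"]),
        }
    )

    fm = ft.calculate_feature_matrix(
        [count_transactions],
        entityset=es,
        cutoff_time=cutoff_time,
        training_window="2 hours",   # only look 2 hours back from each cutoff
        include_cutoff_time=False,   # drop rows stamped exactly at the cutoff
    )
    print(fm)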
include_cutoff_time = False feature_matrix = calculate_feature_matrix( [property_feature, dagg], - pd_es, + es, cutoff_time=cutoff_time, training_window="2 hours", include_cutoff_time=False, @@ -892,21 +815,9 @@ def test_training_window_recent_time_index(pd_es): assert (feature_matrix[dagg.get_name()] == dagg_values).values.all() -# TODO: add test to fail w/ spark -def test_approximate_fails_dask(dask_es): - agg_feat = Feature( - dask_es["log"].ww["id"], - parent_dataframe_name="sessions", - primitive=Count, - ) - error_text = "Using approximate is not supported with Dask dataframes" - with pytest.raises(ValueError, match=error_text): - calculate_feature_matrix([agg_feat], dask_es, approximate=Timedelta(1, "week")) - - -def test_approximate_multiple_instances_per_cutoff_time(pd_es): +def test_approximate_multiple_instances_per_cutoff_time(es): agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count, ) @@ -916,7 +827,7 @@ def test_approximate_multiple_instances_per_cutoff_time(pd_es): cutoff_time = pd.DataFrame({"time": times, "instance_id": [0, 2]}) feature_matrix = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, approximate=Timedelta(1, "week"), cutoff_time=cutoff_time, ) @@ -924,11 +835,11 @@ def test_approximate_multiple_instances_per_cutoff_time(pd_es): assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1] -def test_approximate_with_multiple_paths(pd_diamond_es): - pd_es = pd_diamond_es - path = backward_path(pd_es, ["regions", "customers", "transactions"]) +def test_approximate_with_multiple_paths(diamond_es): + es = diamond_es + path = backward_path(es, ["regions", "customers", "transactions"]) agg_feat = AggregationFeature( - Feature(pd_es["transactions"].ww["id"]), + Feature(es["transactions"].ww["id"]), parent_dataframe_name="regions", relationship_path=path, primitive=Count, @@ -938,16 +849,16 @@ def test_approximate_with_multiple_paths(pd_diamond_es): cutoff_time = pd.DataFrame({"time": times, "instance_id": [0, 2]}) feature_matrix = calculate_feature_matrix( [dfeat], - pd_es, + es, approximate=Timedelta(1, "week"), cutoff_time=cutoff_time, ) assert feature_matrix[dfeat.get_name()].tolist() == [6, 2] -def test_approximate_dfeat_of_agg_on_target(pd_es): +def test_approximate_dfeat_of_agg_on_target(es): agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count, ) @@ -957,7 +868,7 @@ def test_approximate_dfeat_of_agg_on_target(pd_es): cutoff_time = pd.DataFrame({"time": times, "instance_id": [0, 2]}) feature_matrix = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_time, ) @@ -965,8 +876,8 @@ def test_approximate_dfeat_of_agg_on_target(pd_es): assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1] -def test_approximate_dfeat_of_need_all_values(pd_es): - p = Feature(pd_es["log"].ww["value"], primitive=Percentile) +def test_approximate_dfeat_of_need_all_values(es): + p = Feature(es["log"].ww["value"], primitive=Percentile) agg_feat = Feature(p, parent_dataframe_name="sessions", primitive=Sum) agg_feat2 = Feature(agg_feat, parent_dataframe_name="customers", primitive=Sum) dfeat = DirectFeature(agg_feat2, "sessions") @@ -974,12 +885,12 @@ def test_approximate_dfeat_of_need_all_values(pd_es): cutoff_time = pd.DataFrame({"time": times, "instance_id": [0, 2]}) feature_matrix = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, approximate=Timedelta(10, "s"), 
cutoff_time_in_index=True, cutoff_time=cutoff_time, ) - log_df = pd_es["log"] + log_df = es["log"] instances = [0, 2] cutoffs = [pd.Timestamp("2011-04-09 10:31:19"), pd.Timestamp("2011-04-09 11:00:00")] approxes = [ @@ -1015,9 +926,9 @@ def test_approximate_dfeat_of_need_all_values(pd_es): assert test_list == true_vals -def test_uses_full_dataframe_feat_of_approximate(pd_es): +def test_uses_full_dataframe_feat_of_approximate(es): agg_feat = Feature( - pd_es["log"].ww["value"], + es["log"].ww["value"], parent_dataframe_name="sessions", primitive=Sum, ) @@ -1033,7 +944,7 @@ def test_uses_full_dataframe_feat_of_approximate(pd_es): feature_matrix_only_dfeat2 = calculate_feature_matrix( [dfeat2], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time_in_index=True, cutoff_time=cutoff_time, @@ -1042,7 +953,7 @@ def test_uses_full_dataframe_feat_of_approximate(pd_es): feature_matrix_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time_in_index=True, cutoff_time=cutoff_time, @@ -1054,7 +965,7 @@ def test_uses_full_dataframe_feat_of_approximate(pd_es): feature_matrix_small_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], - pd_es, + es, approximate=Timedelta(10, "ms"), cutoff_time_in_index=True, cutoff_time=cutoff_time, @@ -1062,7 +973,7 @@ def test_uses_full_dataframe_feat_of_approximate(pd_es): feature_matrix_no_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], - pd_es, + es, cutoff_time_in_index=True, cutoff_time=cutoff_time, ) @@ -1078,9 +989,9 @@ def test_uses_full_dataframe_feat_of_approximate(pd_es): assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist() -def test_approximate_dfeat_of_dfeat_of_agg_on_target(pd_es): +def test_approximate_dfeat_of_dfeat_of_agg_on_target(es): agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count, ) @@ -1090,22 +1001,22 @@ def test_approximate_dfeat_of_dfeat_of_agg_on_target(pd_es): cutoff_time = pd.DataFrame({"time": times, "instance_id": [0, 2]}) feature_matrix = calculate_feature_matrix( [dfeat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_time, ) assert feature_matrix[dfeat.get_name()].tolist() == [7, 10] -def test_empty_path_approximate_full(pd_es): - pd_es["sessions"].ww["customer_id"] = pd.Series( +def test_empty_path_approximate_full(es): + es["sessions"].ww["customer_id"] = pd.Series( [np.nan, np.nan, np.nan, 1, 1, 2], dtype="category", ) # Need to reassign the `foreign_key` tag as the column reassignment above removes it - pd_es["sessions"].ww.set_types(semantic_tags={"customer_id": "foreign_key"}) + es["sessions"].ww.set_types(semantic_tags={"customer_id": "foreign_key"}) agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count, ) @@ -1115,7 +1026,7 @@ def test_empty_path_approximate_full(pd_es): cutoff_time = pd.DataFrame({"time": times, "instance_id": [0, 2]}) feature_matrix = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_time, ) @@ -1126,8 +1037,8 @@ def test_empty_path_approximate_full(pd_es): assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1] -def test_approx_base_feature_is_also_first_class_feature(pd_es): - log_to_products = DirectFeature(Feature(pd_es["products"].ww["rating"]), "log") +def test_approx_base_feature_is_also_first_class_feature(es): + log_to_products = 
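`approximate` was previously a pandas-only code path (the removed `test_approximate_fails_dask` asserted it raised for Dask); with the distributed backends gone, the tests above simply run against the plain `es` fixture. The feature itself is unchanged: direct features of aggregations are computed once per approximation window instead of once per cutoff time. A hedged sketch of the call, with window size, timestamps, and features chosen for illustration:

    import pandas as pd
    import featuretools as ft
    from featuretools.primitives import Count

    es = ft.demo.load_mock_customer(return_entityset=True)

    # An aggregation on the parent, pulled back down to sessions as a direct feature.
    session_count = ft.Feature(
        es["sessions"].ww["session_id"],
        parent_dataframe_name="customers",
        primitive=Count,
    )
    dfeat = ft.DirectFeature(session_count, "sessions")

    cutoff_time = pd.DataFrame(
        {
            "session_id": [1, 2, 3],
            "time": pd.to_datetime(["2014-01-01 02:00", "2014-01-01 02:05", "2014-01-01 03:00"]),
        }
    )

    # Cutoffs that land in the same 1-hour bucket share one approximated value.
    fm = ft.calculate_feature_matrix(
        [dfeat],
        entityset=es,
        cutoff_time=cutoff_time,
        approximate="1 hour",
    )
    print(fm)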
DirectFeature(Feature(es["products"].ww["rating"]), "log") # This should still be computed properly agg_feat = Feature(log_to_products, parent_dataframe_name="sessions", primitive=Min) customer_agg_feat = Feature( @@ -1141,7 +1052,7 @@ def test_approx_base_feature_is_also_first_class_feature(pd_es): cutoff_time = pd.DataFrame({"time": times, "instance_id": [0, 2]}) feature_matrix = calculate_feature_matrix( [sess_to_cust, agg_feat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_time, ) @@ -1152,9 +1063,9 @@ def test_approx_base_feature_is_also_first_class_feature(pd_es): assert vals2 == [4, 1.5] -def test_approximate_time_split_returns_the_same_result(pd_es): +def test_approximate_time_split_returns_the_same_result(es): agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count, ) @@ -1173,7 +1084,7 @@ def test_approximate_time_split_returns_the_same_result(pd_es): feature_matrix_at_once = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_df, ) @@ -1186,7 +1097,7 @@ def test_approximate_time_split_returns_the_same_result(pd_es): for ct in separate_cutoff: fm = calculate_feature_matrix( [dfeat, agg_feat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=ct, ) @@ -1200,9 +1111,9 @@ def test_approximate_time_split_returns_the_same_result(pd_es): assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2) -def test_approximate_returns_correct_empty_default_values(pd_es): +def test_approximate_returns_correct_empty_default_values(es): agg_feat = Feature( - pd_es["log"].ww["id"], + es["log"].ww["id"], parent_dataframe_name="customers", primitive=Count, ) @@ -1220,22 +1131,22 @@ def test_approximate_returns_correct_empty_default_values(pd_es): fm = calculate_feature_matrix( [dfeat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_df, ) assert fm[dfeat.get_name()].tolist() == [0, 10] -def test_approximate_child_aggs_handled_correctly(pd_es): +def test_approximate_child_aggs_handled_correctly(es): agg_feat = Feature( - pd_es["customers"].ww["id"], + es["customers"].ww["id"], parent_dataframe_name="régions", primitive=Count, ) dfeat = DirectFeature(agg_feat, "customers") agg_feat_2 = Feature( - pd_es["log"].ww["value"], + es["log"].ww["value"], parent_dataframe_name="customers", primitive=Sum, ) @@ -1251,13 +1162,13 @@ def test_approximate_child_aggs_handled_correctly(pd_es): fm = calculate_feature_matrix( [dfeat], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_df, ) fm_2 = calculate_feature_matrix( [dfeat, agg_feat_2], - pd_es, + es, approximate=Timedelta(10, "s"), cutoff_time=cutoff_df, ) @@ -1286,9 +1197,7 @@ def test_cutoff_time_naming(es): cutoff_df_wrong_time_name = cutoff_df.rename(columns={"time": "cutoff_time"}) fm1 = calculate_feature_matrix([dfeat], es, cutoff_time=cutoff_df) - fm1 = to_pandas(fm1, index="id", sort_index=True) fm2 = calculate_feature_matrix([dfeat], es, cutoff_time=cutoff_df_index_name) - fm2 = to_pandas(fm2, index="id", sort_index=True) assert all((fm1 == fm2.values).values) error_text = ( @@ -1306,10 +1215,7 @@ def test_cutoff_time_naming(es): calculate_feature_matrix([dfeat], es, cutoff_time=cutoff_df_wrong_time_name) -# TODO: order doesn't match, but output matches def test_cutoff_time_extra_columns(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Distributed result not ordered") agg_feat = Feature( es["customers"].ww["id"], parent_dataframe_name="régions", @@ -1336,9 +1242,9 
@@ def test_cutoff_time_extra_columns(es): assert (fm["label"].values == cutoff_df["label"].values).all() -def test_cutoff_time_extra_columns_approximate(pd_es): +def test_cutoff_time_extra_columns_approximate(es): agg_feat = Feature( - pd_es["customers"].ww["id"], + es["customers"].ww["id"], parent_dataframe_name="régions", primitive=Count, ) @@ -1358,7 +1264,7 @@ def test_cutoff_time_extra_columns_approximate(pd_es): ) fm = calculate_feature_matrix( [dfeat], - pd_es, + es, cutoff_time=cutoff_df, approximate="2 days", ) @@ -1369,8 +1275,6 @@ def test_cutoff_time_extra_columns_approximate(pd_es): def test_cutoff_time_extra_columns_same_name(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Distributed result not ordered") agg_feat = Feature( es["customers"].ww["id"], parent_dataframe_name="régions", @@ -1398,9 +1302,9 @@ def test_cutoff_time_extra_columns_same_name(es): ).all() -def test_cutoff_time_extra_columns_same_name_approximate(pd_es): +def test_cutoff_time_extra_columns_same_name_approximate(es): agg_feat = Feature( - pd_es["customers"].ww["id"], + es["customers"].ww["id"], parent_dataframe_name="régions", primitive=Count, ) @@ -1420,7 +1324,7 @@ def test_cutoff_time_extra_columns_same_name_approximate(pd_es): ) fm = calculate_feature_matrix( [dfeat], - pd_es, + es, cutoff_time=cutoff_df, approximate="2 days", ) @@ -1444,7 +1348,6 @@ def test_instances_after_cutoff_time_removed(es): cutoff_time=cutoff_time, cutoff_time_in_index=True, ) - fm = to_pandas(fm, index="id", sort_index=True) actual_ids = ( [id for (id, _) in fm.index] if isinstance(fm.index, pd.MultiIndex) @@ -1455,10 +1358,7 @@ def test_instances_after_cutoff_time_removed(es): assert set(actual_ids) == set([2, 0]) -# TODO: Dask and Spark do not keep instance_id after cutoff def test_instances_with_id_kept_after_cutoff(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Distributed result not ordered, missing extra instances") property_feature = Feature( es["log"].ww["id"], parent_dataframe_name="customers", @@ -1483,13 +1383,7 @@ def test_instances_with_id_kept_after_cutoff(es): assert set(actual_ids) == set([0, 1, 2]) -# TODO: Fails with Dask -# TODO: Fails with Spark def test_cfm_returns_original_time_indexes(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Distributed result not ordered, indexes are lost due to not multiindexing", - ) agg_feat = Feature( es["customers"].ww["id"], parent_dataframe_name="régions", @@ -1521,15 +1415,15 @@ def test_cfm_returns_original_time_indexes(es): assert (time_level_vals == cutoff_df["time"].values).all() -def test_cfm_returns_original_time_indexes_approximate(pd_es): +def test_cfm_returns_original_time_indexes_approximate(es): agg_feat = Feature( - pd_es["customers"].ww["id"], + es["customers"].ww["id"], parent_dataframe_name="régions", primitive=Count, ) dfeat = DirectFeature(agg_feat, "customers") agg_feat_2 = Feature( - pd_es["sessions"].ww["id"], + es["sessions"].ww["id"], parent_dataframe_name="customers", primitive=Count, ) @@ -1546,7 +1440,7 @@ def test_cfm_returns_original_time_indexes_approximate(pd_es): # approximate, in different windows, no unapproximated aggs fm = calculate_feature_matrix( [dfeat], - pd_es, + es, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="1 m", @@ -1559,7 +1453,7 @@ def test_cfm_returns_original_time_indexes_approximate(pd_es): # approximate, in different windows, unapproximated aggs fm = calculate_feature_matrix( [dfeat, agg_feat_2], - pd_es, + es, cutoff_time=cutoff_df, 
cutoff_time_in_index=True, approximate="1 m", @@ -1572,7 +1466,7 @@ def test_cfm_returns_original_time_indexes_approximate(pd_es): # approximate, in same window, no unapproximated aggs fm2 = calculate_feature_matrix( [dfeat], - pd_es, + es, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="2 d", @@ -1585,7 +1479,7 @@ def test_cfm_returns_original_time_indexes_approximate(pd_es): # approximate, in same window, unapproximated aggs fm3 = calculate_feature_matrix( [dfeat, agg_feat_2], - pd_es, + es, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="2 d", @@ -1596,7 +1490,7 @@ def test_cfm_returns_original_time_indexes_approximate(pd_es): assert (time_level_vals == cutoff_df["time"].values).all() -def test_dask_kwargs(pd_es, dask_cluster): +def test_dask_kwargs(es, dask_cluster): times = ( [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] @@ -1607,12 +1501,12 @@ def test_dask_kwargs(pd_es, dask_cluster): ) labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2 cutoff_time = pd.DataFrame({"time": times, "instance_id": range(17)}) - property_feature = IdentityFeature(pd_es["log"].ww["value"]) > 10 + property_feature = IdentityFeature(es["log"].ww["value"]) > 10 dkwargs = {"cluster": dask_cluster.scheduler.address} feature_matrix = calculate_feature_matrix( [property_feature], - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, verbose=True, chunk_size=0.13, @@ -1623,7 +1517,7 @@ def test_dask_kwargs(pd_es, dask_cluster): assert (feature_matrix[property_feature.get_name()] == labels).values.all() -def test_dask_persisted_es(pd_es, capsys, dask_cluster): +def test_dask_persisted_es(es, capsys, dask_cluster): times = ( [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] @@ -1634,12 +1528,12 @@ def test_dask_persisted_es(pd_es, capsys, dask_cluster): ) labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2 cutoff_time = pd.DataFrame({"time": times, "instance_id": range(17)}) - property_feature = IdentityFeature(pd_es["log"].ww["value"]) > 10 + property_feature = IdentityFeature(es["log"].ww["value"]) > 10 dkwargs = {"cluster": dask_cluster.scheduler.address} feature_matrix = calculate_feature_matrix( [property_feature], - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, verbose=True, chunk_size=0.13, @@ -1649,7 +1543,7 @@ def test_dask_persisted_es(pd_es, capsys, dask_cluster): assert (feature_matrix[property_feature.get_name()] == labels).values.all() feature_matrix = calculate_feature_matrix( [property_feature], - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, verbose=True, chunk_size=0.13, @@ -1731,8 +1625,7 @@ def test_not_enough_memory(self, monkeypatch): ) -@pytest.mark.skipif("not dd") -def test_parallel_failure_raises_correct_error(pd_es): +def test_parallel_failure_raises_correct_error(es): times = ( [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] @@ -1742,13 +1635,13 @@ def test_parallel_failure_raises_correct_error(pd_es): + [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)] ) cutoff_time = pd.DataFrame({"time": times, "instance_id": range(17)}) - property_feature = IdentityFeature(pd_es["log"].ww["value"]) > 10 + property_feature = IdentityFeature(es["log"].ww["value"]) > 10 error_text = "Need at least one worker" with pytest.raises(AssertionError, match=error_text): calculate_feature_matrix( 
[property_feature], - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, verbose=True, chunk_size=0.13, @@ -1758,16 +1651,16 @@ def test_parallel_failure_raises_correct_error(pd_es): def test_warning_not_enough_chunks( - pd_es, + es, capsys, three_worker_dask_cluster, ): # pragma: no cover - property_feature = IdentityFeature(pd_es["log"].ww["value"]) > 10 + property_feature = IdentityFeature(es["log"].ww["value"]) > 10 dkwargs = {"cluster": three_worker_dask_cluster.scheduler.address} calculate_feature_matrix( [property_feature], - entityset=pd_es, + entityset=es, chunk_size=0.5, verbose=True, dask_kwargs=dkwargs, @@ -1796,7 +1689,7 @@ def test_n_jobs(): n_jobs_to_workers(0) -def test_parallel_cutoff_time_column_pass_through(pd_es, dask_cluster): +def test_parallel_cutoff_time_column_pass_through(es, dask_cluster): times = ( [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] @@ -1809,12 +1702,12 @@ def test_parallel_cutoff_time_column_pass_through(pd_es, dask_cluster): cutoff_time = pd.DataFrame( {"time": times, "instance_id": range(17), "labels": labels}, ) - property_feature = IdentityFeature(pd_es["log"].ww["value"]) > 10 + property_feature = IdentityFeature(es["log"].ww["value"]) > 10 dkwargs = {"cluster": dask_cluster.scheduler.address} feature_matrix = calculate_feature_matrix( [property_feature], - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, verbose=True, dask_kwargs=dkwargs, @@ -1827,8 +1720,6 @@ def test_parallel_cutoff_time_column_pass_through(pd_es, dask_cluster): def test_integer_time_index(int_es): - if int_es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask and Spark do not retain time column") times = list(range(8, 18)) + list(range(19, 26)) labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2 cutoff_df = pd.DataFrame({"time": times, "instance_id": range(17)}) @@ -1848,8 +1739,6 @@ def test_integer_time_index(int_es): def test_integer_time_index_single_cutoff_value(int_es): - if int_es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask and Spark do not retain time column") labels = [False] * 3 + [True] * 2 + [False] * 4 property_feature = IdentityFeature(int_es["log"].ww["value"]) > 10 @@ -1899,7 +1788,6 @@ def test_integer_time_index_passes_extra_columns(int_es): cutoff_time=cutoff_df, cutoff_time_in_index=True, ) - fm = to_pandas(fm) assert (fm[property_feature.get_name()] == fm["labels"]).all() @@ -1967,13 +1855,7 @@ def test_datetime_index_mixed_cutoff(es): calculate_feature_matrix([property_feature], es, cutoff_time=cutoff_df) -# TODO: Dask version fails (feature matrix is empty) -# TODO: Spark version fails (spark groupby agg doesn't support custom functions) def test_no_data_for_cutoff_time(mock_customer): - if mock_customer.dataframe_type != Library.PANDAS: - pytest.xfail( - "Dask fails because returned feature matrix is empty; Spark doesn't support custom agg functions", - ) es = mock_customer cutoff_times = pd.DataFrame( {"customer_id": [4], "time": pd.Timestamp("2011-04-08 20:08:13")}, @@ -2015,27 +1897,26 @@ def test_no_data_for_cutoff_time(mock_customer): ) -# adding missing instances not supported in Dask or Spark -def test_instances_not_in_data(pd_es): - last_instance = max(pd_es["log"].index.values) +def test_instances_not_in_data(es): + last_instance = max(es["log"].index.values) instances = list(range(last_instance + 1, last_instance + 11)) - identity_feature = IdentityFeature(pd_es["log"].ww["value"]) + identity_feature = 
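Note the distinction these tests preserve: Dask EntitySets (Dask DataFrames inside an EntitySet) are removed, but distributed computation of a pandas feature matrix via `dask_kwargs` and `n_jobs` is still exercised, now against the plain `es` fixture. A hedged sketch of that usage; the scheduler address is a placeholder:

    import featuretools as ft

    es = ft.demo.load_mock_customer(return_entityset=True)
    features = ft.dfs(entityset=es, target_dataframe_name="customers", features_only=True)

    # Option 1: let featuretools start a local Dask cluster with 2 workers.
    fm = ft.calculate_feature_matrix(features, entityset=es, n_jobs=2)

    # Option 2: point at an existing scheduler (placeholder address).
    fm = ft.calculate_feature_matrix(
        features,
        entityset=es,
        dask_kwargs={"cluster": "tcp://127.0.0.1:8786"},
        chunk_size=0.1,  # chunking as in the tests above
    )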
IdentityFeature(es["log"].ww["value"]) property_feature = identity_feature > 10 agg_feat = AggregationFeature( - Feature(pd_es["log"].ww["value"]), + Feature(es["log"].ww["value"]), parent_dataframe_name="sessions", primitive=Max, ) direct_feature = DirectFeature(agg_feat, "log") features = [identity_feature, property_feature, direct_feature] - fm = calculate_feature_matrix(features, entityset=pd_es, instance_ids=instances) + fm = calculate_feature_matrix(features, entityset=es, instance_ids=instances) assert all(fm.index.values == instances) for column in fm.columns: assert fm[column].isnull().all() fm = calculate_feature_matrix( features, - entityset=pd_es, + entityset=es, instance_ids=instances, approximate="730 days", ) @@ -2044,23 +1925,23 @@ def test_instances_not_in_data(pd_es): assert fm[column].isnull().all() -def test_some_instances_not_in_data(pd_es): +def test_some_instances_not_in_data(es): a_time = datetime(2011, 4, 10, 10, 41, 9) # only valid data b_time = datetime(2011, 4, 10, 11, 10, 5) # some missing data c_time = datetime(2011, 4, 10, 12, 0, 0) # all missing data times = [a_time, b_time, a_time, a_time, b_time, b_time] + [c_time] * 4 cutoff_time = pd.DataFrame({"instance_id": list(range(12, 22)), "time": times}) - identity_feature = IdentityFeature(pd_es["log"].ww["value"]) + identity_feature = IdentityFeature(es["log"].ww["value"]) property_feature = identity_feature > 10 agg_feat = AggregationFeature( - Feature(pd_es["log"].ww["value"]), + Feature(es["log"].ww["value"]), parent_dataframe_name="sessions", primitive=Max, ) direct_feature = DirectFeature(agg_feat, "log") features = [identity_feature, property_feature, direct_feature] - fm = calculate_feature_matrix(features, entityset=pd_es, cutoff_time=cutoff_time) + fm = calculate_feature_matrix(features, entityset=es, cutoff_time=cutoff_time) ifeat_answer = pd.Series([0, 7, 14, np.nan] + [np.nan] * 6) prop_answer = pd.Series([0, 0, 1, pd.NA, 0] + [pd.NA] * 5, dtype="boolean") dfeat_answer = pd.Series([14, 14, 14, np.nan] + [np.nan] * 6) @@ -2071,7 +1952,7 @@ def test_some_instances_not_in_data(pd_es): fm = calculate_feature_matrix( features, - entityset=pd_es, + entityset=es, cutoff_time=cutoff_time, approximate="5 seconds", ) @@ -2085,16 +1966,16 @@ def test_some_instances_not_in_data(pd_es): pd.testing.assert_series_equal(fm[x], y, check_index=False, check_names=False) -def test_missing_instances_with_categorical_index(pd_es): +def test_missing_instances_with_categorical_index(es): instance_ids = ["coke zero", "car", 3, "taco clock"] features = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="products", features_only=True, ) fm = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=instance_ids, ) @@ -2214,7 +2095,7 @@ def __call__(self, update, progress_percent, time_elapsed): assert np.isclose(mock_progress_callback.total_progress_percent, 100.0) -def test_calls_progress_callback_cluster(pd_mock_customer, dask_cluster): +def test_calls_progress_callback_cluster(mock_customer, dask_cluster): class MockProgressCallback: def __init__(self): self.progress_history = [] @@ -2229,12 +2110,12 @@ def __call__(self, update, progress_percent, time_elapsed): mock_progress_callback = MockProgressCallback() trans_per_session = Feature( - pd_mock_customer["transactions"].ww["transaction_id"], + mock_customer["transactions"].ww["transaction_id"], parent_dataframe_name="sessions", primitive=Count, ) trans_per_customer = Feature( - 
pd_mock_customer["transactions"].ww["transaction_id"], + mock_customer["transactions"].ww["transaction_id"], parent_dataframe_name="customers", primitive=Count, ) @@ -2243,7 +2124,7 @@ def __call__(self, update, progress_percent, time_elapsed): dkwargs = {"cluster": dask_cluster.scheduler.address} calculate_feature_matrix( features, - entityset=pd_mock_customer, + entityset=mock_customer, progress_callback=mock_progress_callback, dask_kwargs=dkwargs, ) @@ -2259,7 +2140,6 @@ class ErrorPrim(TransformPrimitive): name = "error_prim" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = "Numeric" - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def get_function(self): def error(s): @@ -2281,9 +2161,9 @@ def error(s): assert len(tqdm._instances) == 0 -def test_approximate_with_single_cutoff_warns(pd_es): +def test_approximate_with_single_cutoff_warns(es): features = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", features_only=True, ignore_dataframes=["cohorts"], @@ -2298,18 +2178,18 @@ def test_approximate_with_single_cutoff_warns(pd_es): with pytest.warns(UserWarning, match=match): calculate_feature_matrix( features, - pd_es, + es, cutoff_time=pd.to_datetime("2020-01-01"), approximate="1 day", ) # test warning with no cutoff time with pytest.warns(UserWarning, match=match): - calculate_feature_matrix(features, pd_es, approximate="1 day") + calculate_feature_matrix(features, es, approximate="1 day") # check proper handling of approximate feature_matrix = calculate_feature_matrix( features, - pd_es, + es, cutoff_time=pd.to_datetime("2011-04-09 10:31:30"), approximate="1 minute", ) @@ -2343,7 +2223,6 @@ def test_calc_feature_matrix_with_cutoff_df_and_instance_ids(es): verbose=True, ) - feature_matrix = to_pandas(feature_matrix) assert (feature_matrix[property_feature.get_name()] == labels).values.all() @@ -2360,7 +2239,6 @@ def test_calculate_feature_matrix_returns_default_values(default_value_es): entityset=default_value_es, ) - feature_matrix = to_pandas(feature_matrix, index="id", sort_index=True) expected_values = [2.0, 2.0, 1.0, 0.0] assert (feature_matrix[sessions_sum.get_name()] == expected_values).values.all() @@ -2379,8 +2257,6 @@ def test_dataframes_relationships(dataframes, relationships): relationships=relationships, ) - fm_1 = to_pandas(fm_1, index="id", sort_index=True) - fm_2 = to_pandas(fm_2, index="id", sort_index=True) assert fm_1.equals(fm_2) @@ -2410,8 +2286,6 @@ def test_no_relationships(dataframes): relationships=None, ) - fm_1 = to_pandas(fm_1, index="id") - fm_2 = to_pandas(fm_2, index="id") assert fm_1.equals(fm_2) @@ -2448,7 +2322,7 @@ def test_cfm_introduces_nan_values_in_direct_feats(es): assert isinstance(fm.ww.logical_types["loves_ice_cream"], BooleanNullable) -def test_feature_origins_present_on_all_fm_cols(pd_es): +def test_feature_origins_present_on_all_fm_cols(es): class MultiCumSum(TransformPrimitive): name = "multi_cum_sum" input_types = [ColumnSchema(semantic_tags={"numeric"})] @@ -2462,7 +2336,7 @@ def multi_cum_sum(x): return multi_cum_sum feature_matrix, _ = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="log", trans_primitives=[MultiCumSum], ) @@ -2472,7 +2346,7 @@ def multi_cum_sum(x): assert origin in ["base", "engineered"] -def test_renamed_features_have_expected_column_names_in_feature_matrix(pd_es): +def test_renamed_features_have_expected_column_names_in_feature_matrix(es): class MultiCumulative(TransformPrimitive): name = "multi_cum_sum" input_types = 
[ColumnSchema(semantic_tags={"numeric"})] @@ -2486,16 +2360,16 @@ def multi_cum_sum(x): return multi_cum_sum multi_output_trans_feat = Feature( - pd_es["log"].ww["value"], + es["log"].ww["value"], primitive=MultiCumulative, ) groupby_trans_feat = GroupByTransformFeature( - pd_es["log"].ww["value"], + es["log"].ww["value"], primitive=MultiCumulative, - groupby=pd_es["log"].ww["product_id"], + groupby=es["log"].ww["product_id"], ) multi_output_agg_feat = Feature( - pd_es["log"].ww["product_id"], + es["log"].ww["product_id"], parent_dataframe_name="customers", primitive=NMostCommon(n=2), ) @@ -2515,7 +2389,7 @@ def multi_cum_sum(x): stacked_feat, groupby_trans_feat, ] - feature_matrix = calculate_feature_matrix(entityset=pd_es, features=features) + feature_matrix = calculate_feature_matrix(entityset=es, features=features) expected_names = multi_output_trans_names + agg_names + groupby_trans_feat_names for renamed_col in expected_names: assert renamed_col in feature_matrix.columns diff --git a/featuretools/tests/computational_backend/test_dask_features.py b/featuretools/tests/computational_backend/test_dask_features.py deleted file mode 100644 index 13a1d3c797..0000000000 --- a/featuretools/tests/computational_backend/test_dask_features.py +++ /dev/null @@ -1,37 +0,0 @@ -import pandas as pd -import pytest - -from featuretools.tests.testing_utils import make_ecommerce_entityset - - -def test_tokenize_entityset(pd_es, pd_int_es): - pytest.importorskip("dask", reason="Dask not installed, skipping") - from dask.base import tokenize - - dupe = make_ecommerce_entityset() - - # check identitcal entitysets hash to same token - assert tokenize(pd_es) == tokenize(dupe) - - # not same if product relationship is missing - productless = make_ecommerce_entityset() - productless.relationships.pop() - assert tokenize(pd_es) != tokenize(productless) - - # not same if integer entityset - assert tokenize(pd_es) != tokenize(pd_int_es) - - # add row to cohorts - cohorts_df = dupe["cohorts"] - new_row = pd.DataFrame( - data={ - "cohort": [2], - "cohort_name": None, - "cohort_end": [pd.Timestamp("2011-04-08 12:00:00")], - }, - columns=["cohort", "cohort_name", "cohort_end"], - index=[2], - ) - more_cohorts = pd.concat([cohorts_df, new_row]) - dupe.replace_dataframe(dataframe_name="cohorts", df=more_cohorts) - assert tokenize(pd_es) == tokenize(dupe) diff --git a/featuretools/tests/computational_backend/test_feature_set.py b/featuretools/tests/computational_backend/test_feature_set.py index f27d8d4920..03fc2e16e0 100644 --- a/featuretools/tests/computational_backend/test_feature_set.py +++ b/featuretools/tests/computational_backend/test_feature_set.py @@ -79,11 +79,11 @@ def test_feature_trie_without_needs_full_dataframe(diamond_es): def test_feature_trie_with_needs_full_dataframe(diamond_es): - pd_es = diamond_es - amount = IdentityFeature(pd_es["transactions"].ww["amount"]) + es = diamond_es + amount = IdentityFeature(es["transactions"].ww["amount"]) path_through_customers = backward_path( - pd_es, + es, ["regions", "customers", "transactions"], ) agg = AggregationFeature( @@ -94,7 +94,7 @@ def test_feature_trie_with_needs_full_dataframe(diamond_es): ) trans_of_agg = TransformFeature(agg, primitives.CumSum) - path_through_stores = backward_path(pd_es, ["regions", "stores", "transactions"]) + path_through_stores = backward_path(es, ["regions", "stores", "transactions"]) trans = TransformFeature(amount, primitives.CumSum) agg_of_trans = AggregationFeature( trans, diff --git 
a/featuretools/tests/computational_backend/test_feature_set_calculator.py b/featuretools/tests/computational_backend/test_feature_set_calculator.py index 68684d203b..99afe56ffc 100644 --- a/featuretools/tests/computational_backend/test_feature_set_calculator.py +++ b/featuretools/tests/computational_backend/test_feature_set_calculator.py @@ -41,11 +41,8 @@ ) from featuretools.primitives.base import AggregationPrimitive from featuretools.primitives.standard.aggregation.num_unique import NumUnique -from featuretools.tests.testing_utils import backward_path, to_pandas +from featuretools.tests.testing_utils import backward_path from featuretools.utils import Trie -from featuretools.utils.gen_utils import Library, import_or_none, is_instance - -dd = import_or_none("dask.dataframe") def test_make_identity(es): @@ -53,7 +50,7 @@ def test_make_identity(es): feature_set = FeatureSet([f]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[f.get_name()][0] assert v == datetime(2011, 4, 9, 10, 30, 0) @@ -67,7 +64,7 @@ def test_make_dfeat(es): feature_set = FeatureSet([f]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[f.get_name()][0] assert v == 33 @@ -82,45 +79,28 @@ def test_make_agg_feat_of_identity_column(es): feature_set = FeatureSet([agg_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[agg_feat.get_name()][0] assert v == 50 -# full_dataframe not supported with Dask -def test_full_dataframe_trans_of_agg(pd_es): +def test_full_dataframe_trans_of_agg(es): agg_feat = Feature( - pd_es["log"].ww["value"], + es["log"].ww["value"], parent_dataframe_name="customers", primitive=Sum, ) trans_feat = Feature(agg_feat, primitive=CumSum) feature_set = FeatureSet([trans_feat]) - calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set) + calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([1])) v = df[trans_feat.get_name()].values[0] assert v == 82 -def test_full_dataframe_error_dask(dask_es): - agg_feat = Feature( - dask_es["log"].ww["value"], - parent_dataframe_name="customers", - primitive=Sum, - ) - trans_feat = Feature(agg_feat, primitive=CumSum) - - feature_set = FeatureSet([trans_feat]) - calculator = FeatureSetCalculator(dask_es, time_last=None, feature_set=feature_set) - error_text = "Cannot use primitives that require full dataframe with Dask" - - with pytest.raises(ValueError, match=error_text): - calculator.run(np.array([1])) - - def test_make_agg_feat_of_identity_index_column(es): agg_feat = Feature( es["log"].ww["id"], @@ -130,7 +110,7 @@ def test_make_agg_feat_of_identity_index_column(es): feature_set = FeatureSet([agg_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[agg_feat.get_name()][0] assert v == 5 @@ -146,7 +126,7 @@ def test_make_agg_feat_where_count(es): feature_set = FeatureSet([agg_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[agg_feat.get_name()][0] assert v == 3 @@ -166,7 +146,7 @@ 
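With the Dask-specific guard for full-dataframe primitives gone, stacking a transform such as CumSum on top of an aggregation is just the ordinary pandas path. A rough public-API sketch of the same kind of feature the test builds, using the demo data for illustration:

import featuretools as ft
from featuretools.primitives import CumSum, Sum

es = ft.demo.load_mock_customer(return_entityset=True)

total_spent = ft.Feature(
    es["transactions"].ww["amount"],
    parent_dataframe_name="customers",
    primitive=Sum,
)
# CumSum needs the full customers dataframe at once, which is fine now that
# the calculation is always pandas-backed
running_total = ft.Feature(total_spent, primitive=CumSum)

fm = ft.calculate_feature_matrix([running_total], entityset=es)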
def test_make_agg_feat_using_prev_time(es): time_last=datetime(2011, 4, 9, 10, 30, 10), feature_set=feature_set, ) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[agg_feat.get_name()][0] assert v == 2 @@ -176,15 +156,13 @@ def test_make_agg_feat_using_prev_time(es): time_last=datetime(2011, 4, 9, 10, 30, 30), feature_set=feature_set, ) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[agg_feat.get_name()][0] assert v == 1 def test_make_agg_feat_using_prev_n_events(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Distrubuted entitysets do not support use_previous") agg_feat_1 = Feature( es["log"].ww["value"], parent_dataframe_name="sessions", @@ -231,10 +209,6 @@ def test_make_agg_feat_using_prev_n_events(es): def test_make_agg_feat_multiple_dtypes(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Currently no Dask or Spark compatible agg prims that use multiple dtypes", - ) compare_prod = IdentityFeature(es["log"].ww["product_id"]) == "coke zero" agg_feat = Feature( @@ -289,7 +263,6 @@ def test_make_agg_feat_where_different_identity_feat(es): features=feats, instance_ids=[0, 1, 2, 3], ) - df = to_pandas(df, index="id", sort_index=True) for i, where_cmp in enumerate(where_cmps): name = feats[i].get_name() @@ -337,7 +310,6 @@ def test_make_agg_feat_of_grandchild_dataframe(es): feature_set = FeatureSet([agg_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") v = df[agg_feat.get_name()].values[0] assert v == 10 @@ -364,7 +336,6 @@ def test_make_agg_feat_where_count_feat(es): feature_set = FeatureSet([feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0, 1])) - df = to_pandas(df, index="id", sort_index=True) name = feat.get_name() instances = df[name] @@ -398,7 +369,6 @@ def test_make_compare_feat(es): feature_set = FeatureSet([feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0, 1, 2])) - df = to_pandas(df, index="id", sort_index=True) name = feat.get_name() instances = df[name] @@ -433,7 +403,6 @@ def test_make_agg_feat_where_count_and_device_type_feat(es): feature_set = FeatureSet([feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") name = feat.get_name() instances = df[name] @@ -465,7 +434,6 @@ def test_make_agg_feat_where_count_or_device_type_feat(es): feature_set = FeatureSet([feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id", int_index=True) name = feat.get_name() instances = df[name] @@ -488,13 +456,12 @@ def test_make_agg_feat_of_agg_feat(es): feature_set = FeatureSet([customer_sum_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") v = df[customer_sum_feat.get_name()].values[0] assert v == 10 @pytest.fixture -def pd_df(): +def df(): return pd.DataFrame( { "id": ["a", "b", "c", "d", "e"], @@ -506,23 +473,6 @@ def pd_df(): ) -@pytest.fixture -def dd_df(pd_df): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_df, npartitions=2) - - -@pytest.fixture -def spark_df(pd_df): - ps = 
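The use_previous tests above now run unconditionally. As a loose sketch of the call shape (demo data; the window, cutoff time, and keyword usage through ft.Feature are assumptions for illustration, not taken from the tests):

import pandas as pd
import featuretools as ft
from featuretools.primitives import Count

es = ft.demo.load_mock_customer(return_entityset=True)

# count only the transactions in the 600 seconds before each cutoff time
recent_transactions = ft.Feature(
    es["transactions"].ww["transaction_id"],
    parent_dataframe_name="sessions",
    primitive=Count,
    use_previous=ft.Timedelta(600, "s"),
)

fm = ft.calculate_feature_matrix(
    [recent_transactions],
    entityset=es,
    cutoff_time=pd.Timestamp("2014-01-01 05:00:00"),
)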
pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_df) - - -@pytest.fixture(params=["pd_df", "dd_df", "spark_df"]) -def df(request): - return request.getfixturevalue(request.param) - - def test_make_3_stacked_agg_feats(df): """ Tests stacking 3 agg features. @@ -531,8 +481,6 @@ def test_make_3_stacked_agg_feats(df): as dataframes are merged together """ - if is_instance(df, dd, "DataFrame"): - pytest.xfail("normalize_datdataframe fails with dask DataFrame") es = EntitySet() ltypes = {"e1": Categorical, "e2": Categorical, "e3": Categorical, "val": Double} es.add_dataframe( @@ -599,7 +547,6 @@ def test_make_dfeat_of_agg_feat_on_self(es): feature_set = FeatureSet([num_customers_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") v = df[num_customers_feat.get_name()].values[0] assert v == 3 @@ -629,7 +576,6 @@ def test_make_dfeat_of_agg_feat_through_parent(es): feature_set = FeatureSet([num_stores_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") v = df[num_stores_feat.get_name()].values[0] assert v == 3 @@ -664,7 +610,6 @@ def test_make_deep_agg_feat_of_dfeat_of_agg_feat(es): feature_set = FeatureSet([purchase_popularity]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") v = df[purchase_popularity.get_name()].values[0] assert v == 38.0 / 10.0 @@ -689,22 +634,20 @@ def test_deep_agg_feat_chain(es): feature_set = FeatureSet([region_avg_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array(["United States"])) - df = to_pandas(df, index="id") v = df[region_avg_feat.get_name()][0] assert v == 17 / 3.0 -# NMostCommon not supported with Dask or Spark -def test_topn(pd_es): +def test_topn(es): topn = Feature( - pd_es["log"].ww["product_id"], + es["log"].ww["product_id"], parent_dataframe_name="customers", primitive=NMostCommon(n=2), ) feature_set = FeatureSet([topn]) - calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set) + calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0, 1, 2])) true_results = pd.DataFrame( [ @@ -726,16 +669,15 @@ def test_topn(pd_es): assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2) -# Trend not supported with Dask or Spark -def test_trend(pd_es): +def test_trend(es): trend = Feature( - [Feature(pd_es["log"].ww["value"]), Feature(pd_es["log"].ww["datetime"])], + [Feature(es["log"].ww["value"]), Feature(es["log"].ww["datetime"])], parent_dataframe_name="customers", primitive=Trend, ) feature_set = FeatureSet([trend]) - calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set) + calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0, 1, 2])) true_results = [-0.812730, 4.870378, np.nan] @@ -752,7 +694,7 @@ def test_direct_squared(es): squared = feature * feature feature_set = FeatureSet([feature, squared]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) - df = to_pandas(calculator.run(np.array([0, 1, 2]))) + df = calculator.run(np.array([0, 1, 2])) for i, row in df.iterrows(): assert (row[0] * row[0]) == row[1] @@ -771,7 +713,7 @@ def test_agg_empty_child(es): 
time_last=datetime(2011, 4, 8), feature_set=feature_set, ) - df = to_pandas(calculator.run(np.array([0])), index="id") + df = calculator.run(np.array([0])) assert df["COUNT(log)"].iloc[0] == 0 @@ -802,7 +744,6 @@ def test_diamond_entityset(diamond_es): feature_set=feature_set, ) df = calculator.run(np.array([0, 1, 2])) - df = to_pandas(df, index="id", sort_index=True) assert (df["SUM(stores.transactions.amount)"] == [94, 261, 128]).all() assert (df["SUM(customers.transactions.amount)"] == [72, 411, 0]).all() @@ -835,14 +776,13 @@ def test_two_relationships_to_single_dataframe(games_es): feature_set=feature_set, ) df = calculator.run(np.array(range(3))) - df = to_pandas(df, index="id", sort_index=True) assert (df[home_team_mean.get_name()] == [1.5, 1.5, 2.5]).all() assert (df[away_team_mean.get_name()] == [1, 0.5, 2]).all() @pytest.fixture -def pd_parent_child(): +def parent_child(): parent_df = pd.DataFrame({"id": [1]}) child_df = pd.DataFrame( { @@ -856,29 +796,6 @@ def pd_parent_child(): return (parent_df, child_df) -@pytest.fixture -def dd_parent_child(pd_parent_child): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - parent_df, child_df = pd_parent_child - parent_df = dd.from_pandas(parent_df, npartitions=2) - child_df = dd.from_pandas(child_df, npartitions=2) - return (parent_df, child_df) - - -@pytest.fixture -def spark_parent_child(pd_parent_child): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - parent_df, child_df = pd_parent_child - parent_df = ps.from_pandas(parent_df) - child_df = ps.from_pandas(child_df) - return (parent_df, child_df) - - -@pytest.fixture(params=["pd_parent_child", "dd_parent_child", "spark_parent_child"]) -def parent_child(request): - return request.getfixturevalue(request.param) - - def test_empty_child_dataframe(parent_child): parent_df, child_df = parent_child child_ltypes = { @@ -941,31 +858,24 @@ def test_empty_child_dataframe(parent_child): primitive=NMostCommon, ) - if isinstance(parent_df, pd.DataFrame): - features = [ - count, - count_where, - trend, - trend_where, - n_most_common, - n_most_common_where, - ] - data = { - count.get_name(): pd.Series([0], dtype="Int64"), - count_where.get_name(): pd.Series([0], dtype="Int64"), - trend.get_name(): pd.Series([np.nan], dtype="float"), - trend_where.get_name(): pd.Series([np.nan], dtype="float"), - } - for name in n_most_common.get_feature_names(): - data[name] = pd.Series([np.nan], dtype="category") - for name in n_most_common_where.get_feature_names(): - data[name] = pd.Series([np.nan], dtype="category") - else: - features = [count, count_where] - data = { - count.get_name(): pd.Series([0], dtype="Int64"), - count_where.get_name(): pd.Series([0], dtype="Int64"), - } + features = [ + count, + count_where, + trend, + trend_where, + n_most_common, + n_most_common_where, + ] + data = { + count.get_name(): pd.Series([0], dtype="Int64"), + count_where.get_name(): pd.Series([0], dtype="Int64"), + trend.get_name(): pd.Series([np.nan], dtype="float"), + trend_where.get_name(): pd.Series([np.nan], dtype="float"), + } + for name in n_most_common.get_feature_names(): + data[name] = pd.Series([np.nan], dtype="category") + for name in n_most_common_where.get_feature_names(): + data[name] = pd.Series([np.nan], dtype="category") answer = pd.DataFrame(data) @@ -975,7 +885,6 @@ def test_empty_child_dataframe(parent_child): features=features, cutoff_time=pd.Timestamp("12/31/2017"), ) - fm = to_pandas(fm) for column in data.keys(): 
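The parent_child fixture above is now a plain pair of pandas frames. For orientation, turning such a pair into an EntitySet uses the same two calls seen throughout the suite; the frames and column names below are made up for illustration and are not the fixture's actual contents:

import pandas as pd
import featuretools as ft

parent_df = pd.DataFrame({"id": [1]})
child_df = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "parent_id": [1, 1, 1],
        "time_index": pd.date_range(start="2018-01-01", periods=3),
        "value": [10, 5, 2],
    },
)

es = ft.EntitySet(id="parent_child")
es.add_dataframe(dataframe_name="parent", dataframe=parent_df, index="id")
es.add_dataframe(
    dataframe_name="child",
    dataframe=child_df,
    index="id",
    time_index="time_index",
)
# argument order: parent dataframe, parent column, child dataframe, child column
es.add_relationship("parent", "id", "child", "parent_id")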
pd.testing.assert_series_equal( @@ -986,26 +895,14 @@ def test_empty_child_dataframe(parent_child): ) # cutoff time after all rows, but where clause filters all rows - if isinstance(parent_df, pd.DataFrame): - features = [count_where, trend_where, n_most_common_where] - data = { - count_where.get_name(): pd.Series([0], dtype="Int64"), - trend_where.get_name(): pd.Series([np.nan], dtype="float"), - } - for name in n_most_common_where.get_feature_names(): - data[name] = pd.Series([np.nan], dtype="category") - else: - features = [count_where] - data = {count_where.get_name(): pd.Series([0], dtype="Int64")} + data = { + count_where.get_name(): pd.Series([0], dtype="Int64"), + trend_where.get_name(): pd.Series([np.nan], dtype="float"), + } + for name in n_most_common_where.get_feature_names(): + data[name] = pd.Series([np.nan], dtype="category") answer = pd.DataFrame(data) - fm2 = calculate_feature_matrix( - entityset=es, - features=features, - cutoff_time=pd.Timestamp("1/4/2018"), - ) - fm2 = to_pandas(fm2) - for column in data.keys(): pd.testing.assert_series_equal( fm[column], @@ -1027,18 +924,11 @@ def test_with_features_built_from_es_metadata(es): feature_set = FeatureSet([agg_feat]) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") v = df[agg_feat.get_name()].values[0] assert v == 10 -# TODO: Fails with Dask and Spark (conflicting aggregation primitives) def test_handles_primitive_function_name_uniqueness(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Fails with Dask and Spark due conflicting aggregation primitive names", - ) - class SumTimesN(AggregationPrimitive): name = "sum_times_n" input_types = [ColumnSchema(semantic_tags={"numeric"})] @@ -1047,7 +937,7 @@ class SumTimesN(AggregationPrimitive): def __init__(self, n): self.n = n - def get_function(self, agg_type="pandas"): + def get_function(self): def my_function(values): return values.sum() * self.n @@ -1111,7 +1001,7 @@ class Sum1(AggregationPrimitive): stack_on_exclude = [Count] default_value = 0 - def get_function(self, agg_type="pandas"): + def get_function(self): return np.sum class Sum2(AggregationPrimitive): @@ -1124,7 +1014,7 @@ class Sum2(AggregationPrimitive): stack_on_exclude = [Count] default_value = 0 - def get_function(self, agg_type="pandas"): + def get_function(self): return np.sum class Sum3(AggregationPrimitive): @@ -1137,7 +1027,7 @@ class Sum3(AggregationPrimitive): stack_on_exclude = [Count] default_value = 0 - def get_function(self, agg_type="pandas"): + def get_function(self): return np.sum f5 = Feature( @@ -1161,13 +1051,12 @@ def get_function(self, agg_type="pandas"): assert all(fm[f7.get_name()].sort_index() == value_sum) -# No order guarantees w/ Dask -def test_returns_order_of_instance_ids(pd_es): - feature_set = FeatureSet([Feature(pd_es["customers"].ww["age"])]) - calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set) +def test_returns_order_of_instance_ids(es): + feature_set = FeatureSet([Feature(es["customers"].ww["age"])]) + calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) instance_ids = [0, 1, 2] - assert list(pd_es["customers"]["id"]) != instance_ids + assert list(es["customers"]["id"]) != instance_ids df = calculator.run(np.array(instance_ids)) @@ -1196,18 +1085,15 @@ def test_calls_progress_callback(es): groupby=Feature(es["customers"].ww["cohort"]), ) - if es.dataframe_type != Library.PANDAS: - all_features = [identity, 
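The get_function signatures above lose their agg_type parameter because only one execution path remains. A custom aggregation primitive now looks like this (SumTimesTwo is a hypothetical example, not one of the suite's primitives):

from woodwork.column_schema import ColumnSchema
from featuretools.primitives import AggregationPrimitive

class SumTimesTwo(AggregationPrimitive):
    """Sum a numeric column and double it (illustrative example)."""

    name = "sum_times_two"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})

    # formerly get_function(self, agg_type="pandas"); the argument is gone
    # now that pandas is the only backend
    def get_function(self):
        def sum_times_two(values):
            return values.sum() * 2

        return sum_times_two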
direct, agg, trans] - else: - all_features = [ - identity, - direct, - agg, - agg_apply, - trans, - trans_full, - groupby_trans, - ] + all_features = [ + identity, + direct, + agg, + agg_apply, + trans, + trans_full, + groupby_trans, + ] feature_set = FeatureSet(all_features) calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) @@ -1241,7 +1127,7 @@ def __call__(self, update): # precalculated_features is only used with approximate -def test_precalculated_features(pd_es): +def test_precalculated_features(es): error_msg = ( "This primitive should never be used because the features are precalculated" ) @@ -1253,13 +1139,13 @@ class ErrorPrim(AggregationPrimitive): input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - def get_function(self, agg_type="pandas"): + def get_function(self): def error(s): raise RuntimeError(error_msg) return error - value = Feature(pd_es["log"].ww["value"]) + value = Feature(es["log"].ww["value"]) agg = Feature(value, parent_dataframe_name="sessions", primitive=ErrorPrim) agg2 = Feature(agg, parent_dataframe_name="customers", primitive=ErrorPrim) direct = Feature(agg2, dataframe_name="sessions") @@ -1281,7 +1167,7 @@ def error(s): precalculated_fm_trie.get_node(direct.relationship_path).value = parent_fm calculator = FeatureSetCalculator( - pd_es, + es, feature_set=feature_set, precalculated_features=precalculated_fm_trie, ) @@ -1293,15 +1179,15 @@ def error(s): # Calculating without precalculated features should error. with pytest.raises(RuntimeError, match=error_msg): - FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids) + FeatureSetCalculator(es, feature_set=FeatureSet([direct])).run(instance_ids) -def test_nunique_nested_with_agg_bug(pd_es): +def test_nunique_nested_with_agg_bug(es): """Pandas 2.2.0 has a bug where pd.Series.nunique produces columns with the category dtype instead of int64 dtype, causing an error when we attempt another aggregation""" num_unique_feature = AggregationFeature( - Feature(pd_es["log"].ww["priority_level"]), + Feature(es["log"].ww["priority_level"]), "sessions", primitive=NumUnique, ) @@ -1312,8 +1198,7 @@ def test_nunique_nested_with_agg_bug(pd_es): primitive=Mean, ) feature_set = FeatureSet([mean_nunique_feature]) - calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set) + calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set) df = calculator.run(np.array([0])) - df = to_pandas(df, index="id") assert df.iloc[0, 0].round(4) == 1.6667 diff --git a/featuretools/tests/computational_backend/test_utils.py b/featuretools/tests/computational_backend/test_utils.py index fdf6755f04..03d5d65363 100644 --- a/featuretools/tests/computational_backend/test_utils.py +++ b/featuretools/tests/computational_backend/test_utils.py @@ -3,7 +3,6 @@ from featuretools import dfs from featuretools.computational_backends import replace_inf_values from featuretools.primitives import DivideByFeature, DivideNumericScalar -from featuretools.tests.testing_utils import to_pandas def test_replace_inf_values(divide_by_zero_es): @@ -22,14 +21,12 @@ def test_replace_inf_values(divide_by_zero_es): trans_primitives=[primitive], max_depth=1, ) - assert np.inf in to_pandas(fm).values or -np.inf in to_pandas(fm).values + assert np.inf in fm.values or -np.inf in fm.values replaced_fm = replace_inf_values(fm) - replaced_fm = to_pandas(replaced_fm) assert np.inf not in replaced_fm.values assert -np.inf not in 
replaced_fm.values custom_value_fm = replace_inf_values(fm, replacement_value="custom_val") - custom_value_fm = to_pandas(custom_value_fm) assert np.inf not in custom_value_fm.values assert -np.inf not in replaced_fm.values assert "custom_val" in custom_value_fm.values @@ -44,8 +41,7 @@ def test_replace_inf_values_specify_cols(divide_by_zero_es): max_depth=1, ) - assert np.inf in to_pandas(fm["col1 / 0"]).values + assert np.inf in fm["col1 / 0"].values replaced_fm = replace_inf_values(fm, columns=["col1 / 0"]) - replaced_fm = to_pandas(replaced_fm) assert np.inf not in replaced_fm["col1 / 0"].values assert np.inf in replaced_fm["col2 / 0"].values diff --git a/featuretools/tests/conftest.py b/featuretools/tests/conftest.py index a40e1af7e2..b1d14b55c2 100644 --- a/featuretools/tests/conftest.py +++ b/featuretools/tests/conftest.py @@ -8,13 +8,10 @@ import pytest from packaging.version import parse from woodwork.column_schema import ColumnSchema -from woodwork.logical_types import Boolean, Integer from featuretools import EntitySet, demo from featuretools.primitives import AggregationPrimitive, TransformPrimitive -from featuretools.tests.testing_utils import make_ecommerce_entityset, to_pandas -from featuretools.utils.gen_utils import import_or_none -from featuretools.utils.spark_utils import pd_to_spark_clean +from featuretools.tests.testing_utils import make_ecommerce_entityset @pytest.fixture() @@ -39,24 +36,6 @@ def three_worker_dask_cluster(): yield cluster -@pytest.fixture(scope="session", autouse=True) -def spark_session(): - sql = import_or_none("pyspark.sql") - if sql: - spark = ( - sql.SparkSession.builder.master("local[2]") - .config( - "spark.driver.extraJavaOptions", - "-Dio.netty.tryReflectionSetAccessible=True", - ) - .config("spark.sql.shuffle.partitions", "2") - .config("spark.driver.bindAddress", "127.0.0.1") - .getOrCreate() - ) - - return spark - - @pytest.fixture(scope="session") def make_es(): return make_ecommerce_entityset() @@ -68,139 +47,23 @@ def make_int_es(): @pytest.fixture -def pd_es(make_es): +def es(make_es): return copy.deepcopy(make_es) @pytest.fixture -def pd_int_es(make_int_es): +def int_es(make_int_es): return copy.deepcopy(make_int_es) @pytest.fixture -def dask_int_es(pd_int_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - es = EntitySet(id=pd_int_es.id) - for df in pd_int_es.dataframes: - dd_df = dd.from_pandas(df.reset_index(drop=True), npartitions=4) - dd_df.ww.init(schema=df.ww.schema) - es.add_dataframe(dd_df) - - for rel in pd_int_es.relationships: - es.add_relationship( - rel.parent_dataframe.ww.name, - rel._parent_column_name, - rel.child_dataframe.ww.name, - rel._child_column_name, - ) - return es - - -@pytest.fixture -def spark_int_es(pd_int_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - es = EntitySet(id=pd_int_es.id) - for df in pd_int_es.dataframes: - cleaned_df = pd_to_spark_clean(df).reset_index(drop=True) - spark_df = ps.from_pandas(cleaned_df) - spark_df.ww.init(schema=df.ww.schema) - es.add_dataframe(spark_df) - - for rel in pd_int_es.relationships: - es.add_relationship( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - return es - - -@pytest.fixture(params=["pd_int_es", "dask_int_es", "spark_int_es"]) -def int_es(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def dask_es(pd_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not 
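The replace_inf_values tests above drop only the to_pandas conversions; the helper itself is unchanged. A minimal sketch of its call shape (a hand-built frame stands in for a dfs output here, and the NaN default replacement is an assumption):

import numpy as np
import pandas as pd

from featuretools.computational_backends import replace_inf_values

fm = pd.DataFrame({"col1 / 0": [np.inf, 1.0], "col2 / 0": [2.0, -np.inf]})

no_inf = replace_inf_values(fm.copy())                       # +/-inf replaced (NaN assumed as default)
zeroed = replace_inf_values(fm.copy(), replacement_value=0)  # choose the replacement value
col1_only = replace_inf_values(fm.copy(), columns=["col1 / 0"])  # limit to the named columns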
installed, skipping") - es = EntitySet(id=pd_es.id) - for df in pd_es.dataframes: - dd_df = dd.from_pandas(df.reset_index(drop=True), npartitions=4) - dd_df.ww.init(schema=df.ww.schema) - es.add_dataframe(dd_df) - - for rel in pd_es.relationships: - es.add_relationship( - rel.parent_dataframe.ww.name, - rel._parent_column_name, - rel.child_dataframe.ww.name, - rel._child_column_name, - ) - return es - - -@pytest.fixture -def spark_es(pd_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - es = EntitySet(id=pd_es.id) - for df in pd_es.dataframes: - cleaned_df = pd_to_spark_clean(df).reset_index(drop=True) - spark_df = ps.from_pandas(cleaned_df) - spark_df.ww.init(schema=df.ww.schema) - es.add_dataframe(spark_df) - - for rel in pd_es.relationships: - es.add_relationship( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - return es - - -@pytest.fixture(params=["pd_es", "dask_es", "spark_es"]) -def es(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def pd_latlong_df(): +def latlong_df(): df = pd.DataFrame({"idx": [0, 1, 2], "latLong": [pd.NA, (1, 2), (pd.NA, pd.NA)]}) return df @pytest.fixture -def dask_latlong_df(pd_latlong_df): - dask = pytest.importorskip("dask", reason="Dask not installed, skipping") - dask.config.set({"dataframe.convert-string": False}) - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_latlong_df.reset_index(drop=True), npartitions=4) - - -@pytest.fixture -def spark_latlong_df(pd_latlong_df): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - cleaned_df = pd_to_spark_clean(pd_latlong_df) - - pdf = ps.from_pandas(cleaned_df) - - return pdf - - -@pytest.fixture(params=["pd_latlong_df", "dask_latlong_df", "spark_latlong_df"]) -def latlong_df(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture(params=["pd_diamond_es", "dask_diamond_es", "spark_diamond_es"]) -def diamond_es(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def pd_diamond_es(): +def diamond_es(): countries_df = pd.DataFrame({"id": range(2), "name": ["US", "Canada"]}) regions_df = pd.DataFrame( { @@ -254,66 +117,7 @@ def pd_diamond_es(): @pytest.fixture -def dask_diamond_es(pd_diamond_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - dataframes = {} - for df in pd_diamond_es.dataframes: - dd_df = dd.from_pandas(df, npartitions=2) - dd_df.ww.init(schema=df.ww.schema) - dataframes[df.ww.name] = (dd_df,) - - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_diamond_es.relationships - ] - - return EntitySet( - id=pd_diamond_es.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture -def spark_diamond_es(pd_diamond_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - dataframes = {} - for df in pd_diamond_es.dataframes: - spark_df = ps.from_pandas(pd_to_spark_clean(df)) - spark_df.ww.init(schema=df.ww.schema) - dataframes[df.ww.name] = (spark_df,) - - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_diamond_es.relationships - ] - - return EntitySet( - id=pd_diamond_es.id, - dataframes=dataframes, - 
relationships=relationships, - ) - - -@pytest.fixture( - params=["pd_default_value_es", "dask_default_value_es", "spark_default_value_es"], -) -def default_value_es(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def pd_default_value_es(): +def default_value_es(): transactions = pd.DataFrame( {"id": [1, 2, 3, 4], "session_id": ["a", "a", "b", "c"], "value": [1, 1, 1, 1]}, ) @@ -329,66 +133,7 @@ def pd_default_value_es(): @pytest.fixture -def dask_default_value_es(pd_default_value_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - dataframes = {} - for df in pd_default_value_es.dataframes: - dd_df = dd.from_pandas(df, npartitions=4) - dd_df.ww.init(schema=df.ww.schema) - dataframes[df.ww.name] = (dd_df,) - - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_default_value_es.relationships - ] - - return EntitySet( - id=pd_default_value_es.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture -def spark_default_value_es(pd_default_value_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - dataframes = {} - for df in pd_default_value_es.dataframes: - spark_df = ps.from_pandas(pd_to_spark_clean(df)) - spark_df.ww.init(schema=df.ww.schema) - dataframes[df.ww.name] = (spark_df,) - - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_default_value_es.relationships - ] - - return EntitySet( - id=pd_default_value_es.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture( - params=["pd_home_games_es", "dask_home_games_es", "spark_home_games_es"], -) -def home_games_es(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def pd_home_games_es(): +def home_games_es(): teams = pd.DataFrame({"id": range(3), "name": ["Breakers", "Spirit", "Thorns"]}) games = pd.DataFrame( { @@ -404,133 +149,16 @@ def pd_home_games_es(): return EntitySet(dataframes=dataframes, relationships=relationships) -@pytest.fixture -def dask_home_games_es(pd_home_games_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - dataframes = {} - for df in pd_home_games_es.dataframes: - dd_df = dd.from_pandas(df, npartitions=2) - dd_df.ww.init(schema=df.ww.schema) - dataframes[df.ww.name] = (dd_df,) - - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_home_games_es.relationships - ] - - return EntitySet( - id=pd_home_games_es.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture -def spark_home_games_es(pd_home_games_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - dataframes = {} - for df in pd_home_games_es.dataframes: - spark_df = ps.from_pandas(pd_to_spark_clean(df)) - spark_df.ww.init(schema=df.ww.schema) - dataframes[df.ww.name] = (spark_df,) - - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_home_games_es.relationships - ] - - return EntitySet( - id=pd_home_games_es.id, - dataframes=dataframes, - relationships=relationships, - ) - - @pytest.fixture def games_es(home_games_es): return 
home_games_es.add_relationship("teams", "id", "games", "away_team_id") @pytest.fixture -def pd_mock_customer(): +def mock_customer(): return demo.load_mock_customer(return_entityset=True, random_seed=0) -@pytest.fixture -def dd_mock_customer(pd_mock_customer): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - dataframes = {} - for df in pd_mock_customer.dataframes: - dd_df = dd.from_pandas(df.reset_index(drop=True), npartitions=4) - dd_df.ww.init(schema=df.ww.schema) - dataframes[df.ww.name] = ( - dd_df, - df.ww.index, - df.ww.time_index, - df.ww.logical_types, - ) - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_mock_customer.relationships - ] - - return EntitySet( - id=pd_mock_customer.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture -def spark_mock_customer(pd_mock_customer): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - dataframes = {} - for df in pd_mock_customer.dataframes: - cleaned_df = pd_to_spark_clean(df).reset_index(drop=True) - dataframes[df.ww.name] = ( - ps.from_pandas(cleaned_df), - df.ww.index, - df.ww.time_index, - df.ww.logical_types, - ) - - relationships = [ - ( - rel._parent_dataframe_name, - rel._parent_column_name, - rel._child_dataframe_name, - rel._child_column_name, - ) - for rel in pd_mock_customer.relationships - ] - - return EntitySet( - id=pd_mock_customer.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture(params=["pd_mock_customer", "dd_mock_customer", "spark_mock_customer"]) -def mock_customer(request): - return request.getfixturevalue(request.param) - - @pytest.fixture def lt(es): def label_func(df): @@ -549,19 +177,13 @@ def label_func(df): lm = cp.LabelMaker(**kwargs) df = es["log"] - df = to_pandas(df) labels = lm.search(df, num_examples_per_instance=-1) labels = labels.rename(columns={"cutoff_time": "time"}) return labels -@pytest.fixture(params=["pd_dataframes", "dask_dataframes", "spark_dataframes"]) -def dataframes(request): - return request.getfixturevalue(request.param) - - @pytest.fixture -def pd_dataframes(): +def dataframes(): cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]}) transactions_df = pd.DataFrame( { @@ -578,85 +200,13 @@ def pd_dataframes(): return dataframes -@pytest.fixture -def dask_dataframes(): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]}) - transactions_df = pd.DataFrame( - { - "id": [1, 2, 3, 4, 5, 6], - "card_id": [1, 2, 1, 3, 4, 5], - "transaction_time": [10, 12, 13, 20, 21, 20], - "fraud": [True, False, False, False, True, True], - }, - ) - cards_df = dd.from_pandas(cards_df, npartitions=2) - transactions_df = dd.from_pandas(transactions_df, npartitions=2) - - cards_ltypes = {"id": Integer} - transactions_ltypes = { - "id": Integer, - "card_id": Integer, - "transaction_time": Integer, - "fraud": Boolean, - } - - dataframes = { - "cards": (cards_df, "id", None, cards_ltypes), - "transactions": ( - transactions_df, - "id", - "transaction_time", - transactions_ltypes, - ), - } - return dataframes - - -@pytest.fixture -def spark_dataframes(): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - cards_df = ps.DataFrame({"id": [1, 2, 3, 4, 5]}) - transactions_df = ps.DataFrame( - { - "id": [1, 2, 3, 4, 5, 6], - "card_id": [1, 2, 1, 3, 4, 5], - 
"transaction_time": [10, 12, 13, 20, 21, 20], - "fraud": [True, False, False, False, True, True], - }, - ) - cards_ltypes = {"id": Integer} - transactions_ltypes = { - "id": Integer, - "card_id": Integer, - "transaction_time": Integer, - "fraud": Boolean, - } - - dataframes = { - "cards": (cards_df, "id", None, cards_ltypes), - "transactions": ( - transactions_df, - "id", - "transaction_time", - transactions_ltypes, - ), - } - return dataframes - - @pytest.fixture def relationships(): return [("cards", "id", "transactions", "card_id")] -@pytest.fixture(params=["pd_transform_es", "dask_transform_es", "spark_transform_es"]) -def transform_es(request): - return request.getfixturevalue(request.param) - - @pytest.fixture -def pd_transform_es(): +def transform_es(): # Create dataframe df = pd.DataFrame( { @@ -680,46 +230,7 @@ def pd_transform_es(): @pytest.fixture -def dask_transform_es(pd_transform_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - es = EntitySet(id=pd_transform_es.id) - for df in pd_transform_es.dataframes: - es.add_dataframe( - dataframe_name=df.ww.name, - dataframe=dd.from_pandas(df, npartitions=2), - index=df.ww.index, - logical_types=df.ww.logical_types, - ) - return es - - -@pytest.fixture -def spark_transform_es(pd_transform_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - es = EntitySet(id=pd_transform_es.id) - for df in pd_transform_es.dataframes: - es.add_dataframe( - dataframe_name=df.ww.name, - dataframe=ps.from_pandas(df), - index=df.ww.index, - logical_types=df.ww.logical_types, - ) - return es - - -@pytest.fixture( - params=[ - "divide_by_zero_es_pd", - "divide_by_zero_es_dask", - "divide_by_zero_es_spark", - ], -) -def divide_by_zero_es(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def divide_by_zero_es_pd(): +def divide_by_zero_es(): df = pd.DataFrame( { "id": [0, 1, 2, 3], @@ -731,35 +242,7 @@ def divide_by_zero_es_pd(): @pytest.fixture -def divide_by_zero_es_dask(divide_by_zero_es_pd): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - es = EntitySet(id=divide_by_zero_es_pd.id) - for df in divide_by_zero_es_pd.dataframes: - es.add_dataframe( - dataframe_name=df.ww.name, - dataframe=dd.from_pandas(df, npartitions=2), - index=df.ww.index, - logical_types=df.ww.logical_types, - ) - return es - - -@pytest.fixture -def divide_by_zero_es_spark(divide_by_zero_es_pd): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - es = EntitySet(id=divide_by_zero_es_pd.id) - for df in divide_by_zero_es_pd.dataframes: - es.add_dataframe( - dataframe_name=df.ww.name, - dataframe=ps.from_pandas(df), - index=df.ww.index, - logical_types=df.ww.logical_types, - ) - return es - - -@pytest.fixture -def window_series_pd(): +def window_series(): return pd.Series( range(20), index=pd.date_range(start="2020-01-01", end="2020-01-20"), @@ -767,12 +250,12 @@ def window_series_pd(): @pytest.fixture -def window_date_range_pd(): +def window_date_range(): return pd.date_range(start="2022-11-1", end="2022-11-5", periods=30) @pytest.fixture -def rolling_outlier_series_pd(): +def rolling_outlier_series(): return pd.Series( [0] * 4 + [10] + [0] * 4 + [10] + [0] * 5, index=pd.date_range(start="2020-01-01", end="2020-01-15", periods=15), @@ -780,7 +263,7 @@ def rolling_outlier_series_pd(): @pytest.fixture -def postal_code_dataframe_pd(): +def postal_code_dataframe(): df = pd.DataFrame( { "string_dtype": 
pd.Series(["90210", "60018", "10010", "92304-4201"]), @@ -788,35 +271,6 @@ def postal_code_dataframe_pd(): "has_nulls": pd.Series([np.nan, 20000, 30000]).astype("category"), }, ) - return df - - -@pytest.fixture -def postal_code_dataframe_pyspark(postal_code_dataframe_pd): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - df = ps.from_pandas(postal_code_dataframe_pd) - return df - - -@pytest.fixture -def postal_code_dataframe_dask(postal_code_dataframe_pd): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - df = dd.from_pandas( - postal_code_dataframe_pd, - npartitions=1, - ).categorize() - return df - - -@pytest.fixture( - params=[ - "postal_code_dataframe_pd", - "postal_code_dataframe_pyspark", - "postal_code_dataframe_dask", - ], -) -def postal_code_dataframe(request): - df = request.getfixturevalue(request.param) df.ww.init( logical_types={ "string_dtype": "PostalCode", diff --git a/featuretools/tests/entityset_tests/test_dask_es.py b/featuretools/tests/entityset_tests/test_dask_es.py deleted file mode 100644 index 8e3388fc69..0000000000 --- a/featuretools/tests/entityset_tests/test_dask_es.py +++ /dev/null @@ -1,213 +0,0 @@ -import pandas as pd -import pytest -from woodwork.logical_types import ( - Categorical, - Datetime, - Double, - Integer, - NaturalLanguage, -) - -from featuretools.entityset import EntitySet -from featuretools.tests.testing_utils import get_df_tags -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") - - -@pytest.mark.skipif("not dd") -def test_add_dataframe(pd_es): - dask_es = EntitySet(id="dask_es") - log_dask = dd.from_pandas(pd_es["log"], npartitions=2) - dask_es = dask_es.add_dataframe( - dataframe_name="log_dask", - dataframe=log_dask, - index="id", - time_index="datetime", - logical_types=pd_es["log"].ww.logical_types, - semantic_tags=get_df_tags(pd_es["log"]), - ) - pd.testing.assert_frame_equal( - pd_es["log"], - dask_es["log_dask"].compute(), - check_like=True, - ) - - -@pytest.mark.skipif("not dd") -def test_add_dataframe_with_non_numeric_index(pd_es, dask_es): - df = pd.DataFrame({"id": ["A_1", "A_2", "C", "D"], "values": [1, 12, -34, 27]}) - dask_df = dd.from_pandas(df, npartitions=2) - - pd_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=df, - index="id", - logical_types={"id": Categorical, "values": Integer}, - ) - - dask_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=dask_df, - index="id", - logical_types={"id": Categorical, "values": Integer}, - ) - - pd.testing.assert_frame_equal( - pd_es["new_dataframe"].reset_index(drop=True), - dask_es["new_dataframe"].compute(), - ) - - -@pytest.mark.skipif("not dd") -def test_create_entityset_with_mixed_dataframe_types(pd_es, dask_es): - df = pd.DataFrame({"id": [0, 1, 2, 3], "values": [1, 12, -34, 27]}) - dask_df = dd.from_pandas(df, npartitions=2) - - err_msg = ( - "All dataframes must be of the same type. 
" - "Cannot add dataframe of type {} to an entityset with existing dataframes " - "of type {}" - ) - - # Test error is raised when trying to add Dask dataframe to entityset with existing pandas dataframes - with pytest.raises( - ValueError, - match=err_msg.format(type(dask_df), type(pd_es.dataframes[0])), - ): - pd_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=dask_df, - index="id", - ) - - # Test error is raised when trying to add pandas dataframe to entityset with existing dask dataframes - with pytest.raises( - ValueError, - match=err_msg.format(type(df), type(dask_es.dataframes[0])), - ): - dask_es.add_dataframe(dataframe_name="new_dataframe", dataframe=df, index="id") - - -@pytest.mark.skipif("not dd") -def test_add_last_time_indexes(): - pd_es = EntitySet(id="pd_es") - dask_es = EntitySet(id="dask_es") - - sessions = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "user": [1, 2, 1, 3], - "time": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - sessions_dask = dd.from_pandas(sessions, npartitions=2) - sessions_logical_types = { - "id": Integer, - "user": Integer, - "time": Datetime, - "strings": NaturalLanguage, - } - - transactions = pd.DataFrame( - { - "id": [0, 1, 2, 3, 4, 5], - "session_id": [0, 0, 1, 2, 2, 3], - "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13], - "time": [ - pd.to_datetime("2019-01-10 03:53"), - pd.to_datetime("2019-01-10 04:12"), - pd.to_datetime("2019-02-03 10:34"), - pd.to_datetime("2019-01-01 12:35"), - pd.to_datetime("2019-01-01 12:49"), - pd.to_datetime("2017-08-25 04:53"), - ], - }, - ) - transactions_dask = dd.from_pandas(transactions, npartitions=2) - - transactions_logical_types = { - "id": Integer, - "session_id": Integer, - "time": Datetime, - "amount": Double, - } - - pd_es.add_dataframe( - dataframe_name="sessions", - dataframe=sessions, - index="id", - time_index="time", - ) - dask_es.add_dataframe( - dataframe_name="sessions", - dataframe=sessions_dask, - index="id", - time_index="time", - logical_types=sessions_logical_types, - ) - - pd_es.add_dataframe( - dataframe_name="transactions", - dataframe=transactions, - index="id", - time_index="time", - ) - dask_es.add_dataframe( - dataframe_name="transactions", - dataframe=transactions_dask, - index="id", - time_index="time", - logical_types=transactions_logical_types, - ) - - pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id") - dask_es = dask_es.add_relationship("sessions", "id", "transactions", "session_id") - - assert "foreign_key" in pd_es["transactions"].ww.semantic_tags["session_id"] - assert "foreign_key" in dask_es["transactions"].ww.semantic_tags["session_id"] - - assert pd_es["sessions"].ww.metadata.get("last_time_index") is None - assert dask_es["sessions"].ww.metadata.get("last_time_index") is None - - pd_es.add_last_time_indexes() - dask_es.add_last_time_indexes() - - pd_lti_name = pd_es["sessions"].ww.metadata.get("last_time_index") - spark_lti_name = dask_es["sessions"].ww.metadata.get("last_time_index") - assert pd_lti_name == spark_lti_name - pd.testing.assert_series_equal( - pd_es["sessions"][pd_lti_name].sort_index(), - dask_es["sessions"][spark_lti_name].compute().sort_index(), - check_names=False, - ) - - -@pytest.mark.skipif("not dd") -def test_add_dataframe_with_make_index(): - values = [1, 12, -23, 27] - df = pd.DataFrame({"values": values}) - dask_df = dd.from_pandas(df, 
npartitions=2) - dask_es = EntitySet(id="dask_es") - logical_types = {"values": Integer} - dask_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=dask_df, - make_index=True, - index="new_index", - logical_types=logical_types, - ) - - expected_df = pd.DataFrame({"values": values, "new_index": range(len(values))}) - pd.testing.assert_frame_equal(expected_df, dask_es["new_dataframe"].compute()) - - -@pytest.mark.skipif("not dd") -def test_dataframe_type_dask(dask_es): - assert dask_es.dataframe_type == Library.DASK diff --git a/featuretools/tests/entityset_tests/test_es.py b/featuretools/tests/entityset_tests/test_es.py index 9a66266a70..1639c430fc 100644 --- a/featuretools/tests/entityset_tests/test_es.py +++ b/featuretools/tests/entityset_tests/test_es.py @@ -28,12 +28,7 @@ from featuretools.demo import load_retail from featuretools.entityset import EntitySet from featuretools.entityset.entityset import LTI_COLUMN_NAME, WW_SCHEMA_KEY -from featuretools.tests.testing_utils import get_df_tags, to_pandas -from featuretools.utils.gen_utils import Library, import_or_none, is_instance -from featuretools.utils.spark_utils import pd_to_spark_clean - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") +from featuretools.tests.testing_utils import get_df_tags def test_normalize_time_index_as_additional_column(es): @@ -280,10 +275,6 @@ def test_add_relationship_errors_child_v_index(es): def test_add_relationship_empty_child_convert_dtype(es): relationship = Relationship(es, "sessions", "id", "log", "session_id") empty_log_df = pd.DataFrame(columns=es["log"].columns) - if es.dataframe_type == Library.DASK: - empty_log_df = dd.from_pandas(empty_log_df, npartitions=2) - elif es.dataframe_type == Library.SPARK: - empty_log_df = ps.from_pandas(empty_log_df) es.add_dataframe(empty_log_df, "log") @@ -344,7 +335,6 @@ def test_query_by_values_secondary_time_index(es): end = np.datetime64(datetime(2011, 10, 1)) all_instances = [0, 1, 2] result = es.query_by_values("customers", all_instances, time_last=end) - result = to_pandas(result, index="id") for col in ["cancel_date", "cancel_reason"]: nulls = result.loc[all_instances][col].isnull() == [False, True, True] @@ -352,18 +342,18 @@ def test_query_by_values_secondary_time_index(es): def test_query_by_id(es): - df = to_pandas(es.query_by_values("log", instance_vals=[0])) + df = es.query_by_values("log", instance_vals=[0]) assert df["id"].values[0] == 0 def test_query_by_single_value(es): - df = to_pandas(es.query_by_values("log", instance_vals=0)) + df = es.query_by_values("log", instance_vals=0) assert df["id"].values[0] == 0 def test_query_by_df(es): instance_df = pd.DataFrame({"id": [1, 3], "vals": [0, 1]}) - df = to_pandas(es.query_by_values("log", instance_vals=instance_df)) + df = es.query_by_values("log", instance_vals=instance_df) assert np.array_equal(df["id"], [1, 3]) @@ -374,10 +364,6 @@ def test_query_by_id_with_time(es): instance_vals=[0, 1, 2, 3, 4], time_last=datetime(2011, 4, 9, 10, 30, 2 * 6), ) - df = to_pandas(df) - if es.dataframe_type == Library.SPARK: - # Spark doesn't maintain order - df = df.sort_values("id") assert list(df["id"].values) == [0, 1, 2] @@ -389,12 +375,8 @@ def test_query_by_column_with_time(es): column_name="session_id", time_last=datetime(2011, 4, 9, 10, 50, 0), ) - df = to_pandas(df) true_values = [i * 5 for i in range(5)] + [i * 1 for i in range(4)] + [0] - if es.dataframe_type == Library.SPARK: - # Spark doesn't maintain order - df = df.sort_values("id") assert 
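Several of the remaining test_es.py hunks simply drop to_pandas around query_by_values, whose pandas result can now be asserted on directly. For orientation, the call shape looks roughly like this (demo data; the instance value and timestamp are illustrative):

import pandas as pd
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)

# rows of "transactions" whose session_id is 1, restricted to data known
# at the given time
df = es.query_by_values(
    "transactions",
    instance_vals=[1],
    column_name="session_id",
    time_last=pd.Timestamp("2014-01-01 05:00:00"),
)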
list(df["id"].values) == list(range(10)) assert list(df["value"].values) == true_values @@ -412,7 +394,6 @@ def test_query_by_column_with_no_lti_and_training_window(es): time_last=datetime(2011, 4, 11), training_window="3d", ) - df = to_pandas(df) assert list(df["id"].values) == [1] assert list(df["age"].values) == [25] @@ -427,8 +408,7 @@ def test_query_by_column_with_lti_and_training_window(es): time_last=datetime(2011, 4, 11), training_window="3d", ) - # Account for different ordering between pandas and dask/spark - df = to_pandas(df).reset_index(drop=True).sort_values("id") + df = df.reset_index(drop=True).sort_values("id") assert list(df["id"].values) == [0, 1, 2] assert list(df["age"].values) == [33, 25, 56] @@ -439,33 +419,15 @@ def test_query_by_indexed_column(es): instance_vals=["taco clock"], column_name="product_id", ) - # Account for different ordering between pandas and dask/spark - df = to_pandas(df).reset_index(drop=True).sort_values("id") + df = df.reset_index(drop=True).sort_values("id") assert list(df["id"].values) == [15, 16] @pytest.fixture -def pd_df(): +def df(): return pd.DataFrame({"id": [0, 1, 2], "category": ["a", "b", "c"]}) -@pytest.fixture -def dd_df(pd_df): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_df, npartitions=2) - - -@pytest.fixture -def spark_df(pd_df): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_df) - - -@pytest.fixture(params=["pd_df", "dd_df", "spark_df"]) -def df(request): - return request.getfixturevalue(request.param) - - def test_check_columns_and_dataframe(df): # matches logical_types = {"id": Integer, "category": Categorical} @@ -496,11 +458,7 @@ def test_make_index_any_location(df): logical_types=logical_types, dataframe=df, ) - if es.dataframe_type != Library.PANDAS: - assert es.dataframe_dict["test_dataframe"].columns[-1] == "id1" - else: - assert es.dataframe_dict["test_dataframe"].columns[0] == "id1" - + assert es.dataframe_dict["test_dataframe"].columns[0] == "id1" assert es.dataframe_dict["test_dataframe"].ww.index == "id1" @@ -508,11 +466,6 @@ def test_replace_dataframe_and_create_index(es): df = pd.DataFrame({"ints": [3, 4, 5], "category": ["a", "b", "a"]}) final_df = df.copy() final_df["id"] = [0, 1, 2] - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=2) - elif es.dataframe_type == Library.SPARK: - df = ps.from_pandas(df) - needs_idx_df = df.copy() logical_types = {"ints": Integer, "category": Categorical} @@ -531,17 +484,13 @@ def test_replace_dataframe_and_create_index(es): es.replace_dataframe("test_df", needs_idx_df) assert es["test_df"].ww.index == "id" - df = to_pandas(es["test_df"]).sort_values(by="id") + df = es["test_df"].sort_values(by="id") assert all(df["id"] == final_df["id"]) assert all(df["ints"] == final_df["ints"]) def test_replace_dataframe_created_index_present(es): df = pd.DataFrame({"ints": [3, 4, 5], "category": ["a", "b", "a"]}) - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=2) - elif es.dataframe_type == Library.SPARK: - df = ps.from_pandas(df) logical_types = {"ints": Integer, "category": Categorical} es.add_dataframe( @@ -554,14 +503,13 @@ def test_replace_dataframe_created_index_present(es): # DataFrame that already has the index column has_idx_df = es["test_df"].replace({0: 100}) - if es.dataframe_type == Library.PANDAS: - has_idx_df.set_index("id", drop=False, inplace=True) + has_idx_df.set_index("id", drop=False, 
inplace=True) assert "id" in has_idx_df.columns es.replace_dataframe("test_df", has_idx_df) assert es["test_df"].ww.index == "id" - df = to_pandas(es["test_df"]).sort_values(by="ints") + df = es["test_df"].sort_values(by="ints") assert all(df["id"] == [100, 1, 2]) @@ -603,27 +551,10 @@ def test_add_parent_not_index_column(es): @pytest.fixture -def pd_df2(): +def df2(): return pd.DataFrame({"category": [1, 2, 3], "category2": ["1", "2", "3"]}) -@pytest.fixture -def dd_df2(pd_df2): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_df2, npartitions=2) - - -@pytest.fixture -def spark_df2(pd_df2): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_df2) - - -@pytest.fixture(params=["pd_df2", "dd_df2", "spark_df2"]) -def df2(request): - return request.getfixturevalue(request.param) - - def test_none_index(df2): es = EntitySet(id="test") @@ -648,27 +579,10 @@ def test_none_index(df2): @pytest.fixture -def pd_df3(): +def df3(): return pd.DataFrame({"category": [1, 2, 3]}) -@pytest.fixture -def dd_df3(pd_df3): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_df3, npartitions=2) - - -@pytest.fixture -def spark_df3(pd_df3): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_df3) - - -@pytest.fixture(params=["pd_df3", "dd_df3", "spark_df3"]) -def df3(request): - return request.getfixturevalue(request.param) - - def test_unknown_index(df3): warn_text = "index id not found in dataframe, creating new integer column" es = EntitySet(id="test") @@ -680,7 +594,7 @@ def test_unknown_index(df3): logical_types={"category": "Categorical"}, ) assert es["test_dataframe"].ww.index == "id" - assert list(to_pandas(es["test_dataframe"]["id"], sort_index=True)) == list( + assert list(es["test_dataframe"]["id"]) == list( range(3), ) @@ -714,7 +628,7 @@ def test_bad_time_index_column(df3): @pytest.fixture -def pd_df4(): +def df4(): df = pd.DataFrame( { "id": [0, 1, 2], @@ -728,28 +642,8 @@ def pd_df4(): return df -@pytest.fixture -def dd_df4(pd_df4): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_df4, npartitions=2) - - -@pytest.fixture -def spark_df4(pd_df4): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_to_spark_clean(pd_df4)) - - -@pytest.fixture(params=["pd_df4", "dd_df4", "spark_df4"]) -def df4(request): - return request.getfixturevalue(request.param) - - def test_converts_dtype_on_init(df4): logical_types = {"id": Integer, "ints": Integer, "floats": Double} - if not isinstance(df4, pd.DataFrame): - logical_types["category"] = Categorical - logical_types["category_int"] = Categorical es = EntitySet(id="test") df4.ww.init(name="test_dataframe", index="id", logical_types=logical_types) es.add_dataframe(dataframe=df4) @@ -765,26 +659,15 @@ def test_converts_dtype_on_init(df4): def test_converts_dtype_after_init(df4): category_dtype = "category" - if ps and isinstance(df4, ps.DataFrame): - category_dtype = "string" df4["category"] = df4["category"].astype(category_dtype) - if not isinstance(df4, pd.DataFrame): - logical_types = { - "id": Integer, - "category": Categorical, - "category_int": Categorical, - "ints": Integer, - "floats": Double, - } - else: - logical_types = None + es = EntitySet(id="test") es.add_dataframe( dataframe_name="test_dataframe", 
index="id", dataframe=df4, - logical_types=logical_types, + logical_types=None, ) df = es["test_dataframe"] @@ -805,42 +688,13 @@ def test_converts_dtype_after_init(df4): assert df["ints"].dtype == "string" -def test_warns_no_typing(df4): - es = EntitySet(id="test") - if not isinstance(df4, pd.DataFrame): - msg = "Performing type inference on Dask or Spark DataFrames may be computationally intensive. Specify logical types for each column to speed up EntitySet initialization." - with pytest.warns(UserWarning, match=msg): - es.add_dataframe(dataframe_name="test_dataframe", index="id", dataframe=df4) - else: - es.add_dataframe(dataframe_name="test_dataframe", index="id", dataframe=df4) - - assert "test_dataframe" in es.dataframe_dict - - @pytest.fixture -def pd_datetime1(): +def datetime1(): times = pd.date_range("1/1/2011", periods=3, freq="H") time_strs = times.strftime("%Y-%m-%d") return pd.DataFrame({"id": [0, 1, 2], "time": time_strs}) -@pytest.fixture -def dd_datetime1(pd_datetime1): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_datetime1, npartitions=2) - - -@pytest.fixture -def spark_datetime1(pd_datetime1): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_datetime1) - - -@pytest.fixture(params=["pd_datetime1", "dd_datetime1", "spark_datetime1"]) -def datetime1(request): - return request.getfixturevalue(request.param) - - def test_converts_datetime(datetime1): # string converts to datetime correctly # This test fails without defining logical types. @@ -855,13 +709,13 @@ def test_converts_datetime(datetime1): logical_types=logical_types, dataframe=datetime1, ) - pd_col = to_pandas(es["test_dataframe"]["time"]) + pd_col = es["test_dataframe"]["time"] assert isinstance(es["test_dataframe"].ww.logical_types["time"], Datetime) assert type(pd_col[0]) == pd.Timestamp @pytest.fixture -def pd_datetime2(): +def datetime2(): datetime_format = "%d-%m-%Y" actual = pd.Timestamp("Jan 2, 2011") time_strs = [actual.strftime(datetime_format)] * 3 @@ -870,23 +724,6 @@ def pd_datetime2(): ) -@pytest.fixture -def dd_datetime2(pd_datetime2): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_datetime2, npartitions=2) - - -@pytest.fixture -def spark_datetime2(pd_datetime2): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_datetime2) - - -@pytest.fixture(params=["pd_datetime2", "dd_datetime2", "spark_datetime2"]) -def datetime2(request): - return request.getfixturevalue(request.param) - - def test_handles_datetime_format(datetime2): # check if we load according to the format string # pass in an ambiguous date @@ -907,8 +744,8 @@ def test_handles_datetime_format(datetime2): dataframe=datetime2, ) - col_format = to_pandas(es["test_dataframe"]["time_format"]) - col_no_format = to_pandas(es["test_dataframe"]["time_no_format"]) + col_format = es["test_dataframe"]["time_format"] + col_no_format = es["test_dataframe"]["time_no_format"] # without formatting pandas gets it wrong assert (col_no_format != actual).all() @@ -942,17 +779,7 @@ def test_dataframe_init(es): "number": [4, 5, 6], }, ) - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=2) - elif es.dataframe_type == Library.SPARK: - df = ps.from_pandas(df) logical_types = {"id": Categorical, "time": Datetime} - if not isinstance(df, pd.DataFrame): - extra_logical_types = { - "category": 
Categorical, - "number": Integer, - } - logical_types.update(extra_logical_types) es.add_dataframe( df.copy(), dataframe_name="test_dataframe", @@ -960,50 +787,24 @@ def test_dataframe_init(es): time_index="time", logical_types=logical_types, ) - if is_instance(df, dd, "DataFrame"): - df_shape = (df.shape[0].compute(), df.shape[1]) - else: - df_shape = df.shape - if es.dataframe_type == Library.DASK: - es_df_shape = ( - es["test_dataframe"].shape[0].compute(), - es["test_dataframe"].shape[1], - ) - else: - es_df_shape = es["test_dataframe"].shape + df_shape = df.shape + + es_df_shape = es["test_dataframe"].shape assert es_df_shape == df_shape assert es["test_dataframe"].ww.index == "id" assert es["test_dataframe"].ww.time_index == "time" assert set([v for v in es["test_dataframe"].ww.columns]) == set(df.columns) assert es["test_dataframe"]["time"].dtype == df["time"].dtype - if es.dataframe_type == Library.SPARK: - assert set(es["test_dataframe"]["id"].to_list()) == set(df["id"].to_list()) - else: - assert set(es["test_dataframe"]["id"]) == set(df["id"]) + assert set(es["test_dataframe"]["id"]) == set(df["id"]) @pytest.fixture -def pd_bad_df(): +def bad_df(): return pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], 3: ["a", "b", "c"]}) -@pytest.fixture -def dd_bad_df(pd_bad_df): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_bad_df, npartitions=2) - - -@pytest.fixture(params=["pd_bad_df", "dd_bad_df"]) -def bad_df(request): - return request.getfixturevalue(request.param) - - -# Skip for Spark, automatically converts non-str column names to str def test_nonstr_column_names(bad_df): - if is_instance(bad_df, dd, "DataFrame"): - pytest.xfail("Dask DataFrames cannot handle integer column names") - es = EntitySet(id="Failure") error_text = r"All column names must be strings \(Columns \[3\] are not strings\)" with pytest.raises(ValueError, match=error_text): @@ -1099,11 +900,7 @@ def test_concat_inplace(es): def test_concat_with_lti(es): first_es = copy.deepcopy(es) for df in first_es.dataframes: - if first_es.dataframe_type == Library.SPARK: - # Spark cannot compute last time indexes on an empty Dataframe - new_df = df.head(1) - else: - new_df = df.loc[[], :] + new_df = df.loc[[], :] first_es.replace_dataframe(df.ww.name, new_df) second_es = copy.deepcopy(es) @@ -1145,18 +942,18 @@ def test_concat_errors(es): es.concat(copy_es) -def test_concat_sort_index_with_time_index(pd_es): +def test_concat_sort_index_with_time_index(es): # only pandas dataframes sort on the index and time index - es1 = copy.deepcopy(pd_es) + es1 = copy.deepcopy(es) es1.replace_dataframe( dataframe_name="customers", - df=pd_es["customers"].loc[[0, 1], :], + df=es["customers"].loc[[0, 1], :], already_sorted=True, ) - es2 = copy.deepcopy(pd_es) + es2 = copy.deepcopy(es) es2.replace_dataframe( dataframe_name="customers", - df=pd_es["customers"].loc[[2], :], + df=es["customers"].loc[[2], :], already_sorted=True, ) @@ -1165,23 +962,23 @@ def test_concat_sort_index_with_time_index(pd_es): assert list(combined_es_order_1["customers"].index) == [2, 0, 1] assert list(combined_es_order_2["customers"].index) == [2, 0, 1] - assert combined_es_order_1.__eq__(pd_es, deep=True) - assert combined_es_order_2.__eq__(pd_es, deep=True) + assert combined_es_order_1.__eq__(es, deep=True) + assert combined_es_order_2.__eq__(es, deep=True) assert combined_es_order_2.__eq__(combined_es_order_1, deep=True) -def test_concat_sort_index_without_time_index(pd_es): +def 
test_concat_sort_index_without_time_index(es): # Sorting is only performed on DataFrames with time indices - es1 = copy.deepcopy(pd_es) + es1 = copy.deepcopy(es) es1.replace_dataframe( dataframe_name="products", - df=pd_es["products"].iloc[[0, 1, 2], :], + df=es["products"].iloc[[0, 1, 2], :], already_sorted=True, ) - es2 = copy.deepcopy(pd_es) + es2 = copy.deepcopy(es) es2.replace_dataframe( dataframe_name="products", - df=pd_es["products"].iloc[[3, 4, 5], :], + df=es["products"].iloc[[3, 4, 5], :], already_sorted=True, ) @@ -1205,18 +1002,14 @@ def test_concat_sort_index_without_time_index(pd_es): "car", "toothpaste", ] - assert combined_es_order_1.__eq__(pd_es, deep=True) - assert not combined_es_order_2.__eq__(pd_es, deep=True) - assert combined_es_order_2.__eq__(pd_es, deep=False) + assert combined_es_order_1.__eq__(es, deep=True) + assert not combined_es_order_2.__eq__(es, deep=True) + assert combined_es_order_2.__eq__(es, deep=False) assert not combined_es_order_2.__eq__(combined_es_order_1, deep=True) def test_concat_with_make_index(es): df = pd.DataFrame({"id": [0, 1, 2], "category": ["a", "b", "a"]}) - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=2) - elif es.dataframe_type == Library.SPARK: - df = ps.from_pandas(df) logical_types = {"id": Categorical, "category": Categorical} es.add_dataframe( dataframe=df, @@ -1247,9 +1040,8 @@ def test_concat_with_make_index(es): assert es.__eq__(es_1, deep=False) assert es.__eq__(es_2, deep=False) - if es.dataframe_type == Library.PANDAS: - assert not es.__eq__(es_1, deep=True) - assert not es.__eq__(es_2, deep=True) + assert not es.__eq__(es_1, deep=True) + assert not es.__eq__(es_2, deep=True) old_es_1 = copy.deepcopy(es_1) old_es_2 = copy.deepcopy(es_2) @@ -1262,7 +1054,7 @@ def test_concat_with_make_index(es): @pytest.fixture -def pd_transactions_df(): +def transactions_df(): return pd.DataFrame( { "id": [1, 2, 3, 4, 5, 6], @@ -1273,44 +1065,11 @@ def pd_transactions_df(): ) -@pytest.fixture -def dd_transactions_df(pd_transactions_df): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_transactions_df, npartitions=3) - - -@pytest.fixture -def spark_transactions_df(pd_transactions_df): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_transactions_df) - - -@pytest.fixture( - params=["pd_transactions_df", "dd_transactions_df", "spark_transactions_df"], -) -def transactions_df(request): - return request.getfixturevalue(request.param) - - def test_set_time_type_on_init(transactions_df): # create cards dataframe cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]}) - if is_instance(transactions_df, dd, "DataFrame"): - cards_df = dd.from_pandas(cards_df, npartitions=3) - if ps and isinstance(transactions_df, ps.DataFrame): - cards_df = ps.from_pandas(cards_df) - if not isinstance(transactions_df, pd.DataFrame): - cards_logical_types = {"id": Categorical} - transactions_logical_types = { - "id": Integer, - "card_id": Categorical, - "transaction_time": Integer, - "fraud": Boolean, - } - else: - cards_logical_types = None - transactions_logical_types = None - + cards_logical_types = None + transactions_logical_types = None dataframes = { "cards": (cards_df, "id", None, cards_logical_types), "transactions": ( @@ -1340,21 +1099,8 @@ def test_sets_time_when_adding_dataframe(transactions_df): accounts_df_string = pd.DataFrame( {"id": [3, 4, 5], "signup_date": ["element", "exporting", "editable"]}, ) - if 
is_instance(transactions_df, dd, "DataFrame"): - accounts_df = dd.from_pandas(accounts_df, npartitions=2) - if ps and isinstance(transactions_df, ps.DataFrame): - accounts_df = ps.from_pandas(accounts_df) - if not isinstance(transactions_df, pd.DataFrame): - accounts_logical_types = {"id": Categorical, "signup_date": Datetime} - transactions_logical_types = { - "id": Integer, - "card_id": Categorical, - "transaction_time": Integer, - "fraud": Boolean, - } - else: - accounts_logical_types = None - transactions_logical_types = None + accounts_logical_types = None + transactions_logical_types = None # create empty entityset es = EntitySet("fraud") @@ -1384,16 +1130,15 @@ def test_sets_time_when_adding_dataframe(transactions_df): time_index="signup_date", logical_types=accounts_logical_types, ) - # add non time type as time index, only valid for pandas - if isinstance(transactions_df, pd.DataFrame): - error_text = "Time index column must contain datetime or numeric values" - with pytest.raises(TypeError, match=error_text): - es.add_dataframe( - accounts_df_string, - dataframe_name="accounts", - index="id", - time_index="signup_date", - ) + + error_text = "Time index column must contain datetime or numeric values" + with pytest.raises(TypeError, match=error_text): + es.add_dataframe( + accounts_df_string, + dataframe_name="accounts", + index="id", + time_index="signup_date", + ) def test_secondary_time_index_no_primary_time_index(es): @@ -1604,7 +1349,7 @@ def test_normalize_dataframe_new_time_index_additional_success_check(es): @pytest.fixture -def pd_normalize_es(): +def normalize_es(): df = pd.DataFrame( { "id": [0, 1, 2, 3], @@ -1621,32 +1366,6 @@ def pd_normalize_es(): return es.add_dataframe(dataframe_name="data", dataframe=df, index="id") -@pytest.fixture -def dd_normalize_es(pd_normalize_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - es = EntitySet(id=pd_normalize_es.id) - dd_df = dd.from_pandas(pd_normalize_es["data"], npartitions=2) - dd_df.ww.init(schema=pd_normalize_es["data"].ww.schema) - - es.add_dataframe(dataframe=dd_df) - return es - - -@pytest.fixture -def spark_normalize_es(pd_normalize_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - es = EntitySet(id=pd_normalize_es.id) - spark_df = ps.from_pandas(pd_normalize_es["data"]) - spark_df.ww.init(schema=pd_normalize_es["data"].ww.schema) - es.add_dataframe(dataframe=spark_df) - return es - - -@pytest.fixture(params=["pd_normalize_es", "dd_normalize_es", "spark_normalize_es"]) -def normalize_es(request): - return request.getfixturevalue(request.param) - - def test_normalize_time_index_from_none(normalize_es): assert normalize_es["data"].ww.time_index is None @@ -1660,9 +1379,7 @@ def test_normalize_time_index_from_none(normalize_es): assert normalize_es["normalized"].ww.time_index == "time" df = normalize_es["normalized"] - # only pandas sorts by time index - if isinstance(df, pd.DataFrame): - assert df["time"].is_monotonic_increasing + assert df["time"].is_monotonic_increasing def test_raise_error_if_dupicate_additional_columns_passed(es): @@ -1726,7 +1443,6 @@ def test_normalize_dataframe_copies_logical_types(es): assert len(es["values_2"].ww.logical_types["value"].order) == 10 -# sorting not supported in Dask, Spark def test_make_time_index_keeps_original_sorting(): trips = { "trip_id": [999 - i for i in range(1000)], @@ -1765,7 +1481,7 @@ def test_normalize_dataframe_new_time_index(es): assert es["values"].ww.time_index == new_time_index 
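The normalize_dataframe test changes above remove the Dask/Spark conversion branches, so the normalized result is always a plain pandas DataFrame. A minimal sketch of that pandas-only flow follows; the table and column names are invented, and the keyword arguments (base_dataframe_name, new_dataframe_name, index) are assumed from the public featuretools API rather than taken from this patch:

import pandas as pd
from featuretools.entityset import EntitySet

# hypothetical example data
df = pd.DataFrame({"id": [0, 1, 2, 3], "value": [10, 10, 20, 20]})

es = EntitySet(id="example")
es.add_dataframe(dataframe_name="data", dataframe=df, index="id")
es.normalize_dataframe(
    base_dataframe_name="data",
    new_dataframe_name="values",
    index="value",
)

# the normalized parent table is pandas-backed, so no compute()/to_pandas() step is needed
assert es["values"].ww.index == "value"
assert len(es["values"]) == 2  # one row per unique "value"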
assert new_time_index in es["values"].columns assert len(es["values"].columns) == 2 - df = to_pandas(es["values"], sort_index=True) + df = es["values"] assert df[new_time_index].is_monotonic_increasing @@ -1843,52 +1559,28 @@ def test_metadata_without_id(): @pytest.fixture -def pd_datetime3(): +def datetime3(): return pd.DataFrame({"id": [0, 1, 2], "ints": ["1", "2", "1"]}) -@pytest.fixture -def dd_datetime3(pd_datetime3): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_datetime3, npartitions=2) - - -@pytest.fixture -def spark_datetime3(pd_datetime3): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_datetime3) - - -@pytest.fixture(params=["pd_datetime3", "dd_datetime3", "spark_datetime3"]) -def datetime3(request): - return request.getfixturevalue(request.param) - - def test_datetime64_conversion(datetime3): df = datetime3 df["time"] = pd.Timestamp.now() - if ps and isinstance(df, ps.DataFrame): - df["time"] = df["time"].astype(np.datetime64) - else: - df["time"] = df["time"].dt.tz_localize("UTC") - - if not isinstance(df, pd.DataFrame): - logical_types = {"id": Integer, "ints": Integer, "time": Datetime} - else: - logical_types = None + df["time"] = df["time"].dt.tz_localize("UTC") + es = EntitySet(id="test") es.add_dataframe( dataframe_name="test_dataframe", index="id", dataframe=df, - logical_types=logical_types, + logical_types=None, ) es["test_dataframe"].ww.set_time_index("time") assert es["test_dataframe"].ww.time_index == "time" @pytest.fixture -def pd_index_df(): +def index_df(): return pd.DataFrame( { "id": [1, 2, 3, 4, 5, 6], @@ -1898,33 +1590,7 @@ def pd_index_df(): ) -@pytest.fixture -def dd_index_df(pd_index_df): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_index_df, npartitions=3) - - -@pytest.fixture -def spark_index_df(pd_index_df): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_index_df) - - -@pytest.fixture(params=["pd_index_df", "dd_index_df", "spark_index_df"]) -def index_df(request): - return request.getfixturevalue(request.param) - - def test_same_index_values(index_df): - if not isinstance(index_df, pd.DataFrame): - logical_types = { - "id": Integer, - "transaction_time": Datetime, - "first_dataframe_time": Integer, - } - else: - logical_types = None - es = EntitySet("example") error_text = ( @@ -1936,7 +1602,7 @@ def test_same_index_values(index_df): index="id", time_index="id", dataframe=index_df, - logical_types=logical_types, + logical_types=None, ) es.add_dataframe( @@ -1944,7 +1610,7 @@ def test_same_index_values(index_df): index="id", time_index="transaction_time", dataframe=index_df, - logical_types=logical_types, + logical_types=None, ) error_text = "time_index and index cannot be the same value, first_dataframe_time" @@ -1958,22 +1624,9 @@ def test_same_index_values(index_df): def test_use_time_index(index_df): - if not isinstance(index_df, pd.DataFrame): - bad_ltypes = { - "id": Integer, - "transaction_time": Datetime, - "first_dataframe_time": Integer, - } - bad_semantic_tags = {"transaction_time": "time_index"} - logical_types = { - "id": Integer, - "transaction_time": Datetime, - "first_dataframe_time": Integer, - } - else: - bad_ltypes = {"transaction_time": Datetime} - bad_semantic_tags = {"transaction_time": "time_index"} - logical_types = None + bad_ltypes = {"transaction_time": Datetime} + 
bad_semantic_tags = {"transaction_time": "time_index"} + logical_types = None es = EntitySet() @@ -2024,10 +1677,6 @@ def test_normalize_with_numeric_time_index(int_es): def test_normalize_with_invalid_time_index(es): - if es.dataframe_type == Library.DASK: - pytest.skip( - "Woodwork raises different error with Dask. Remove this skip once WW is updated.", - ) error_text = "Time index column must contain datetime or numeric values" with pytest.raises(TypeError, match=error_text): es.normalize_dataframe( @@ -2104,8 +1753,8 @@ def test_add_interesting_values_vals_specified_without_dataframe_name(es): es.add_interesting_values(values=interesting_values) -def test_add_interesting_values_single_dataframe(pd_es): - pd_es.add_interesting_values(dataframe_name="log") +def test_add_interesting_values_single_dataframe(es): + es.add_interesting_values(dataframe_name="log") expected_vals = { "zipcode": ["02116", "02116-3899", "12345-6789", "1234567890", "0"], @@ -2114,20 +1763,18 @@ def test_add_interesting_values_single_dataframe(pd_es): "priority_level": [0, 1, 2], } - for col in pd_es["log"].columns: + for col in es["log"].columns: if col in expected_vals: assert ( - pd_es["log"].ww.columns[col].metadata.get("interesting_values") + es["log"].ww.columns[col].metadata.get("interesting_values") == expected_vals[col] ) else: - assert ( - pd_es["log"].ww.columns[col].metadata.get("interesting_values") is None - ) + assert es["log"].ww.columns[col].metadata.get("interesting_values") is None -def test_add_interesting_values_multiple_dataframes(pd_es): - pd_es.add_interesting_values() +def test_add_interesting_values_multiple_dataframes(es): + es.add_interesting_values() expected_cols_with_vals = { "régions": {"language"}, "stores": {}, @@ -2137,7 +1784,7 @@ def test_add_interesting_values_multiple_dataframes(pd_es): "log": {"zipcode", "countrycode", "subregioncode", "priority_level"}, "cohorts": {"cohort_name"}, } - for df_id, df in pd_es.dataframe_dict.items(): + for df_id, df in es.dataframe_dict.items(): expected_cols = expected_cols_with_vals[df_id] for col in df.columns: if col in expected_cols: @@ -2377,52 +2024,24 @@ def test_entityset_deep_equality(es): first_es.replace_dataframe("customers", updated_df) assert first_es.__eq__(second_es, deep=False) - # Uses woodwork equality which only looks at df content for pandas - if isinstance(updated_df, pd.DataFrame): - assert not first_es.__eq__(second_es, deep=True) - else: - assert first_es.__eq__(second_es, deep=True) - - -@pytest.fixture(params=["make_es", "dask_es_to_copy"]) -def es_to_copy(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def dask_es_to_copy(make_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - es = EntitySet(id=make_es.id) - for df in make_es.dataframes: - dd_df = dd.from_pandas(df.reset_index(drop=True), npartitions=4) - dd_df.ww.init(schema=df.ww.schema) - es.add_dataframe(dd_df) - - for rel in make_es.relationships: - es.add_relationship( - rel.parent_dataframe.ww.name, - rel._parent_column_name, - rel.child_dataframe.ww.name, - rel._child_column_name, - ) - return es + assert not first_es.__eq__(second_es, deep=True) -def test_deepcopy_entityset(es_to_copy): +def test_deepcopy_entityset(make_es): # Uses make_es since the es fixture uses deepcopy - copied_es = copy.deepcopy(es_to_copy) + copied_es = copy.deepcopy(make_es) - assert copied_es == es_to_copy - assert copied_es is not es_to_copy + assert copied_es == make_es + assert copied_es is not make_es - for 
df_name in es_to_copy.dataframe_dict.keys(): - original_df = es_to_copy[df_name] + for df_name in make_es.dataframe_dict.keys(): + original_df = make_es[df_name] new_df = copied_es[df_name] assert new_df.ww.schema == original_df.ww.schema assert new_df.ww._schema is not original_df.ww._schema - pd.testing.assert_frame_equal(to_pandas(new_df), to_pandas(original_df)) + pd.testing.assert_frame_equal(new_df, original_df) assert new_df is not original_df @@ -2457,24 +2076,15 @@ def test_deepcopy_entityset_featuretools_changes(es): } -def test_dataframe_type_empty_es(): - es = EntitySet("test") - assert es.dataframe_type is None - - -def test_dataframe_type_pandas_es(pd_es): - assert pd_es.dataframe_type == Library.PANDAS - - def test_es__getstate__key_unique(es): assert not hasattr(es, WW_SCHEMA_KEY) -def test_pd_es_pickling(pd_es): - pkl = pickle.dumps(pd_es) +def test_es_pickling(es): + pkl = pickle.dumps(es) unpickled = pickle.loads(pkl) - assert pd_es.__eq__(unpickled, deep=True) + assert es.__eq__(unpickled, deep=True) assert not hasattr(unpickled, WW_SCHEMA_KEY) @@ -2508,7 +2118,7 @@ def test_latlong_nan_normalization(latlong_df): es = EntitySet("latlong-test", dataframes, relationships) - normalized_df = to_pandas(es["latLong"], sort_index=True) + normalized_df = es["latLong"] expected_df = pd.DataFrame( {"idx": [0, 1, 2], "latLong": [(np.nan, np.nan), (1, 2), (np.nan, np.nan)]}, @@ -2528,7 +2138,7 @@ def test_latlong_nan_normalization_add_dataframe(latlong_df): es.add_dataframe(latlong_df) - normalized_df = to_pandas(es["latLong"], sort_index=True) + normalized_df = es["latLong"] expected_df = pd.DataFrame( {"idx": [0, 1, 2], "latLong": [(np.nan, np.nan), (1, 2), (np.nan, np.nan)]}, diff --git a/featuretools/tests/entityset_tests/test_es_metadata.py b/featuretools/tests/entityset_tests/test_es_metadata.py index e7014972ca..a67ab6ee99 100644 --- a/featuretools/tests/entityset_tests/test_es_metadata.py +++ b/featuretools/tests/entityset_tests/test_es_metadata.py @@ -138,27 +138,10 @@ def test_find_forward_paths_multiple_relationships(games_es): @pytest.fixture -def pd_employee_df(): +def employee_df(): return pd.DataFrame({"id": [0], "manager_id": [0]}) -@pytest.fixture -def dd_employee_df(pd_employee_df): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_employee_df, npartitions=2) - - -@pytest.fixture -def spark_employee_df(pd_employee_df): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_employee_df) - - -@pytest.fixture(params=["pd_employee_df", "dd_employee_df", "spark_employee_df"]) -def employee_df(request): - return request.getfixturevalue(request.param) - - def test_find_forward_paths_ignores_loops(employee_df): dataframes = {"employees": (employee_df, "id")} relationships = [("employees", "id", "employees", "manager_id")] diff --git a/featuretools/tests/entityset_tests/test_last_time_index.py b/featuretools/tests/entityset_tests/test_last_time_index.py index 1d2aa68f0e..bc5b520bf2 100644 --- a/featuretools/tests/entityset_tests/test_last_time_index.py +++ b/featuretools/tests/entityset_tests/test_last_time_index.py @@ -5,11 +5,6 @@ from woodwork.logical_types import Categorical, Datetime, Integer from featuretools.entityset.entityset import LTI_COLUMN_NAME -from featuretools.tests.testing_utils import to_pandas -from featuretools.utils.gen_utils import Library, import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") 
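The fixture consolidation above (pd_employee_df / dd_employee_df / spark_employee_df collapsing into a single employee_df) leaves only pandas-backed fixtures. A rough sketch of the resulting pattern, with a hypothetical test name and using only calls that already appear in these tests:

import pandas as pd
import pytest
from featuretools import EntitySet

@pytest.fixture
def employee_df():
    # a single pandas fixture replaces the Dask/Spark variants and the
    # parametrized wrapper fixture
    return pd.DataFrame({"id": [0], "manager_id": [0]})

def test_employees_dataframe(employee_df):  # hypothetical test name
    es = EntitySet(id="test")
    es.add_dataframe(dataframe_name="employees", dataframe=employee_df, index="id")
    # results are plain pandas objects, so no to_pandas() helper is needed
    assert list(es["employees"]["id"]) == [0]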
@pytest.fixture @@ -91,14 +86,8 @@ def wishlist_df(): def extra_session_df(es): row_values = {"customer_id": 2, "device_name": "PC", "device_type": 0, "id": 6} row = pd.DataFrame(row_values, index=pd.Index([6], name="id")) - df = to_pandas(es["sessions"]) + df = es["sessions"] df = pd.concat([df, row]).sort_index() - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=3) - elif es.dataframe_type == Library.SPARK: - # Spark can't handle object dtypes - df = df.astype("string") - df = ps.from_pandas(df) return df @@ -111,7 +100,7 @@ def test_leaf(self, es): assert lti_name == LTI_COLUMN_NAME assert len(log[lti_name]) == 17 - log_df = to_pandas(log) + log_df = log for v1, v2 in zip(log_df[lti_name], log_df["datetime"]): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 @@ -123,7 +112,7 @@ def test_leaf_no_time_index(self, es): assert len(true_lti) == len(stores[LTI_COLUMN_NAME]) - stores_lti = to_pandas(stores[LTI_COLUMN_NAME]) + stores_lti = stores[LTI_COLUMN_NAME] for v1, v2 in zip(stores_lti, true_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 @@ -131,23 +120,16 @@ def test_leaf_no_time_index(self, es): # TODO: possible issue with either normalize_dataframe or add_last_time_indexes def test_parent(self, values_es, true_values_lti): # test dataframe with time index and all instances in child dataframe - if values_es.dataframe_type != Library.PANDAS: - pytest.xfail( - "possible issue with either normalize_dataframe or add_last_time_indexes", - ) values_es.add_last_time_indexes() values = values_es["values"] lti_name = values.ww.metadata.get("last_time_index") assert len(values[lti_name]) == 10 - sorted_lti = to_pandas(values[lti_name]).sort_index() + sorted_lti = values[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_values_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 - # TODO: fails with Dask, tests needs to be reworked def test_parent_some_missing(self, values_es, true_values_lti): # test dataframe with time index and not all instances have children - if values_es.dataframe_type != Library.PANDAS: - pytest.xfail("fails with Dask, tests needs to be reworked") values = values_es["values"] # add extra value instance with no children @@ -180,7 +162,7 @@ def test_parent_no_time_index(self, es, true_sessions_lti): sessions = es["sessions"] lti_name = sessions.ww.metadata.get("last_time_index") assert len(sessions[lti_name]) == 6 - sorted_lti = to_pandas(sessions[lti_name]).sort_index() + sorted_lti = sessions[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 @@ -201,16 +183,12 @@ def test_parent_no_time_index_missing( lti_name = sessions.ww.metadata.get("last_time_index") assert len(sessions[lti_name]) == 7 - sorted_lti = to_pandas(sessions[lti_name]).sort_index() + sorted_lti = sessions[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 def test_multiple_children(self, es, wishlist_df, true_sessions_lti): - if es.dataframe_type == Library.SPARK: - pytest.xfail("Cannot make index on a Spark DataFrame") # test all instances in both children - if es.dataframe_type == Library.DASK: - wishlist_df = dd.from_pandas(wishlist_df, npartitions=2) logical_types = { "session_id": Integer, "datetime": Datetime, @@ -233,19 +211,15 @@ def test_multiple_children(self, es, wishlist_df, true_sessions_lti): lti_name = sessions.ww.metadata.get("last_time_index") assert len(sessions[lti_name]) == 6 - 
sorted_lti = to_pandas(sessions[lti_name]).sort_index() + sorted_lti = sessions[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 def test_multiple_children_right_missing(self, es, wishlist_df, true_sessions_lti): - if es.dataframe_type == Library.SPARK: - pytest.xfail("Cannot make index on a Spark DataFrame") # test all instances in left child # drop wishlist instance related to id 3 so it's only in log wishlist_df.drop(4, inplace=True) - if es.dataframe_type == Library.DASK: - wishlist_df = dd.from_pandas(wishlist_df, npartitions=2) logical_types = { "session_id": Integer, "datetime": Datetime, @@ -268,7 +242,7 @@ def test_multiple_children_right_missing(self, es, wishlist_df, true_sessions_lt lti_name = sessions.ww.metadata.get("last_time_index") assert len(sessions[lti_name]) == 6 - sorted_lti = to_pandas(sessions[lti_name]).sort_index() + sorted_lti = sessions[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 @@ -279,9 +253,6 @@ def test_multiple_children_left_missing( wishlist_df, true_sessions_lti, ): - if es.dataframe_type == Library.SPARK: - pytest.xfail("Cannot make index on a Spark DataFrame") - # add row to sessions so not all session instances are in log es.replace_dataframe(dataframe_name="sessions", df=extra_session_df) @@ -293,8 +264,6 @@ def test_multiple_children_left_missing( } row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8)) df = pd.concat([wishlist_df, row]) - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=2) logical_types = { "session_id": Integer, "datetime": Datetime, @@ -321,7 +290,7 @@ def test_multiple_children_left_missing( lti_name = sessions.ww.metadata.get("last_time_index") assert len(sessions[lti_name]) == 7 - sorted_lti = to_pandas(sessions[lti_name]).sort_index() + sorted_lti = sessions[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 @@ -332,9 +301,6 @@ def test_multiple_children_all_combined( wishlist_df, true_sessions_lti, ): - if es.dataframe_type == Library.SPARK: - pytest.xfail("Cannot make index on a Spark DataFrame") - # add row to sessions so not all session instances are in log es.replace_dataframe(dataframe_name="sessions", df=extra_session_df) @@ -349,8 +315,6 @@ def test_multiple_children_all_combined( # drop instance 4 so wishlist_log does not have session id 3 instance df.drop(4, inplace=True) - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=2) logical_types = { "session_id": Integer, "datetime": Datetime, @@ -376,7 +340,7 @@ def test_multiple_children_all_combined( lti_name = sessions.ww.metadata.get("last_time_index") assert len(sessions[lti_name]) == 7 - sorted_lti = to_pandas(sessions[lti_name]).sort_index() + sorted_lti = sessions[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 @@ -387,14 +351,9 @@ def test_multiple_children_both_missing( wishlist_df, true_sessions_lti, ): - if es.dataframe_type == Library.SPARK: - pytest.xfail("Cannot make index on a Spark DataFrame") # test all instances in neither child sessions = es["sessions"] - if es.dataframe_type == Library.DASK: - wishlist_df = dd.from_pandas(wishlist_df, npartitions=2) - logical_types = { "session_id": Integer, "datetime": Datetime, @@ -422,28 +381,23 @@ def test_multiple_children_both_missing( 
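For reference, a self-contained sketch of the last-time-index behavior these tests exercise, using invented example data and only calls already present in this file (add_dataframe, add_relationship, add_last_time_indexes, and the "last_time_index" metadata key):

import pandas as pd
from featuretools import EntitySet

# hypothetical parent/child tables
sessions = pd.DataFrame(
    {"id": [0, 1], "time": pd.to_datetime(["2019-01-01", "2019-01-02"])},
)
logs = pd.DataFrame(
    {
        "id": [0, 1, 2],
        "session_id": [0, 0, 1],
        "time": pd.to_datetime(["2019-01-01", "2019-01-03", "2019-01-02"]),
    },
)

es = EntitySet(id="example")
es.add_dataframe(dataframe_name="sessions", dataframe=sessions, index="id", time_index="time")
es.add_dataframe(dataframe_name="logs", dataframe=logs, index="id", time_index="time")
es.add_relationship("sessions", "id", "logs", "session_id")
es.add_last_time_indexes()

# the last-time-index column name is stored in the Woodwork metadata of the parent
lti_name = es["sessions"].ww.metadata.get("last_time_index")
# the latest child event for any session in this toy data is 2019-01-03
assert es["sessions"][lti_name].max() == pd.Timestamp("2019-01-03")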
lti_name = sessions.ww.metadata.get("last_time_index") assert len(sessions[lti_name]) == 7 - sorted_lti = to_pandas(sessions[lti_name]).sort_index() + sorted_lti = sessions[lti_name].sort_index() for v1, v2 in zip(sorted_lti, true_sessions_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 def test_grandparent(self, es): # test sorting by time works correctly across several generations - log = es["log"] + df = es["log"] # For one user, change a log event to be newer than the user's normal # last time index. This event should be from a different session than # the current last time index. - df = to_pandas(log) df["datetime"][5] = pd.Timestamp("2011-4-09 10:40:01") df = ( df.set_index("datetime", append=True) .sort_index(level=[1, 0], kind="mergesort") .reset_index("datetime", drop=False) ) - if es.dataframe_type == Library.DASK: - df = dd.from_pandas(df, npartitions=2) - if es.dataframe_type == Library.SPARK: - df = ps.from_pandas(df) es.replace_dataframe(dataframe_name="log", df=df) es.add_last_time_indexes() customers = es["customers"] @@ -458,6 +412,6 @@ def test_grandparent(self, es): lti_name = customers.ww.metadata.get("last_time_index") assert len(customers[lti_name]) == 3 - sorted_lti = to_pandas(customers).sort_values("id")[lti_name] + sorted_lti = customers.sort_values("id")[lti_name] for v1, v2 in zip(sorted_lti, true_customers_lti): assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2 diff --git a/featuretools/tests/entityset_tests/test_plotting.py b/featuretools/tests/entityset_tests/test_plotting.py index cbd9e48053..364876c01b 100644 --- a/featuretools/tests/entityset_tests/test_plotting.py +++ b/featuretools/tests/entityset_tests/test_plotting.py @@ -6,41 +6,16 @@ import pytest from featuretools import EntitySet -from featuretools.utils.gen_utils import Library @pytest.fixture -def pd_simple(): +def simple_es(): es = EntitySet("test") df = pd.DataFrame({"foo": [1]}) es.add_dataframe(df, dataframe_name="test", index="foo") return es -@pytest.fixture -def dd_simple(): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - es = EntitySet("test") - df = pd.DataFrame({"foo": [1]}) - df = dd.from_pandas(df, npartitions=2) - es.add_dataframe(df, dataframe_name="test", index="foo") - return es - - -@pytest.fixture -def spark_simple(): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - es = EntitySet("test") - df = ps.DataFrame({"foo": [1]}) - es.add_dataframe(df, dataframe_name="test", index="foo") - return es - - -@pytest.fixture(params=["pd_simple", "dd_simple", "spark_simple"]) -def simple_es(request): - return request.getfixturevalue(request.param) - - def test_returns_digraph_object(es): graph = es.plot() @@ -78,19 +53,11 @@ def test_multiple_rows(es): plot_ = es.plot() result = re.findall(r"\((\d+\srows?)\)", plot_.source) expected = ["{} rows".format(str(i.shape[0])) for i in es.dataframes] - if es.dataframe_type == Library.DASK: - # Dask does not list number of rows in plot - assert result == [] - else: - assert result == expected + assert result == expected def test_single_row(simple_es): plot_ = simple_es.plot() result = re.findall(r"\((\d+\srows?)\)", plot_.source) expected = ["1 row"] - if simple_es.dataframe_type == Library.DASK: - # Dask does not list number of rows in plot - assert result == [] - else: - assert result == expected + assert result == expected diff --git a/featuretools/tests/entityset_tests/test_serialization.py b/featuretools/tests/entityset_tests/test_serialization.py index 
bc9ab36673..deb9da2d77 100644 --- a/featuretools/tests/entityset_tests/test_serialization.py +++ b/featuretools/tests/entityset_tests/test_serialization.py @@ -9,13 +9,11 @@ import pandas as pd import pytest import woodwork.type_sys.type_system as ww_type_system -from woodwork.logical_types import Datetime, LogicalType, Ordinal +from woodwork.logical_types import LogicalType, Ordinal from woodwork.serializers.serializer_base import typing_info_to_dict from woodwork.type_sys.utils import list_logical_types from featuretools.entityset import EntitySet, deserialize, serialize -from featuretools.tests.testing_utils import to_pandas -from featuretools.utils.gen_utils import Library from featuretools.version import ENTITYSET_SCHEMA_VERSION BUCKET_NAME = "test-bucket" @@ -99,18 +97,17 @@ def test_to_csv(es, tmp_path): es.to_csv(str(tmp_path), encoding="utf-8", engine="python") new_es = deserialize.read_entityset(str(tmp_path)) assert es.__eq__(new_es, deep=True) - df = to_pandas(es["log"], index="id") - new_df = to_pandas(new_es["log"], index="id") + df = es["log"] + new_df = new_es["log"] assert type(df["latlong"][0]) in (tuple, list) assert type(new_df["latlong"][0]) in (tuple, list) -# Dask/Spark don't support auto setting of interesting values with es.add_interesting_values() -def test_to_csv_interesting_values(pd_es, tmp_path): - pd_es.add_interesting_values() - pd_es.to_csv(str(tmp_path)) +def test_to_csv_interesting_values(es, tmp_path): + es.add_interesting_values() + es.to_csv(str(tmp_path)) new_es = deserialize.read_entityset(str(tmp_path)) - assert pd_es.__eq__(new_es, deep=True) + assert es.__eq__(new_es, deep=True) def test_to_csv_manual_interesting_values(es, tmp_path): @@ -126,44 +123,29 @@ def test_to_csv_manual_interesting_values(es, tmp_path): ] -# Dask/Spark do not support to_pickle -def test_to_pickle(pd_es, tmp_path): - pd_es.to_pickle(str(tmp_path)) +def test_to_pickle(es, tmp_path): + es.to_pickle(str(tmp_path)) new_es = deserialize.read_entityset(str(tmp_path)) - assert pd_es.__eq__(new_es, deep=True) - assert type(pd_es["log"]["latlong"][0]) == tuple + assert es.__eq__(new_es, deep=True) + assert type(es["log"]["latlong"][0]) == tuple assert type(new_es["log"]["latlong"][0]) == tuple -def test_to_pickle_errors_dask(dask_es, tmp_path): - msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format." - with pytest.raises(ValueError, match=msg): - dask_es.to_pickle(str(tmp_path)) - - -def test_to_pickle_errors_spark(spark_es, tmp_path): - msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format." 
- with pytest.raises(ValueError, match=msg): - spark_es.to_pickle(str(tmp_path)) - - -# Dask/Spark do not support to_pickle -def test_to_pickle_interesting_values(pd_es, tmp_path): - pd_es.add_interesting_values() - pd_es.to_pickle(str(tmp_path)) +def test_to_pickle_interesting_values(es, tmp_path): + es.add_interesting_values() + es.to_pickle(str(tmp_path)) new_es = deserialize.read_entityset(str(tmp_path)) - assert pd_es.__eq__(new_es, deep=True) + assert es.__eq__(new_es, deep=True) -# Dask/Spark do not support to_pickle -def test_to_pickle_manual_interesting_values(pd_es, tmp_path): - pd_es.add_interesting_values( +def test_to_pickle_manual_interesting_values(es, tmp_path): + es.add_interesting_values( dataframe_name="log", values={"product_id": ["coke_zero"]}, ) - pd_es.to_pickle(str(tmp_path)) + es.to_pickle(str(tmp_path)) new_es = deserialize.read_entityset(str(tmp_path)) - assert pd_es.__eq__(new_es, deep=True) + assert es.__eq__(new_es, deep=True) assert new_es["log"].ww["product_id"].ww.metadata["interesting_values"] == [ "coke_zero", ] @@ -173,8 +155,8 @@ def test_to_parquet(es, tmp_path): es.to_parquet(str(tmp_path)) new_es = deserialize.read_entityset(str(tmp_path)) assert es.__eq__(new_es, deep=True) - df = to_pandas(es["log"]) - new_df = to_pandas(new_es["log"]) + df = es["log"] + new_df = new_es["log"] assert type(df["latlong"][0]) in (tuple, list) assert type(new_df["latlong"][0]) in (tuple, list) @@ -192,16 +174,15 @@ def test_to_parquet_manual_interesting_values(es, tmp_path): ] -# Dask/Spark don't support auto setting of interesting values with es.add_interesting_values() -def test_to_parquet_interesting_values(pd_es, tmp_path): - pd_es.add_interesting_values() - pd_es.to_parquet(str(tmp_path)) +def test_to_parquet_interesting_values(es, tmp_path): + es.add_interesting_values() + es.to_parquet(str(tmp_path)) new_es = deserialize.read_entityset(str(tmp_path)) - assert pd_es.__eq__(new_es, deep=True) + assert es.__eq__(new_es, deep=True) -def test_to_parquet_with_lti(tmp_path, pd_mock_customer): - es = pd_mock_customer +def test_to_parquet_with_lti(tmp_path, mock_customer): + es = mock_customer es.to_parquet(str(tmp_path)) new_es = deserialize.read_entityset(str(tmp_path)) assert es.__eq__(new_es, deep=True) @@ -244,35 +225,24 @@ def make_public(s3_client, s3_bucket): s3_client.ObjectAcl(BUCKET_NAME, obj).put(ACL="public-read-write") -# TODO: tmp file disappears after deserialize step, cannot check equality with Dask, Spark @pytest.mark.parametrize("profile_name", [None, False]) def test_serialize_s3_csv(es, s3_client, s3_bucket, profile_name): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "tmp file disappears after deserialize step, cannot check equality with Dask", - ) es.to_csv(TEST_S3_URL, encoding="utf-8", engine="python", profile_name=profile_name) make_public(s3_client, s3_bucket) new_es = deserialize.read_entityset(TEST_S3_URL, profile_name=profile_name) assert es.__eq__(new_es, deep=True) -# Dask and Spark do not support to_pickle @pytest.mark.parametrize("profile_name", [None, False]) -def test_serialize_s3_pickle(pd_es, s3_client, s3_bucket, profile_name): - pd_es.to_pickle(TEST_S3_URL, profile_name=profile_name) +def test_serialize_s3_pickle(es, s3_client, s3_bucket, profile_name): + es.to_pickle(TEST_S3_URL, profile_name=profile_name) make_public(s3_client, s3_bucket) new_es = deserialize.read_entityset(TEST_S3_URL, profile_name=profile_name) - assert pd_es.__eq__(new_es, deep=True) + assert es.__eq__(new_es, deep=True) -# TODO: tmp file 
disappears after deserialize step, cannot check equality with Dask, Spark @pytest.mark.parametrize("profile_name", [None, False]) def test_serialize_s3_parquet(es, s3_client, s3_bucket, profile_name): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "tmp file disappears after deserialize step, cannot check equality with Dask or Spark", - ) es.to_parquet(TEST_S3_URL, profile_name=profile_name) make_public(s3_client, s3_bucket) new_es = deserialize.read_entityset(TEST_S3_URL, profile_name=profile_name) @@ -280,10 +250,6 @@ def test_serialize_s3_parquet(es, s3_client, s3_bucket, profile_name): def test_s3_test_profile(es, s3_client, s3_bucket, setup_test_profile): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "tmp file disappears after deserialize step, cannot check equality with Dask", - ) es.to_csv(TEST_S3_URL, encoding="utf-8", engine="python", profile_name="test") make_public(s3_client, s3_bucket) new_es = deserialize.read_entityset(TEST_S3_URL, profile_name="test") @@ -304,10 +270,7 @@ def test_serialize_subdirs_not_removed(es, tmp_path): description_path = write_path.joinpath("data_description.json") with open(description_path, "w") as f: json.dump("__SAMPLE_TEXT__", f) - if es.dataframe_type == Library.SPARK: - compression = "none" - else: - compression = None + compression = None serialize.write_data_description( es, path=str(write_path), @@ -345,20 +308,13 @@ def test_operations_invalidate_metadata(es): assert new_es._data_description is None assert new_es.metadata is not None # generated after access assert new_es._data_description is not None - if not isinstance(es["customers"], pd.DataFrame): - customers_ltypes = es["customers"].ww.logical_types - customers_ltypes["signup_date"] = Datetime - else: - customers_ltypes = None + customers_ltypes = None new_es.add_dataframe( es["customers"], "customers", logical_types=customers_ltypes, ) - if not isinstance(es["sessions"], pd.DataFrame): - sessions_ltypes = es["sessions"].ww.logical_types - else: - sessions_ltypes = None + sessions_ltypes = None new_es.add_dataframe( es["sessions"], "sessions", @@ -384,12 +340,10 @@ def test_operations_invalidate_metadata(es): assert new_es.metadata is not None assert new_es._data_description is not None - # automatically adding interesting values not supported in Dask or Spark - if new_es.dataframe_type == Library.PANDAS: - new_es.add_interesting_values() - assert new_es._data_description is None - assert new_es.metadata is not None - assert new_es._data_description is not None + new_es.add_interesting_values() + assert new_es._data_description is None + assert new_es.metadata is not None + assert new_es._data_description is not None def test_reset_metadata(es): @@ -458,7 +412,6 @@ def _check_schema_version(version, es, warning_text, caplog, warning_type=None): "id": es.id, "dataframes": dataframes, "relationships": relationships, - "data_type": es.dataframe_type, } if warning_type == "warn" and warning_text: diff --git a/featuretools/tests/entityset_tests/test_spark_es.py b/featuretools/tests/entityset_tests/test_spark_es.py deleted file mode 100644 index ef5534ec77..0000000000 --- a/featuretools/tests/entityset_tests/test_spark_es.py +++ /dev/null @@ -1,213 +0,0 @@ -import pandas as pd -import pytest -from woodwork.logical_types import Datetime, Double, Integer, NaturalLanguage - -from featuretools.entityset import EntitySet -from featuretools.tests.testing_utils import get_df_tags -from featuretools.utils.gen_utils import Library, import_or_none -from 
featuretools.utils.spark_utils import pd_to_spark_clean - -ps = import_or_none("pyspark.pandas") - - -@pytest.mark.skipif("not ps") -def test_add_dataframe_from_spark_df(pd_es): - cleaned_df = pd_to_spark_clean(pd_es["log"]) - log_spark = ps.from_pandas(cleaned_df) - - spark_es = EntitySet(id="spark_es") - spark_es = spark_es.add_dataframe( - dataframe_name="log_spark", - dataframe=log_spark, - index="id", - time_index="datetime", - logical_types=pd_es["log"].ww.logical_types, - semantic_tags=get_df_tags(pd_es["log"]), - ) - pd.testing.assert_frame_equal( - cleaned_df, - spark_es["log_spark"].to_pandas(), - check_like=True, - ) - - -@pytest.mark.skipif("not ps") -def test_add_dataframe_with_non_numeric_index(pd_es, spark_es): - df = pd.DataFrame( - { - "id": pd.Series(["A_1", "A_2", "C", "D"], dtype="string"), - "values": [1, 12, -34, 27], - }, - ) - spark_df = ps.from_pandas(df) - - pd_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=df, - index="id", - logical_types={"id": NaturalLanguage, "values": Integer}, - ) - - spark_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=spark_df, - index="id", - logical_types={"id": NaturalLanguage, "values": Integer}, - ) - pd.testing.assert_frame_equal( - pd_es["new_dataframe"].reset_index(drop=True), - spark_es["new_dataframe"].to_pandas(), - ) - - -@pytest.mark.skipif("not ps") -def test_create_entityset_with_mixed_dataframe_types(pd_es, spark_es): - df = pd.DataFrame({"id": [0, 1, 2, 3], "values": [1, 12, -34, 27]}) - spark_df = ps.from_pandas(df) - - err_msg = ( - "All dataframes must be of the same type. " - "Cannot add dataframe of type {} to an entityset with existing dataframes " - "of type {}" - ) - - # Test error is raised when trying to add Spark dataframe to entitset with existing pandas dataframes - with pytest.raises( - ValueError, - match=err_msg.format(type(spark_df), type(pd_es.dataframes[0])), - ): - pd_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=spark_df, - index="id", - ) - - # Test error is raised when trying to add pandas dataframe to entitset with existing ps dataframes - with pytest.raises( - ValueError, - match=err_msg.format(type(df), type(spark_es.dataframes[0])), - ): - spark_es.add_dataframe(dataframe_name="new_dataframe", dataframe=df, index="id") - - -@pytest.mark.skipif("not ps") -def test_add_last_time_indexes(): - pd_es = EntitySet(id="pd_es") - spark_es = EntitySet(id="spark_es") - - sessions = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "user": [1, 2, 1, 3], - "time": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - sessions_spark = ps.from_pandas(sessions) - sessions_logical_types = { - "id": Integer, - "user": Integer, - "strings": NaturalLanguage, - "time": Datetime, - } - - transactions = pd.DataFrame( - { - "id": [0, 1, 2, 3, 4, 5], - "session_id": [0, 0, 1, 2, 2, 3], - "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13], - "time": [ - pd.to_datetime("2019-01-10 03:53"), - pd.to_datetime("2019-01-10 04:12"), - pd.to_datetime("2019-02-03 10:34"), - pd.to_datetime("2019-01-01 12:35"), - pd.to_datetime("2019-01-01 12:49"), - pd.to_datetime("2017-08-25 04:53"), - ], - }, - ) - transactions_spark = ps.from_pandas(transactions) - transactions_logical_types = { - "id": Integer, - "session_id": Integer, - "amount": Double, - "time": Datetime, - } - - pd_es.add_dataframe( - dataframe_name="sessions", - 
dataframe=sessions, - index="id", - time_index="time", - ) - spark_es.add_dataframe( - dataframe_name="sessions", - dataframe=sessions_spark, - index="id", - time_index="time", - logical_types=sessions_logical_types, - ) - - pd_es.add_dataframe( - dataframe_name="transactions", - dataframe=transactions, - index="id", - time_index="time", - ) - spark_es.add_dataframe( - dataframe_name="transactions", - dataframe=transactions_spark, - index="id", - time_index="time", - logical_types=transactions_logical_types, - ) - - pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id") - spark_es = spark_es.add_relationship("sessions", "id", "transactions", "session_id") - - assert "foreign_key" in pd_es["transactions"].ww.semantic_tags["session_id"] - assert "foreign_key" in spark_es["transactions"].ww.semantic_tags["session_id"] - - assert pd_es["sessions"].ww.metadata.get("last_time_index") is None - assert spark_es["sessions"].ww.metadata.get("last_time_index") is None - - pd_es.add_last_time_indexes() - spark_es.add_last_time_indexes() - - pd_lti_name = pd_es["sessions"].ww.metadata.get("last_time_index") - spark_lti_name = spark_es["sessions"].ww.metadata.get("last_time_index") - assert pd_lti_name == spark_lti_name - pd.testing.assert_series_equal( - pd_es["sessions"][pd_lti_name].sort_index(), - spark_es["sessions"][spark_lti_name].to_pandas().sort_index(), - check_names=False, - ) - - -@pytest.mark.skipif("not ps") -def test_add_dataframe_with_make_index(): - values = [1, 12, -23, 27] - df = pd.DataFrame({"values": values}) - spark_df = ps.from_pandas(df) - spark_es = EntitySet(id="spark_es") - ltypes = {"values": "Integer"} - spark_es.add_dataframe( - dataframe_name="new_dataframe", - dataframe=spark_df, - make_index=True, - index="new_index", - logical_types=ltypes, - ) - - expected_df = pd.DataFrame({"values": values, "new_index": range(len(values))}) - pd.testing.assert_frame_equal(expected_df, spark_es["new_dataframe"].to_pandas()) - - -@pytest.mark.skipif("not ps") -def test_dataframe_type_spark(spark_es): - assert spark_es.dataframe_type == Library.SPARK diff --git a/featuretools/tests/entityset_tests/test_timedelta.py b/featuretools/tests/entityset_tests/test_timedelta.py index dcae0520ae..e50a5ba2ed 100644 --- a/featuretools/tests/entityset_tests/test_timedelta.py +++ b/featuretools/tests/entityset_tests/test_timedelta.py @@ -5,7 +5,6 @@ from featuretools.entityset import Timedelta from featuretools.feature_base import Feature from featuretools.primitives import Count -from featuretools.tests.testing_utils import to_pandas from featuretools.utils.wrangle import _check_timedelta @@ -40,9 +39,9 @@ def test_delta_with_observations(es): def test_delta_with_time_unit_matches_pandas(es): customer_id = 0 - sessions_df = to_pandas(es["sessions"]) + sessions_df = es["sessions"] sessions_df = sessions_df[sessions_df["customer_id"] == customer_id] - log_df = to_pandas(es["log"]) + log_df = es["log"] log_df = log_df[log_df["session_id"].isin(sessions_df["id"])] all_times = log_df["datetime"].sort_values().tolist() @@ -112,9 +111,9 @@ def test_feature_takes_timedelta_string(es): def test_deltas_week(es): customer_id = 0 - sessions_df = to_pandas(es["sessions"]) + sessions_df = es["sessions"] sessions_df = sessions_df[sessions_df["customer_id"] == customer_id] - log_df = to_pandas(es["log"]) + log_df = es["log"] log_df = log_df[log_df["session_id"].isin(sessions_df["id"])] all_times = log_df["datetime"].sort_values().tolist() delta_week = Timedelta(1, "w") diff --git 
a/featuretools/tests/entityset_tests/test_ww_es.py b/featuretools/tests/entityset_tests/test_ww_es.py index ba17374a22..ee1f6ec485 100644 --- a/featuretools/tests/entityset_tests/test_ww_es.py +++ b/featuretools/tests/entityset_tests/test_ww_es.py @@ -14,11 +14,6 @@ ) from featuretools.entityset.entityset import LTI_COLUMN_NAME, EntitySet -from featuretools.tests.testing_utils import to_pandas -from featuretools.utils.gen_utils import Library, import_or_none, is_instance - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") def test_empty_es(): @@ -30,29 +25,12 @@ def test_empty_es(): @pytest.fixture -def pd_df(): +def df(): return pd.DataFrame({"id": [0, 1, 2], "category": ["a", "b", "c"]}).astype( {"category": "category"}, ) -@pytest.fixture -def dd_df(pd_df): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(pd_df, npartitions=2) - - -@pytest.fixture -def spark_df(pd_df): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(pd_df) - - -@pytest.fixture(params=["pd_df", "dd_df", "spark_df"]) -def df(request): - return request.getfixturevalue(request.param) - - def test_init_es_with_dataframe(df): es = EntitySet("es", dataframes={"table": (df, "id")}) assert es.id == "es" @@ -112,15 +90,15 @@ def test_init_es_with_dataframe_and_params(df): assert es["table"].ww.semantic_tags["category"] == {"new_tag"} -def test_init_es_with_multiple_dataframes(pd_df): +def test_init_es_with_multiple_dataframes(df): second_df = pd.DataFrame({"id": [0, 1, 2, 3], "first_table_id": [1, 2, 2, 1]}) - pd_df.ww.init(name="first_table", index="id") + df.ww.init(name="first_table", index="id") es = EntitySet( "es", dataframes={ - "first_table": (pd_df,), + "first_table": (df,), "second_table": ( second_df, "id", @@ -167,15 +145,15 @@ def test_change_es_dataframe_schema(df): assert es["table"].ww.index == "category" -def test_init_es_with_relationships(pd_df): +def test_init_es_with_relationships(df): second_df = pd.DataFrame({"id": [0, 1, 2, 3], "first_table_id": [1, 2, 2, 1]}) - pd_df.ww.init(name="first_table", index="id") + df.ww.init(name="first_table", index="id") second_df.ww.init(name="second_table", index="id") es = EntitySet( "es", - dataframes={"first_table": (pd_df,), "second_table": (second_df,)}, + dataframes={"first_table": (df,), "second_table": (second_df,)}, relationships=[("first_table", "id", "second_table", "first_table_id")], ) @@ -426,24 +404,6 @@ def test_add_last_time_index(es): assert isinstance(es["products"].ww.logical_types[LTI_COLUMN_NAME], Datetime) -def test_add_last_time_non_numeric_index(pd_es, spark_es, dask_es): - # Confirm that add_last_time_index works for indices that aren't numeric - # since numeric underlying indices can accidentally match the Woodwork index - pd_es.add_last_time_indexes(["products"]) - dask_es.add_last_time_indexes(["products"]) - spark_es.add_last_time_indexes(["products"]) - - assert list(to_pandas(pd_es["products"][LTI_COLUMN_NAME]).sort_index()) == list( - to_pandas(dask_es["products"][LTI_COLUMN_NAME]).sort_index(), - ) - assert list(to_pandas(pd_es["products"][LTI_COLUMN_NAME]).sort_index()) == list( - to_pandas(spark_es["products"]).sort_values("id")[LTI_COLUMN_NAME], - ) - - assert pd_es["products"].ww.schema == dask_es["products"].ww.schema - assert pd_es["products"].ww.schema == spark_es["products"].ww.schema - - def test_lti_already_has_last_time_column_name(es): col = es["customers"].ww.pop("loves_ice_cream") 
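The fixture change above is the recurring pattern in this patch: the pd_df / dd_df / spark_df trio and the parametrized df fixture collapse into a single pandas fixture. A sketch of the resulting shape, reusing the same toy frame from the hunk:

import pandas as pd
import pytest


@pytest.fixture
def df():
    # Previously pd_df, with dd_df and spark_df wrappers built via
    # pytest.importorskip and a params=["pd_df", "dd_df", "spark_df"]
    # fixture fanning each test out across backends. Pandas-only support
    # leaves just this one fixture.
    return pd.DataFrame({"id": [0, 1, 2], "category": ["a", "b", "c"]}).astype(
        {"category": "category"},
    )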
col.name = LTI_COLUMN_NAME @@ -595,10 +555,7 @@ def test_extra_woodwork_params(es): def test_replace_dataframe_errors(es): df = es["customers"].copy() - if ps and isinstance(df, ps.DataFrame): - df["new"] = [1, 2, 3] - else: - df["new"] = pd.Series([1, 2, 3]) + df["new"] = pd.Series([1, 2, 3]) error_text = "New dataframe is missing new cohort column" with pytest.raises(ValueError, match=error_text): @@ -612,59 +569,38 @@ def test_replace_dataframe_errors(es): def test_replace_dataframe_already_sorted(es): # test already_sorted on dataframe without time index df = es["sessions"].copy() - updated_id = to_pandas(df["id"]) + updated_id = df["id"] updated_id.iloc[1] = 2 updated_id.iloc[2] = 1 df = df.set_index("id", drop=False) df.index.name = None - - assert es["sessions"].ww.time_index is None - - if ps and isinstance(df, ps.DataFrame): - df["id"] = updated_id.to_list() - df = df.sort_index() - elif is_instance(df, dd, "DataFrame"): - df["id"] = updated_id - es.replace_dataframe(dataframe_name="sessions", df=df.copy(), already_sorted=False) - sessions_df = to_pandas(es["sessions"]) + sessions_df = es["sessions"] assert sessions_df["id"].iloc[1] == 2 # no sorting since time index not defined es.replace_dataframe(dataframe_name="sessions", df=df.copy(), already_sorted=True) - sessions_df = to_pandas(es["sessions"]) + sessions_df = es["sessions"] assert sessions_df["id"].iloc[1] == 2 # test already_sorted on dataframe with time index df = es["customers"].copy() - updated_signup = to_pandas(df["signup_date"]) + updated_signup = df["signup_date"] updated_signup.iloc[0] = datetime(2011, 4, 11) assert es["customers"].ww.time_index == "signup_date" - if ps and isinstance(df, ps.DataFrame): - df["signup_date"] = updated_signup.to_list() - df = df.sort_index() - else: - df["signup_date"] = updated_signup + df["signup_date"] = updated_signup es.replace_dataframe(dataframe_name="customers", df=df.copy(), already_sorted=True) - customers_df = to_pandas(es["customers"]) + customers_df = es["customers"] assert customers_df["id"].iloc[0] == 2 - # only pandas allows for sorting: es.replace_dataframe(dataframe_name="customers", df=df.copy(), already_sorted=False) - updated_customers = to_pandas(es["customers"]) - if isinstance(df, pd.DataFrame): - assert updated_customers["id"].iloc[0] == 0 - else: - assert updated_customers["id"].iloc[0] == 2 + updated_customers = es["customers"] + assert updated_customers["id"].iloc[0] == 0 def test_replace_dataframe_invalid_schema(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Invalid schema checks able to be caught by Woodwork only relevant for Pandas", - ) df = es["customers"].copy() df["id"] = pd.Series([1, 1, 1]) @@ -674,10 +610,6 @@ def test_replace_dataframe_invalid_schema(es): def test_replace_dataframe_mismatched_index(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Only pandas checks whether underlying index matches the Woodwork index", - ) df = es["customers"].copy() df["id"] = pd.Series([99, 88, 77]) @@ -698,20 +630,15 @@ def test_replace_dataframe_different_dtypes(es): incompatible_dtype_df = es["customers"].copy() incompatible_list = ["hi", "bye", "bye"] - if ps and isinstance(incompatible_dtype_df, ps.DataFrame): - incompatible_dtype_df["age"] = incompatible_list - else: - incompatible_dtype_df["age"] = pd.Series(incompatible_list) + incompatible_dtype_df["age"] = pd.Series(incompatible_list) - if isinstance(es["customers"], pd.DataFrame): - # Dask and Spark do not error on invalid type conversion until compute - error_msg 
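The simplified already_sorted assertions above depend on pandas-only behavior: replace_dataframe re-sorts rows by the time index when already_sorted=False and trusts the supplied order when already_sorted=True. A small sketch of that behavior with made-up customer data (dates and assertions are illustrative, not the fixture's values):

import pandas as pd

from featuretools import EntitySet

customers = pd.DataFrame(
    {
        "id": [0, 1, 2],
        "signup_date": pd.to_datetime(["2011-04-11", "2011-04-09", "2011-04-08"]),
    },
)

es = EntitySet(id="example")
es.add_dataframe(
    dataframe_name="customers",
    dataframe=customers,
    index="id",
    time_index="signup_date",
)

# already_sorted=False: rows are re-sorted by signup_date, so the earliest
# signup (id=2) ends up first.
es.replace_dataframe(dataframe_name="customers", df=customers.copy(), already_sorted=False)
assert es["customers"]["id"].iloc[0] == 2

# already_sorted=True: the supplied row order (id 0 first) is kept as-is.
es.replace_dataframe(dataframe_name="customers", df=customers.copy(), already_sorted=True)
assert es["customers"]["id"].iloc[0] == 0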
= "Error converting datatype for age from type object to type int64. Please confirm the underlying data is consistent with logical type Integer." - with pytest.raises(TypeConversionError, match=error_msg): - es.replace_dataframe(dataframe_name="customers", df=incompatible_dtype_df) + error_msg = "Error converting datatype for age from type object to type int64. Please confirm the underlying data is consistent with logical type Integer." + with pytest.raises(TypeConversionError, match=error_msg): + es.replace_dataframe(dataframe_name="customers", df=incompatible_dtype_df) @pytest.fixture() -def latlong_df_pandas(): +def latlong_df(): latlong_df = pd.DataFrame( { "tuples": pd.Series([(1, 2), (3, 4)]), @@ -726,30 +653,7 @@ def latlong_df_pandas(): return latlong_df -@pytest.fixture() -def latlong_df_dask(latlong_df_pandas): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - return dd.from_pandas(latlong_df_pandas, npartitions=2) - - -@pytest.fixture() -def latlong_df_spark(latlong_df_pandas): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas( - latlong_df_pandas.applymap( - lambda tup: list(tup) if isinstance(tup, tuple) else tup, - ), - ) - - -@pytest.fixture(params=["latlong_df_pandas", "latlong_df_dask", "latlong_df_spark"]) -def latlong_df(request): - return request.getfixturevalue(request.param) - - def test_replace_dataframe_data_transformation(latlong_df): - dask = pytest.importorskip("dask", reason="Dask not installed, skipping") - dask.config.set({"dataframe.convert-string": False}) initial_df = latlong_df.copy() initial_df.ww.init( name="latlongs", @@ -759,19 +663,15 @@ def test_replace_dataframe_data_transformation(latlong_df): es = EntitySet() es.add_dataframe(dataframe=initial_df) - df = to_pandas(es["latlongs"]) + df = es["latlongs"] expected_val = (1, 2) - if ps and isinstance(es["latlongs"], ps.DataFrame): - expected_val = [1, 2] for col in latlong_df.columns: series = df[col] assert series.iloc[0] == expected_val es.replace_dataframe("latlongs", latlong_df) - df = to_pandas(es["latlongs"]) + df = es["latlongs"] expected_val = (3, 4) - if ps and isinstance(es["latlongs"], ps.DataFrame): - expected_val = [3, 4] for col in latlong_df.columns: series = df[col] assert series.iloc[-1] == expected_val @@ -794,10 +694,7 @@ def test_replace_dataframe_column_order(es): def test_replace_dataframe_different_woodwork_initialized(es): df = es["customers"].copy() - if ps and isinstance(df, ps.DataFrame): - df["age"] = [1, 2, 3] - else: - df["age"] = pd.Series([1, 2, 3]) + df["age"] = pd.Series([1, 2, 3]) # Initialize Woodwork on the new DataFrame and change the schema so it won't match the original DataFrame's schema df.ww.init(schema=es["customers"].ww.schema) @@ -816,7 +713,7 @@ def test_replace_dataframe_different_woodwork_initialized(es): with pytest.warns(UserWarning, match=warning): es.replace_dataframe("customers", df, already_sorted=True) - actual = to_pandas(es["customers"]["age"]).sort_values() + actual = es["customers"]["age"].sort_values() assert all(actual == [1, 2, 3]) assert es["customers"].ww._schema == original_schema @@ -824,61 +721,14 @@ def test_replace_dataframe_different_woodwork_initialized(es): assert es["customers"]["cancel_date"].dtype == "datetime64[ns]" -@pytest.mark.skipif("not dd") -def test_replace_dataframe_different_dataframe_types(): - dask_es = EntitySet(id="dask_es") - - sessions = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "user": [1, 2, 1, 3], - "time": [ - 
pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - sessions_dask = dd.from_pandas(sessions, npartitions=2) - sessions_logical_types = { - "id": Integer, - "user": Integer, - "time": Datetime, - "strings": NaturalLanguage, - } - sessions_semantic_tags = {"user": "foreign_key"} - - dask_es.add_dataframe( - dataframe_name="sessions", - dataframe=sessions_dask, - index="id", - time_index="time", - logical_types=sessions_logical_types, - semantic_tags=sessions_semantic_tags, - ) - - with pytest.raises(TypeError, match="Incorrect DataFrame type used"): - dask_es.replace_dataframe("sessions", sessions) - - def test_replace_dataframe_and_min_last_time_index(es): es.add_last_time_indexes(["products"]) original_time_index = es["log"]["datetime"].copy() original_last_time_index = es["products"][LTI_COLUMN_NAME].copy() - if ps and isinstance(original_time_index, ps.Series): - new_time_index = ps.from_pandas( - original_time_index.to_pandas() + pd.Timedelta(days=1), - ) - expected_last_time_index = ps.from_pandas( - original_last_time_index.to_pandas() + pd.Timedelta(days=1), - ) - else: - new_time_index = original_time_index + pd.Timedelta(days=1) - expected_last_time_index = original_last_time_index + pd.Timedelta(days=1) + new_time_index = original_time_index + pd.Timedelta(days=1) + expected_last_time_index = original_last_time_index + pd.Timedelta(days=1) new_dataframe = es["log"].copy() new_dataframe["datetime"] = new_time_index @@ -886,14 +736,13 @@ def test_replace_dataframe_and_min_last_time_index(es): es.replace_dataframe("log", new_dataframe, recalculate_last_time_indexes=True) - # Spark reorders indices during last time index, so we sort to confirm individual values are the same pd.testing.assert_series_equal( - to_pandas(es["products"][LTI_COLUMN_NAME]).sort_index(), - to_pandas(expected_last_time_index).sort_index(), + es["products"][LTI_COLUMN_NAME].sort_index(), + expected_last_time_index.sort_index(), ) pd.testing.assert_series_equal( - to_pandas(es["log"][LTI_COLUMN_NAME]).sort_index(), - to_pandas(new_time_index).sort_index(), + es["log"][LTI_COLUMN_NAME].sort_index(), + new_time_index.sort_index(), check_names=False, ) @@ -904,12 +753,7 @@ def test_replace_dataframe_dont_recalculate_last_time_index_present(es): original_time_index = es["customers"]["signup_date"].copy() original_last_time_index = es["customers"][LTI_COLUMN_NAME].copy() - if ps and isinstance(original_time_index, ps.Series): - new_time_index = ps.from_pandas( - original_time_index.to_pandas() + pd.Timedelta(days=10), - ) - else: - new_time_index = original_time_index + pd.Timedelta(days=10) + new_time_index = original_time_index + pd.Timedelta(days=10) new_dataframe = es["customers"].copy() new_dataframe["signup_date"] = new_time_index @@ -920,8 +764,8 @@ def test_replace_dataframe_dont_recalculate_last_time_index_present(es): recalculate_last_time_indexes=False, ) pd.testing.assert_series_equal( - to_pandas(es["customers"][LTI_COLUMN_NAME], sort_index=True), - to_pandas(original_last_time_index, sort_index=True), + es["customers"][LTI_COLUMN_NAME], + original_last_time_index, ) @@ -932,12 +776,7 @@ def test_replace_dataframe_dont_recalculate_last_time_index_not_present(es): original_time_index = es["customers"]["signup_date"].copy() - if ps and isinstance(original_time_index, ps.Series): - new_time_index = ps.from_pandas( - original_time_index.to_pandas() + 
pd.Timedelta(days=10), - ) - else: - new_time_index = original_time_index + pd.Timedelta(days=10) + new_time_index = original_time_index + pd.Timedelta(days=10) new_dataframe = es["customers"].copy() new_dataframe["signup_date"] = new_time_index @@ -957,12 +796,7 @@ def test_replace_dataframe_recalculate_last_time_index_not_present(es): original_time_index = es["log"]["datetime"].copy() - if ps and isinstance(original_time_index, ps.Series): - new_time_index = ps.from_pandas( - original_time_index.to_pandas() + pd.Timedelta(days=10), - ) - else: - new_time_index = original_time_index + pd.Timedelta(days=10) + new_time_index = original_time_index + pd.Timedelta(days=10) new_dataframe = es["log"].copy() new_dataframe["datetime"] = new_time_index @@ -970,13 +804,13 @@ def test_replace_dataframe_recalculate_last_time_index_not_present(es): es.replace_dataframe("log", new_dataframe, recalculate_last_time_indexes=True) pd.testing.assert_series_equal( - to_pandas(es["log"]["datetime"]).sort_index(), - to_pandas(new_time_index).sort_index(), + es["log"]["datetime"].sort_index(), + new_time_index.sort_index(), check_names=False, ) pd.testing.assert_series_equal( - to_pandas(es["log"][LTI_COLUMN_NAME]).sort_index(), - to_pandas(new_time_index).sort_index(), + es["log"][LTI_COLUMN_NAME].sort_index(), + new_time_index.sort_index(), check_names=False, ) @@ -986,12 +820,7 @@ def test_replace_dataframe_recalculate_last_time_index_present(es): original_time_index = es["log"]["datetime"].copy() - if ps and isinstance(original_time_index, ps.Series): - new_time_index = ps.from_pandas( - original_time_index.to_pandas() + pd.Timedelta(days=10), - ) - else: - new_time_index = original_time_index + pd.Timedelta(days=10) + new_time_index = original_time_index + pd.Timedelta(days=10) new_dataframe = es["log"].copy() new_dataframe["datetime"] = new_time_index @@ -999,13 +828,13 @@ def test_replace_dataframe_recalculate_last_time_index_present(es): es.replace_dataframe("log", new_dataframe, recalculate_last_time_indexes=True) pd.testing.assert_series_equal( - to_pandas(es["log"]["datetime"]).sort_index(), - to_pandas(new_time_index).sort_index(), + es["log"]["datetime"].sort_index(), + new_time_index.sort_index(), check_names=False, ) pd.testing.assert_series_equal( - to_pandas(es["log"][LTI_COLUMN_NAME]).sort_index(), - to_pandas(new_time_index).sort_index(), + es["log"][LTI_COLUMN_NAME].sort_index(), + new_time_index.sort_index(), check_names=False, ) diff --git a/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_agg_primitives.py b/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_agg_primitives.py index 2dccb2da37..24fc23d737 100644 --- a/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_agg_primitives.py +++ b/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_agg_primitives.py @@ -654,11 +654,11 @@ def test_empty(self): given_answer = primitive_func(case) assert pd.isna(given_answer) - def test_with_featuretools(self, pd_es): + def test_with_featuretools(self, es): transform, aggregation = find_applicable_primitives(self.primitive) primitive_instance = self.primitive() aggregation.append(primitive_instance) - valid_dfs(pd_es, aggregation, transform, self.primitive) + valid_dfs(es, aggregation, transform, self.primitive) def test_serialize(self, es): check_serialize(self.primitive, es, target_dataframe_name="sessions") diff --git a/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_rolling_primitive.py 
b/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_rolling_primitive.py index bd85172c1d..d56ed36f39 100644 --- a/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_rolling_primitive.py +++ b/featuretools/tests/primitive_tests/aggregation_primitive_tests/test_rolling_primitive.py @@ -27,12 +27,12 @@ ], ) @pytest.mark.parametrize("min_periods", [1, 0, 2, 5]) -def test_rolling_max(min_periods, window_length, gap, window_series_pd): +def test_rolling_max(min_periods, window_length, gap, window_series): gap_num = get_number_from_offset(gap) window_length_num = get_number_from_offset(window_length) # Since we're using a uniform series we can check correctness using numeric parameters expected_vals = apply_rolling_agg_to_series( - window_series_pd, + window_series, lambda x: x.max(), window_length_num, gap=gap_num, @@ -47,7 +47,7 @@ def test_rolling_max(min_periods, window_length, gap, window_series_pd): primitive_func = primitive_instance.get_function() actual_vals = pd.Series( - primitive_func(window_series_pd.index, pd.Series(window_series_pd.values)), + primitive_func(window_series.index, pd.Series(window_series.values)), ) # Since min_periods of 0 is the same as min_periods of 1 @@ -67,13 +67,13 @@ def test_rolling_max(min_periods, window_length, gap, window_series_pd): ], ) @pytest.mark.parametrize("min_periods", [1, 0, 2, 5]) -def test_rolling_min(min_periods, window_length, gap, window_series_pd): +def test_rolling_min(min_periods, window_length, gap, window_series): gap_num = get_number_from_offset(gap) window_length_num = get_number_from_offset(window_length) # Since we're using a uniform series we can check correctness using numeric parameters expected_vals = apply_rolling_agg_to_series( - window_series_pd, + window_series, lambda x: x.min(), window_length_num, gap=gap_num, @@ -88,7 +88,7 @@ def test_rolling_min(min_periods, window_length, gap, window_series_pd): primitive_func = primitive_instance.get_function() actual_vals = pd.Series( - primitive_func(window_series_pd.index, pd.Series(window_series_pd.values)), + primitive_func(window_series.index, pd.Series(window_series.values)), ) # Since min_periods of 0 is the same as min_periods of 1 @@ -108,13 +108,13 @@ def test_rolling_min(min_periods, window_length, gap, window_series_pd): ], ) @pytest.mark.parametrize("min_periods", [1, 0, 2, 5]) -def test_rolling_mean(min_periods, window_length, gap, window_series_pd): +def test_rolling_mean(min_periods, window_length, gap, window_series): gap_num = get_number_from_offset(gap) window_length_num = get_number_from_offset(window_length) # Since we're using a uniform series we can check correctness using numeric parameters expected_vals = apply_rolling_agg_to_series( - window_series_pd, + window_series, np.mean, window_length_num, gap=gap_num, @@ -129,7 +129,7 @@ def test_rolling_mean(min_periods, window_length, gap, window_series_pd): primitive_func = primitive_instance.get_function() actual_vals = pd.Series( - primitive_func(window_series_pd.index, pd.Series(window_series_pd.values)), + primitive_func(window_series.index, pd.Series(window_series.values)), ) # Since min_periods of 0 is the same as min_periods of 1 @@ -149,13 +149,13 @@ def test_rolling_mean(min_periods, window_length, gap, window_series_pd): ], ) @pytest.mark.parametrize("min_periods", [1, 0, 2, 5]) -def test_rolling_std(min_periods, window_length, gap, window_series_pd): +def test_rolling_std(min_periods, window_length, gap, window_series): gap_num = get_number_from_offset(gap) 
window_length_num = get_number_from_offset(window_length) # Since we're using a uniform series we can check correctness using numeric parameters expected_vals = apply_rolling_agg_to_series( - window_series_pd, + window_series, lambda x: x.std(), window_length_num, gap=gap_num, @@ -170,7 +170,7 @@ def test_rolling_std(min_periods, window_length, gap, window_series_pd): primitive_func = primitive_instance.get_function() actual_vals = pd.Series( - primitive_func(window_series_pd.index, pd.Series(window_series_pd.values)), + primitive_func(window_series.index, pd.Series(window_series.values)), ) # Since min_periods of 0 is the same as min_periods of 1 @@ -194,12 +194,12 @@ def test_rolling_std(min_periods, window_length, gap, window_series_pd): ("6d", "7d"), ], ) -def test_rolling_count(window_length, gap, window_series_pd): +def test_rolling_count(window_length, gap, window_series): gap_num = get_number_from_offset(gap) window_length_num = get_number_from_offset(window_length) expected_vals = apply_rolling_agg_to_series( - window_series_pd, + window_series, lambda x: x.count(), window_length_num, gap=gap_num, @@ -212,7 +212,7 @@ def test_rolling_count(window_length, gap, window_series_pd): ) primitive_func = primitive_instance.get_function() - actual_vals = pd.Series(primitive_func(window_series_pd.index)) + actual_vals = pd.Series(primitive_func(window_series.index)) num_nans = gap_num + window_length_num - 1 assert actual_vals.isna().sum() == num_nans @@ -234,7 +234,7 @@ def test_rolling_count_primitive_min_periods_nans( gap, min_periods, expected_num_nams, - window_series_pd, + window_series, ): primitive_instance = RollingCount( window_length=window_length, @@ -242,7 +242,7 @@ def test_rolling_count_primitive_min_periods_nans( min_periods=min_periods, ) primitive_func = primitive_instance.get_function() - vals = pd.Series(primitive_func(window_series_pd.index)) + vals = pd.Series(primitive_func(window_series.index)) assert vals.isna().sum() == expected_num_nams @@ -257,7 +257,7 @@ def test_rolling_count_with_no_gap( gap, min_periods, expected_num_nams, - window_series_pd, + window_series, ): primitive_instance = RollingCount( window_length=window_length, @@ -265,7 +265,7 @@ def test_rolling_count_with_no_gap( min_periods=min_periods, ) primitive_func = primitive_instance.get_function() - vals = pd.Series(primitive_func(window_series_pd.index)) + vals = pd.Series(primitive_func(window_series.index)) assert vals.isna().sum() == expected_num_nams @@ -312,18 +312,18 @@ def test_rolling_count_with_no_gap( ), ], ) -def test_rolling_trend(window_length, gap, expected_vals, window_series_pd): +def test_rolling_trend(window_length, gap, expected_vals, window_series): primitive_instance = RollingTrend(window_length=window_length, gap=gap) - actual_vals = primitive_instance(window_series_pd.index, window_series_pd.values) + actual_vals = primitive_instance(window_series.index, window_series.values) pd.testing.assert_series_equal(pd.Series(expected_vals), pd.Series(actual_vals)) -def test_rolling_trend_window_length_less_than_three(window_series_pd): +def test_rolling_trend_window_length_less_than_three(window_series): primitive_instance = RollingTrend(window_length=2) - vals = primitive_instance(window_series_pd.index, window_series_pd.values) + vals = primitive_instance(window_series.index, window_series.values) for v in vals: assert np.isnan(v) @@ -441,7 +441,7 @@ def test_rolling_outlier_count( min_periods, window_length, gap, - rolling_outlier_series_pd, + rolling_outlier_series, ): 
primitive_instance = RollingOutlierCount( window_length=window_length, @@ -453,13 +453,13 @@ def test_rolling_outlier_count( actual_vals = pd.Series( primitive_func( - rolling_outlier_series_pd.index, - pd.Series(rolling_outlier_series_pd.values), + rolling_outlier_series.index, + pd.Series(rolling_outlier_series.values), ), ) expected_vals = apply_rolling_agg_to_series( - series=rolling_outlier_series_pd, + series=rolling_outlier_series, agg_func=primitive_instance.get_outliers_count, window_length=window_length, gap=gap, diff --git a/featuretools/tests/primitive_tests/test_agg_feats.py b/featuretools/tests/primitive_tests/test_agg_feats.py index 54597cd729..c32bf328eb 100644 --- a/featuretools/tests/primitive_tests/test_agg_feats.py +++ b/featuretools/tests/primitive_tests/test_agg_feats.py @@ -34,8 +34,7 @@ ) from featuretools.primitives.base import AggregationPrimitive from featuretools.synthesis.deep_feature_synthesis import DeepFeatureSynthesis, match -from featuretools.tests.testing_utils import backward_path, feature_with_name, to_pandas -from featuretools.utils.gen_utils import Library +from featuretools.tests.testing_utils import backward_path, feature_with_name @pytest.fixture(autouse=True) @@ -85,7 +84,7 @@ def test_makes_count(es): assert feature_with_name(features, "customers.COUNT(log)") -def test_count_null(pd_es): +def test_count_null(es): class Count(AggregationPrimitive): name = "count" input_types = [[ColumnSchema(semantic_tags={"foreign_key"})], [ColumnSchema()]] @@ -115,11 +114,11 @@ def generate_name( return "COUNT(%s%s%s)" % (relationship_path_name, where_str, use_prev_str) count_null = Feature( - pd_es["log"].ww["value"], + es["log"].ww["value"], parent_dataframe_name="sessions", primitive=Count(count_null=True), ) - feature_matrix = calculate_feature_matrix([count_null], entityset=pd_es) + feature_matrix = calculate_feature_matrix([count_null], entityset=es) values = [5, 4, 1, 2, 3, 2] assert (values == feature_matrix[count_null.get_name()]).all() @@ -199,15 +198,6 @@ def test_init_and_name(es): assert getattr(attr, "name") is not None agg_primitives = get_aggregation_primitives().values() - # If Dask EntitySet use only Dask compatible primitives - if es.dataframe_type == Library.DASK: - agg_primitives = [ - prim for prim in agg_primitives if Library.DASK in prim.compatibility - ] - if es.dataframe_type == Library.SPARK: - agg_primitives = [ - prim for prim in agg_primitives if Library.SPARK in prim.compatibility - ] for agg_prim in agg_primitives: input_types = agg_prim.input_types @@ -408,15 +398,15 @@ def test_serialization(es): _assert_agg_feats_equal(max2, deserialized) -def test_time_since_last(pd_es): +def test_time_since_last(es): f = Feature( - pd_es["log"].ww["datetime"], + es["log"].ww["datetime"], parent_dataframe_name="customers", primitive=TimeSinceLast, ) fm = calculate_feature_matrix( [f], - entityset=pd_es, + entityset=es, instance_ids=[0, 1, 2], cutoff_time=datetime(2015, 6, 8), ) @@ -426,15 +416,15 @@ def test_time_since_last(pd_es): assert all(fm[f.get_name()].round().values == correct) -def test_time_since_first(pd_es): +def test_time_since_first(es): f = Feature( - pd_es["log"].ww["datetime"], + es["log"].ww["datetime"], parent_dataframe_name="customers", primitive=TimeSinceFirst, ) fm = calculate_feature_matrix( [f], - entityset=pd_es, + entityset=es, instance_ids=[0, 1, 2], cutoff_time=datetime(2015, 6, 8), ) @@ -444,15 +434,15 @@ def test_time_since_first(pd_es): assert all(fm[f.get_name()].round().values == correct) -def 
test_median(pd_es): +def test_median(es): f = Feature( - pd_es["log"].ww["value_many_nans"], + es["log"].ww["value_many_nans"], parent_dataframe_name="customers", primitive=Median, ) fm = calculate_feature_matrix( [f], - entityset=pd_es, + entityset=es, instance_ids=[0, 1, 2], cutoff_time=datetime(2015, 6, 8), ) @@ -468,9 +458,6 @@ def test_agg_same_method_name(es): can't differentiate them. We have a work around to this based on the name property that we test here. """ - # TODO: Update to work with Dask and Spark - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Need to update to work with Dask and Spark EntitySets") # test with normally defined functions class Sum(AggregationPrimitive): @@ -540,7 +527,7 @@ def get_function(self): assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()] -def test_time_since_last_custom(pd_es): +def test_time_since_last_custom(es): class TimeSinceLast(AggregationPrimitive): name = "time_since_last" input_types = [ @@ -557,13 +544,13 @@ def time_since_last(values, time): return time_since_last f = Feature( - pd_es["log"].ww["datetime"], + es["log"].ww["datetime"], parent_dataframe_name="customers", primitive=TimeSinceLast, ) fm = calculate_feature_matrix( [f], - entityset=pd_es, + entityset=es, instance_ids=[0, 1, 2], cutoff_time=datetime(2015, 6, 8), ) @@ -573,7 +560,7 @@ def time_since_last(values, time): assert all(fm[f.get_name()].round().values == correct) -def test_custom_primitive_multiple_inputs(pd_es): +def test_custom_primitive_multiple_inputs(es): class MeanSunday(AggregationPrimitive): name = "mean_sunday" input_types = [ @@ -594,7 +581,7 @@ def mean_sunday(numeric, datetime): return mean_sunday fm, features = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="sessions", agg_primitives=[MeanSunday], trans_primitives=[], @@ -604,10 +591,10 @@ def mean_sunday(numeric, datetime): for x, y in iterator: assert (pd.isnull(x) and pd.isnull(y)) or (x == y) - pd_es.add_interesting_values() + es.add_interesting_values() mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None]) fm, features = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="sessions", agg_primitives=[MeanSunday], trans_primitives=[], @@ -648,8 +635,6 @@ def __init__(self, n=1): def test_makes_numtrue(es): - if es.dataframe_type == Library.SPARK: - pytest.xfail("Spark EntitySets do not support NumTrue primitive") dfs = DeepFeatureSynthesis( target_dataframe_name="sessions", entityset=es, @@ -661,7 +646,7 @@ def test_makes_numtrue(es): assert feature_with_name(features, "NUM_TRUE(log.purchased)") -def test_make_three_most_common(pd_es): +def test_make_three_most_common(es): class NMostCommoner(AggregationPrimitive): name = "pd_top3" input_types = ([ColumnSchema(semantic_tags={"category"})],) @@ -681,7 +666,7 @@ def pd_top3(x): return pd_top3 fm, features = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", instance_ids=[0, 1, 2], agg_primitives=[NMostCommoner], @@ -710,10 +695,10 @@ def pd_top3(x): ) -def test_stacking_multi(pd_es): +def test_stacking_multi(es): threecommon = NMostCommon(3) tc = Feature( - pd_es["log"].ww["product_id"], + es["log"].ww["product_id"], parent_dataframe_name="sessions", primitive=threecommon, ) @@ -724,7 +709,7 @@ def test_stacking_multi(pd_es): Feature(tc[i], parent_dataframe_name="customers", primitive=NumUnique), ) - fm = calculate_feature_matrix(stacked, entityset=pd_es, instance_ids=[0, 1, 2]) + fm = calculate_feature_matrix(stacked, entityset=es, instance_ids=[0, 1, 2]) 
correct_vals = [[3, 2, 1], [2, 1, 0], [0, 0, 0]] correct_vals1 = [[3, 1, 1], [2, 1, 0], [0, 0, 0]] @@ -755,7 +740,6 @@ def test_use_previous_pd_dateoffset(es): cutoff_time=pd.Timestamp("2011-04-11 10:31:30"), instance_ids=[0, 1, 2], ) - feature_matrix = to_pandas(feature_matrix, index="id", sort_index=True) col_name = list(feature_matrix.head().keys())[0] assert (feature_matrix[col_name] == [1, 5, 2]).all() @@ -768,7 +752,7 @@ def _assert_agg_feats_equal(f1, f2): assert f1.use_previous == f2.use_previous -def test_override_multi_feature_names(pd_es): +def test_override_multi_feature_names(es): def gen_custom_names( primitive, base_feature_names, @@ -807,7 +791,7 @@ def generate_names( ) fm, features = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="products", instance_ids=[0, 1, 2], agg_primitives=[NMostCommoner], diff --git a/featuretools/tests/primitive_tests/test_dask_primitives.py b/featuretools/tests/primitive_tests/test_dask_primitives.py deleted file mode 100644 index 6d66f2b681..0000000000 --- a/featuretools/tests/primitive_tests/test_dask_primitives.py +++ /dev/null @@ -1,122 +0,0 @@ -import pandas as pd -import pytest - -from featuretools import calculate_feature_matrix, dfs, list_primitives -from featuretools.feature_base.cache import feature_cache -from featuretools.primitives import get_aggregation_primitives, get_transform_primitives -from featuretools.tests.testing_utils import to_pandas -from featuretools.utils.gen_utils import Library - -UNSUPPORTED = [ - p.name - for p in get_transform_primitives().values() - if Library.DASK not in p.compatibility -] -UNSUPPORTED += [ - p.name - for p in get_aggregation_primitives().values() - if Library.DASK not in p.compatibility -] - - -@pytest.fixture(autouse=True) -def reset_dfs_cache(): - feature_cache.enabled = False - feature_cache.clear_all() - - -def test_transform(pd_es, dask_es): - pytest.skip( - "TODO: Dask issue with `series.eq`. Fix once Dask Issue #7957 is closed.", - ) - primitives = list_primitives() - trans_list = primitives[primitives["type"] == "transform"]["name"].tolist() - trans_primitives = [prim for prim in trans_list if prim not in UNSUPPORTED] - agg_primitives = [] - cutoff_time = pd.Timestamp("2019-01-05 04:00") - - assert pd_es == dask_es - - # Run DFS using each dataframe as a target and confirm results match - for df in pd_es.dataframes: - features = dfs( - entityset=pd_es, - target_dataframe_name=df.ww.name, - trans_primitives=trans_primitives, - agg_primitives=agg_primitives, - max_depth=2, - features_only=True, - ) - - dask_features = dfs( - entityset=dask_es, - target_dataframe_name=df.ww.name, - trans_primitives=trans_primitives, - agg_primitives=agg_primitives, - max_depth=2, - features_only=True, - ) - assert features == dask_features - - # Calculate feature matrix values to confirm output is the same between dask and pandas. - # Not testing on all returned features due to long run times. 
- fm = calculate_feature_matrix( - features=features[:100], - entityset=pd_es, - cutoff_time=cutoff_time, - ) - dask_fm = calculate_feature_matrix( - features=dask_features[:100], - entityset=dask_es, - cutoff_time=cutoff_time, - ) - - # Categorical categories can be ordered differently, this makes sure they are the same - dask_fm = dask_fm.astype(fm.dtypes) - - # Use the same columns and make sure both indexes are sorted the same - dask_computed_fm = ( - dask_fm.compute().set_index(df.ww.index).loc[fm.index][fm.columns] - ) - pd.testing.assert_frame_equal(fm, dask_computed_fm) - - -def test_aggregation(pd_es, dask_es): - primitives = list_primitives() - trans_primitives = [] - agg_list = primitives[primitives["type"] == "aggregation"]["name"].tolist() - agg_primitives = [prim for prim in agg_list if prim not in UNSUPPORTED] - - assert pd_es == dask_es - - # Run DFS using each dataframe as a target and confirm results match - for df in pd_es.dataframes: - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name=df.ww.name, - trans_primitives=trans_primitives, - agg_primitives=agg_primitives, - cutoff_time=pd.Timestamp("2019-01-05 04:00"), - max_depth=2, - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name=df.ww.name, - trans_primitives=trans_primitives, - agg_primitives=agg_primitives, - cutoff_time=pd.Timestamp("2019-01-05 04:00"), - max_depth=2, - ) - - # Categorical categories can be ordered differently, this makes sure they - # are the same, including the index column - index_col = df.ww.index - fm = fm.reset_index() - dask_fm = dask_fm.astype(fm.dtypes) - fm = fm.set_index(index_col) - - pd.testing.assert_frame_equal( - fm.sort_index(), - to_pandas(dask_fm, index=index_col, sort_index=True), - ) diff --git a/featuretools/tests/primitive_tests/test_direct_features.py b/featuretools/tests/primitive_tests/test_direct_features.py index ab840f0d30..b64a44f1c3 100644 --- a/featuretools/tests/primitive_tests/test_direct_features.py +++ b/featuretools/tests/primitive_tests/test_direct_features.py @@ -22,8 +22,6 @@ ) from featuretools.primitives.utils import PrimitivesDeserializer from featuretools.synthesis import dfs -from featuretools.tests.testing_utils import to_pandas -from featuretools.utils.gen_utils import Library def test_direct_from_identity(es): @@ -33,12 +31,8 @@ def test_direct_from_identity(es): feature_set = FeatureSet([d]) calculator = FeatureSetCalculator(es, feature_set=feature_set, time_last=None) df = calculator.run(np.array([0, 5])) - df = to_pandas(df, index="id", sort_index=True) v = df[d.get_name()].tolist() - if es.dataframe_type == Library.SPARK: - expected = ["0", "1"] - else: - expected = [0, 1] + expected = [0, 1] assert v == expected @@ -50,12 +44,8 @@ def test_direct_from_column(es): feature_set = FeatureSet([d]) calculator = FeatureSetCalculator(es, feature_set=feature_set, time_last=None) df = calculator.run(np.array([0, 5])) - df = to_pandas(df, index="id", sort_index=True) v = df[d.get_name()].tolist() - if es.dataframe_type == Library.SPARK: - expected = ["0", "1"] - else: - expected = [0, 1] + expected = [0, 1] assert v == expected @@ -108,10 +98,6 @@ def test_direct_copy(games_es): def test_direct_of_multi_output_transform_feat(es): - # TODO: Update to work with Dask and Spark - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Custom primitive is not compatible with Dask or Spark") - class TestTime(TransformPrimitive): name = "test_time" input_types = [ColumnSchema(logical_type=Datetime)] @@ -151,14 +137,14 @@ def test_f(x): assert 
(fm[col1] == fm[col2]).all() -def test_direct_features_of_multi_output_agg_primitives(pd_es): +def test_direct_features_of_multi_output_agg_primitives(es): class ThreeMostCommonCat(AggregationPrimitive): name = "n_most_common_categorical" input_types = [ColumnSchema(semantic_tags={"category"})] return_type = ColumnSchema(semantic_tags={"category"}) number_output_features = 3 - def get_function(self, agg_type="pandas"): + def get_function(self): def pd_top3(x): counts = x.value_counts() counts = counts[counts > 0] @@ -171,7 +157,7 @@ def pd_top3(x): return pd_top3 fm, fl = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="log", agg_primitives=[ThreeMostCommonCat], trans_primitives=[], diff --git a/featuretools/tests/primitive_tests/test_feature_serialization.py b/featuretools/tests/primitive_tests/test_feature_serialization.py index d2ca60af2c..2a3a01dc34 100644 --- a/featuretools/tests/primitive_tests/test_feature_serialization.py +++ b/featuretools/tests/primitive_tests/test_feature_serialization.py @@ -54,7 +54,6 @@ ) from featuretools.primitives.base import AggregationPrimitive from featuretools.tests.testing_utils import check_names -from featuretools.utils.gen_utils import Library from featuretools.version import ENTITYSET_SCHEMA_VERSION, FEATURES_SCHEMA_VERSION BUCKET_NAME = "test-bucket" @@ -81,11 +80,6 @@ def assert_features(original, deserialized): assert feat_1.unique_name() == feat_2.unique_name() assert feat_1.entityset == feat_2.entityset - # IdentityFeature and DirectFeature objects do not have primitives, so - # series library does not need to be compared - if not (isinstance(feat_1, (IdentityFeature, DirectFeature))): - assert feat_1.primitive.series_library == feat_2.primitive.series_library - def pickle_features_test_helper(es_size, features_original, dir_path): filepath = os.path.join(dir_path, "test_feature") @@ -123,7 +117,7 @@ def test_pickle_features(es, tmp_path): pickle_features_test_helper(asizeof(es), features_original, str(tmp_path)) -def test_pickle_features_with_custom_primitive(pd_es, tmp_path): +def test_pickle_features_with_custom_primitive(es, tmp_path): class NewMax(AggregationPrimitive): name = "new_max" input_types = [ColumnSchema(semantic_tags={"numeric"})] @@ -131,13 +125,13 @@ class NewMax(AggregationPrimitive): features_original = dfs( target_dataframe_name="sessions", - entityset=pd_es, + entityset=es, agg_primitives=["Last", "Mean", NewMax], features_only=True, ) assert any([isinstance(feat.primitive, NewMax) for feat in features_original]) - pickle_features_test_helper(asizeof(pd_es), features_original, str(tmp_path)) + pickle_features_test_helper(asizeof(es), features_original, str(tmp_path)) def test_serialized_renamed_features(es): @@ -281,7 +275,7 @@ def test_s3_test_profile(es, s3_client, s3_bucket, setup_test_profile, profile_n @pytest.mark.parametrize("url,profile_name", [(S3_URL, False), (URL, None)]) -def test_deserialize_features_s3(pd_es, url, profile_name): +def test_deserialize_features_s3(es, url, profile_name): agg_primitives = [ Sum, Std, @@ -299,7 +293,7 @@ def test_deserialize_features_s3(pd_es, url, profile_name): features_original = dfs( target_dataframe_name="sessions", - entityset=pd_es, + entityset=es, features_only=True, agg_primitives=agg_primitives, trans_primitives=trans_primitives, @@ -320,7 +314,7 @@ def test_serialize_url(es): save_features(features_original, URL) -def test_custom_feature_names_retained_during_serialization(pd_es, tmp_path): +def 
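The get_function change above reflects that primitives no longer take an agg_type argument now that pandas is the only backend. A sketch of a custom aggregation primitive under the new signature (MeanOfSquares is a hypothetical example, not a library primitive):

from woodwork.column_schema import ColumnSchema

from featuretools.primitives.base import AggregationPrimitive


class MeanOfSquares(AggregationPrimitive):
    name = "mean_of_squares"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def get_function(self):  # previously get_function(self, agg_type="pandas")
        def mean_of_squares(values):
            # values is a pandas Series holding the child dataframe's numeric column
            return (values**2).mean()

        return mean_of_squares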
test_custom_feature_names_retained_during_serialization(es, tmp_path): class MultiCumulative(TransformPrimitive): name = "multi_cum_sum" input_types = [ColumnSchema(semantic_tags={"numeric"})] @@ -328,16 +322,16 @@ class MultiCumulative(TransformPrimitive): number_output_features = 3 multi_output_trans_feat = Feature( - pd_es["log"].ww["value"], + es["log"].ww["value"], primitive=MultiCumulative, ) groupby_trans_feat = GroupByTransformFeature( - pd_es["log"].ww["value"], + es["log"].ww["value"], primitive=MultiCumulative, - groupby=pd_es["log"].ww["product_id"], + groupby=es["log"].ww["product_id"], ) multi_output_agg_feat = Feature( - pd_es["log"].ww["product_id"], + es["log"].ww["product_id"], parent_dataframe_name="customers", primitive=NMostCommon(n=2), ) @@ -459,40 +453,39 @@ def test_deserializer_uses_common_primitive_instances_with_args(es, tmp_path): assert new_scalar1_primitive.value == 1 assert new_scalar5_primitive.value == 5 - # Test primitive with multiple args - pandas only due to primitive compatibility - if es.dataframe_type == Library.PANDAS: - distance_to_holiday = DistanceToHoliday( - holiday="Canada Day", - country="Canada", - ) - features = dfs( - entityset=es, - target_dataframe_name="customers", - features_only=True, - agg_primitives=[], - trans_primitives=[distance_to_holiday], - ) + # Test primitive with multiple args + distance_to_holiday = DistanceToHoliday( + holiday="Canada Day", + country="Canada", + ) + features = dfs( + entityset=es, + target_dataframe_name="customers", + features_only=True, + agg_primitives=[], + trans_primitives=[distance_to_holiday], + ) - distance_features = [ - f for f in features if f.primitive.name == "distance_to_holiday" - ] + distance_features = [ + f for f in features if f.primitive.name == "distance_to_holiday" + ] - assert len(distance_features) > 1 + assert len(distance_features) > 1 - # DFS should use the the passed in primitive instance for all features - assert all([f.primitive is distance_to_holiday for f in distance_features]) + # DFS should use the the passed in primitive instance for all features + assert all([f.primitive is distance_to_holiday for f in distance_features]) - file = os.path.join(tmp_path, "distance_features.json") - save_features(distance_features, file) - new_distance_features = load_features(file) + file = os.path.join(tmp_path, "distance_features.json") + save_features(distance_features, file) + new_distance_features = load_features(file) - # After deserialization all features that share a primitive should use the same primitive instance - new_distance_primitive = new_distance_features[0].primitive - assert all( - [f.primitive is new_distance_primitive for f in new_distance_features], - ) - assert new_distance_primitive.holiday == "Canada Day" - assert new_distance_primitive.country == "Canada" + # After deserialization all features that share a primitive should use the same primitive instance + new_distance_primitive = new_distance_features[0].primitive + assert all( + [f.primitive is new_distance_primitive for f in new_distance_features], + ) + assert new_distance_primitive.holiday == "Canada Day" + assert new_distance_primitive.country == "Canada" # Test primitive with list arg is_in = IsIn(list_of_outputs=[5, True, "coke zero"]) @@ -520,7 +513,7 @@ def test_deserializer_uses_common_primitive_instances_with_args(es, tmp_path): assert new_is_in_primitive.list_of_outputs == [5, True, "coke zero"] -def test_can_serialize_word_set_for_number_of_common_words_feature(pd_es): +def 
test_can_serialize_word_set_for_number_of_common_words_feature(es): # The word_set argument is passed in as a set, which is not JSON-serializable. # This test checks internal logic that converts the set to a list so it can be serialized common_word_set = {"hello", "my"} diff --git a/featuretools/tests/primitive_tests/test_groupby_transform_primitives.py b/featuretools/tests/primitive_tests/test_groupby_transform_primitives.py index 6536deee68..8c4be78248 100644 --- a/featuretools/tests/primitive_tests/test_groupby_transform_primitives.py +++ b/featuretools/tests/primitive_tests/test_groupby_transform_primitives.py @@ -141,16 +141,16 @@ def test_regular(self): np.testing.assert_array_equal(function(group), answer) -def test_cum_sum(pd_es): - log_value_feat = IdentityFeature(pd_es["log"].ww["value"]) +def test_cum_sum(es): + log_value_feat = IdentityFeature(es["log"].ww["value"]) dfeat = Feature( - IdentityFeature(pd_es["sessions"].ww["device_type"]), + IdentityFeature(es["sessions"].ww["device_type"]), dataframe_name="log", ) cum_sum = Feature(log_value_feat, groupby=dfeat, primitive=CumSum) features = [cum_sum] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -161,16 +161,16 @@ def test_cum_sum(pd_es): assert v == cvalues[i] -def test_cum_min(pd_es): - log_value_feat = IdentityFeature(pd_es["log"].ww["value"]) +def test_cum_min(es): + log_value_feat = IdentityFeature(es["log"].ww["value"]) cum_min = Feature( log_value_feat, - groupby=IdentityFeature(pd_es["log"].ww["session_id"]), + groupby=IdentityFeature(es["log"].ww["session_id"]), primitive=CumMin, ) features = [cum_min] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -181,16 +181,16 @@ def test_cum_min(pd_es): assert v == cvalues[i] -def test_cum_max(pd_es): - log_value_feat = IdentityFeature(pd_es["log"].ww["value"]) +def test_cum_max(es): + log_value_feat = IdentityFeature(es["log"].ww["value"]) cum_max = Feature( log_value_feat, - groupby=IdentityFeature(pd_es["log"].ww["session_id"]), + groupby=IdentityFeature(es["log"].ww["session_id"]), primitive=CumMax, ) features = [cum_max] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -201,9 +201,9 @@ def test_cum_max(pd_es): assert v == cvalues[i] -def test_cum_sum_group_on_nan(pd_es): - log_value_feat = IdentityFeature(pd_es["log"].ww["value"]) - pd_es["log"]["product_id"] = ( +def test_cum_sum_group_on_nan(es): + log_value_feat = IdentityFeature(es["log"].ww["value"]) + es["log"]["product_id"] = ( ["coke zero"] * 3 + ["car"] * 2 + ["toothpaste"] * 3 @@ -212,15 +212,15 @@ def test_cum_sum_group_on_nan(pd_es): + [np.nan] * 4 + ["coke_zero"] * 2 ) - pd_es["log"]["value"][16] = 10 + es["log"]["value"][16] = 10 cum_sum = Feature( log_value_feat, - groupby=IdentityFeature(pd_es["log"].ww["product_id"]), + groupby=IdentityFeature(es["log"].ww["product_id"]), primitive=CumSum, ) features = [cum_sum] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(17), ) @@ -254,7 +254,7 @@ def test_cum_sum_group_on_nan(pd_es): assert v == cvalues[i] -def test_cum_sum_numpy_group_on_nan(pd_es): +def test_cum_sum_numpy_group_on_nan(es): class CumSumNumpy(TransformPrimitive): """Returns the cumulative sum after grouping""" @@ -269,8 +269,8 @@ def cum_sum(values): return cum_sum - log_value_feat = IdentityFeature(pd_es["log"].ww["value"]) - pd_es["log"]["product_id"] 
= ( + log_value_feat = IdentityFeature(es["log"].ww["value"]) + es["log"]["product_id"] = ( ["coke zero"] * 3 + ["car"] * 2 + ["toothpaste"] * 3 @@ -279,16 +279,16 @@ def cum_sum(values): + [np.nan] * 4 + ["coke_zero"] * 2 ) - pd_es["log"]["value"][16] = 10 + es["log"]["value"][16] = 10 cum_sum = Feature( log_value_feat, - groupby=IdentityFeature(pd_es["log"].ww["product_id"]), + groupby=IdentityFeature(es["log"].ww["product_id"]), primitive=CumSumNumpy, ) assert cum_sum.get_name() == "CUM_SUM(value) by product_id" features = [cum_sum] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(17), ) @@ -322,11 +322,11 @@ def cum_sum(values): assert v == cvalues[i] -def test_cum_handles_uses_full_dataframe(pd_es): +def test_cum_handles_uses_full_dataframe(es): def check(feature): feature_set = FeatureSet([feature]) calculator = FeatureSetCalculator( - pd_es, + es, feature_set=feature_set, time_last=None, ) @@ -339,31 +339,31 @@ def check(feature): for primitive in [CumSum, CumMean, CumMax, CumMin]: check( Feature( - pd_es["log"].ww["value"], - groupby=IdentityFeature(pd_es["log"].ww["session_id"]), + es["log"].ww["value"], + groupby=IdentityFeature(es["log"].ww["session_id"]), primitive=primitive, ), ) check( Feature( - pd_es["log"].ww["product_id"], - groupby=Feature(pd_es["log"].ww["product_id"]), + es["log"].ww["product_id"], + groupby=Feature(es["log"].ww["product_id"]), primitive=CumCount, ), ) -def test_cum_mean(pd_es): - log_value_feat = IdentityFeature(pd_es["log"].ww["value"]) +def test_cum_mean(es): + log_value_feat = IdentityFeature(es["log"].ww["value"]) cum_mean = Feature( log_value_feat, - groupby=IdentityFeature(pd_es["log"].ww["session_id"]), + groupby=IdentityFeature(es["log"].ww["session_id"]), primitive=CumMean, ) features = [cum_mean] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -374,15 +374,15 @@ def test_cum_mean(pd_es): assert v == cvalues[i] -def test_cum_count(pd_es): +def test_cum_count(es): cum_count = Feature( - IdentityFeature(pd_es["log"].ww["product_id"]), - groupby=IdentityFeature(pd_es["log"].ww["product_id"]), + IdentityFeature(es["log"].ww["product_id"]), + groupby=IdentityFeature(es["log"].ww["product_id"]), primitive=CumCount, ) features = [cum_count] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -393,10 +393,10 @@ def test_cum_count(pd_es): assert v == cvalues[i] -def test_rename(pd_es): +def test_rename(es): cum_count = Feature( - IdentityFeature(pd_es["log"].ww["product_id"]), - groupby=IdentityFeature(pd_es["log"].ww["product_id"]), + IdentityFeature(es["log"].ww["product_id"]), + groupby=IdentityFeature(es["log"].ww["product_id"]), primitive=CumCount, ) copy_feat = cum_count.rename("rename_test") @@ -411,15 +411,15 @@ def test_rename(pd_es): assert cum_count.dataframe_name == copy_feat.dataframe_name -def test_groupby_no_data(pd_es): +def test_groupby_no_data(es): cum_count = Feature( - IdentityFeature(pd_es["log"].ww["product_id"]), - groupby=IdentityFeature(pd_es["log"].ww["product_id"]), + IdentityFeature(es["log"].ww["product_id"]), + groupby=IdentityFeature(es["log"].ww["product_id"]), primitive=CumCount, ) last_feat = Feature(cum_count, parent_dataframe_name="customers", primitive=Last) df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=[last_feat], cutoff_time=pd.Timestamp("2011-04-08"), ) @@ -428,7 +428,7 @@ def 
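The cumulative-sum tests above all follow the same pattern: a numeric identity feature grouped by a discrete column. A self-contained sketch with an assumed toy log table (the column names and the Categorical logical type are illustrative choices, not the test fixture):

import pandas as pd

import featuretools as ft
from featuretools.primitives import CumSum

log = pd.DataFrame(
    {
        "id": range(6),
        "session_id": [0, 0, 0, 1, 1, 1],
        "value": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
    },
)

es = ft.EntitySet(id="example")
es.add_dataframe(
    dataframe_name="log",
    dataframe=log,
    index="id",
    logical_types={"session_id": "Categorical"},
)

cum_sum = ft.Feature(
    es["log"].ww["value"],
    groupby=ft.Feature(es["log"].ww["session_id"]),
    primitive=CumSum,
)
fm = ft.calculate_feature_matrix([cum_sum], entityset=es)
# Expected: the running sum restarts per session -> [0.0, 1.0, 3.0, 3.0, 7.0, 12.0]
print(fm[cum_sum.get_name()].tolist())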
test_groupby_no_data(pd_es): assert all([pd.isnull(value) for value in cvalues]) -def test_groupby_uses_calc_time(pd_es): +def test_groupby_uses_calc_time(es): def projected_amount_left(amount, timestamp, time=None): # cumulative sum of amount, with timedelta * constant subtracted delta = time - timestamp @@ -450,14 +450,14 @@ def get_function(self): time_since_product = GroupByTransformFeature( [ - IdentityFeature(pd_es["log"].ww["value"]), - IdentityFeature(pd_es["log"].ww["datetime"]), + IdentityFeature(es["log"].ww["value"]), + IdentityFeature(es["log"].ww["datetime"]), ], - groupby=IdentityFeature(pd_es["log"].ww["product_id"]), + groupby=IdentityFeature(es["log"].ww["product_id"]), primitive=ProjectedAmountRemaining, ) df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=[time_since_product], cutoff_time=pd.Timestamp("2011-04-10 11:10:30"), ) @@ -485,7 +485,7 @@ def get_function(self): assert (pd.isnull(x) and pd.isnull(y)) or x == y -def test_groupby_multi_output_stacking(pd_es): +def test_groupby_multi_output_stacking(es): class TestTime(TransformPrimitive): name = "test_time" input_types = [ColumnSchema(logical_type=Datetime)] @@ -493,7 +493,7 @@ class TestTime(TransformPrimitive): number_output_features = 6 fl = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="sessions", agg_primitives=["sum"], groupby_trans_primitives=[TestTime], @@ -507,9 +507,9 @@ class TestTime(TransformPrimitive): assert ("customers.SUM(log.TEST_TIME(datetime)[%d] by session_id)" % i) in fl -def test_serialization(pd_es): - value = IdentityFeature(pd_es["log"].ww["value"]) - zipcode = IdentityFeature(pd_es["log"].ww["zipcode"]) +def test_serialization(es): + value = IdentityFeature(es["log"].ww["value"]) + zipcode = IdentityFeature(es["log"].ww["zipcode"]) primitive = CumSum() groupby = feature_base.GroupByTransformFeature(value, primitive, zipcode) @@ -527,13 +527,13 @@ def test_serialization(pd_es): } assert groupby == feature_base.GroupByTransformFeature.from_dictionary( dictionary, - pd_es, + es, dependencies, primitive, ) -def test_groupby_with_multioutput_primitive(pd_es): +def test_groupby_with_multioutput_primitive(es): class MultiCumSum(TransformPrimitive): name = "multi_cum_sum" input_types = [ColumnSchema(semantic_tags={"numeric"})] @@ -547,7 +547,7 @@ def multi_cum_sum(x): return multi_cum_sum fm, _ = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", trans_primitives=[], agg_primitives=[], @@ -557,7 +557,7 @@ def multi_cum_sum(x): # Calculate output in a separate DFS call to make sure the multi-output code # does not alter any values fm2, _ = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", trans_primitives=[], agg_primitives=[], @@ -588,7 +588,7 @@ def multi_cum_sum(x): assert x == y -def test_groupby_with_multioutput_primitive_custom_names(pd_es): +def test_groupby_with_multioutput_primitive_custom_names(es): class MultiCumSum(TransformPrimitive): name = "multi_cum_sum" input_types = [ColumnSchema(semantic_tags={"numeric"})] @@ -605,7 +605,7 @@ def generate_names(primitive, base_feature_names): return ["CUSTOM_SUM", "CUSTOM_MAX", "CUSTOM_MIN"] fm, _ = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", trans_primitives=[], agg_primitives=[], diff --git a/featuretools/tests/primitive_tests/test_overrides.py b/featuretools/tests/primitive_tests/test_overrides.py index c848e8b325..b48949e1f0 100644 --- a/featuretools/tests/primitive_tests/test_overrides.py +++ 
b/featuretools/tests/primitive_tests/test_overrides.py @@ -29,7 +29,6 @@ SubtractNumericScalar, Sum, ) -from featuretools.tests.testing_utils import to_pandas def test_overrides(es): @@ -105,7 +104,6 @@ def test_override_boolean(es): features=features, instance_ids=[0, 1, 2], ) - df = to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].tolist() assert v == test @@ -188,14 +186,10 @@ def test_override_cmp_from_column(es): features = [count_lo] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=[0, 1, 2], - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=[0, 1, 2], ) v = df[count_lo.get_name()].tolist() for i, test in enumerate(to_test): @@ -252,7 +246,6 @@ def test_override_cmp(es): features=features, instance_ids=[0, 1, 2], ) - df = to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].tolist() assert v == test diff --git a/featuretools/tests/primitive_tests/test_primitive_utils.py b/featuretools/tests/primitive_tests/test_primitive_utils.py index 8db286a9ab..e093389934 100644 --- a/featuretools/tests/primitive_tests/test_primitive_utils.py +++ b/featuretools/tests/primitive_tests/test_primitive_utils.py @@ -46,7 +46,6 @@ list_primitive_files, load_primitive_from_file, ) -from featuretools.utils.gen_utils import Library def test_list_primitives_order(): @@ -60,7 +59,6 @@ def test_list_primitives_order(): actual_desc = _get_descriptions([primitive])[0] if actual_desc: assert actual_desc == row["description"] - assert row["dask_compatible"] == (Library.DASK in primitive.compatibility) assert row["valid_inputs"] == ", ".join( _get_unique_input_types(primitive.input_types), ) diff --git a/featuretools/tests/primitive_tests/test_rolling_primitive_utils.py b/featuretools/tests/primitive_tests/test_rolling_primitive_utils.py index 1b8c94b0c5..c6b95feef9 100644 --- a/featuretools/tests/primitive_tests/test_rolling_primitive_utils.py +++ b/featuretools/tests/primitive_tests/test_rolling_primitive_utils.py @@ -20,16 +20,16 @@ from featuretools.tests.primitive_tests.utils import get_number_from_offset -def test_get_rolled_series_without_gap(window_series_pd): +def test_get_rolled_series_without_gap(window_series): # Data is daily, so number of rows should be number of days not included in the gap - assert len(_get_rolled_series_without_gap(window_series_pd, "11D")) == 9 - assert len(_get_rolled_series_without_gap(window_series_pd, "0D")) == 20 - assert len(_get_rolled_series_without_gap(window_series_pd, "48H")) == 18 - assert len(_get_rolled_series_without_gap(window_series_pd, "4H")) == 19 + assert len(_get_rolled_series_without_gap(window_series, "11D")) == 9 + assert len(_get_rolled_series_without_gap(window_series, "0D")) == 20 + assert len(_get_rolled_series_without_gap(window_series, "48H")) == 18 + assert len(_get_rolled_series_without_gap(window_series, "4H")) == 19 -def test_get_rolled_series_without_gap_not_uniform(window_series_pd): - non_uniform_series = window_series_pd.iloc[[0, 2, 5, 6, 8, 9]] +def test_get_rolled_series_without_gap_not_uniform(window_series): + non_uniform_series = window_series.iloc[[0, 2, 5, 6, 8, 9]] assert len(_get_rolled_series_without_gap(non_uniform_series, "10D")) == 0 assert len(_get_rolled_series_without_gap(non_uniform_series, "0D")) == 6 @@ -39,18 +39,18 @@ def test_get_rolled_series_without_gap_not_uniform(window_series_pd): 
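Consistent with the test_primitive_utils.py hunk above, which drops the dask_compatible assertion from test_list_primitives_order, the sketch below assumes that column no longer appears in the list_primitives() output:

import featuretools as ft

primitives_df = ft.list_primitives()
print(primitives_df.columns.tolist())  # name, type, description, valid_inputs, ... (assumed)
assert "dask_compatible" not in primitives_df.columns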
assert len(_get_rolled_series_without_gap(non_uniform_series, "4D2H")) == 2 -def test_get_rolled_series_without_gap_empty_series(window_series_pd): +def test_get_rolled_series_without_gap_empty_series(window_series): empty_series = pd.Series([], dtype="object") assert len(_get_rolled_series_without_gap(empty_series, "1D")) == 0 assert len(_get_rolled_series_without_gap(empty_series, "0D")) == 0 -def test_get_rolled_series_without_gap_large_bound(window_series_pd): - assert len(_get_rolled_series_without_gap(window_series_pd, "100D")) == 0 +def test_get_rolled_series_without_gap_large_bound(window_series): + assert len(_get_rolled_series_without_gap(window_series, "100D")) == 0 assert ( len( _get_rolled_series_without_gap( - window_series_pd.iloc[[0, 2, 5, 6, 8, 9]], + window_series.iloc[[0, 2, 5, 6, 8, 9]], "20D", ), ) @@ -69,26 +69,26 @@ def test_get_rolled_series_without_gap_large_bound(window_series_pd): ("4d", "0d"), ], ) -def test_roll_series_with_gap(window_length, gap, window_series_pd): +def test_roll_series_with_gap(window_length, gap, window_series): rolling_max = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=1, ).max() rolling_min = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=1, ).min() - assert len(rolling_max) == len(window_series_pd) - assert len(rolling_min) == len(window_series_pd) + assert len(rolling_max) == len(window_series) + assert len(rolling_min) == len(window_series) gap_num = get_number_from_offset(gap) window_length_num = get_number_from_offset(window_length) - for i in range(len(window_series_pd)): + for i in range(len(window_series)): start_idx = i - gap_num - window_length_num + 1 if isinstance(gap, str): @@ -114,14 +114,14 @@ def test_roll_series_with_gap(window_length, gap, window_series_pd): @pytest.mark.parametrize("window_length", [3, "3d"]) -def test_roll_series_with_no_gap(window_length, window_series_pd): +def test_roll_series_with_no_gap(window_length, window_series): actual_rolling = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=0, min_periods=1, ).mean() - expected_rolling = window_series_pd.rolling(window_length, min_periods=1).mean() + expected_rolling = window_series.rolling(window_length, min_periods=1).mean() pd.testing.assert_series_equal(actual_rolling, expected_rolling) @@ -135,13 +135,13 @@ def test_roll_series_with_no_gap(window_length, window_series_pd): ("6d", "2d"), ], ) -def test_roll_series_with_gap_early_values(window_length, gap, window_series_pd): +def test_roll_series_with_gap_early_values(window_length, gap, window_series): gap_num = get_number_from_offset(gap) window_length_num = get_number_from_offset(window_length) # Default min periods is 1 - will include all default_partial_values = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=1, @@ -162,7 +162,7 @@ def test_roll_series_with_gap_early_values(window_length, gap, window_series_pd) # Make min periods the size of the window no_partial_values = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=window_length_num, @@ -184,13 +184,13 @@ def test_roll_series_with_gap_early_values(window_length, gap, window_series_pd) assert num_partial_aggregates == gap_num -def test_roll_series_with_gap_nullable_types(window_series_pd): +def test_roll_series_with_gap_nullable_types(window_series): window_length = 3 gap = 2 min_periods = 1 # Because we're inserting nans, 
confirm that nullability of the dtype doesn't have an impact on the results - nullable_series = window_series_pd.astype("Int64") - non_nullable_series = window_series_pd.astype("int64") + nullable_series = window_series.astype("Int64") + non_nullable_series = window_series.astype("int64") nullable_rolling_max = roll_series_with_gap( nullable_series, @@ -208,11 +208,11 @@ def test_roll_series_with_gap_nullable_types(window_series_pd): pd.testing.assert_series_equal(nullable_rolling_max, non_nullable_rolling_max) -def test_roll_series_with_gap_nullable_types_with_nans(window_series_pd): +def test_roll_series_with_gap_nullable_types_with_nans(window_series): window_length = 3 gap = 2 min_periods = 1 - nullable_floats = window_series_pd.astype("float64").replace( + nullable_floats = window_series.astype("float64").replace( {1: np.nan, 3: np.nan}, ) nullable_ints = nullable_floats.astype("Int64") @@ -236,9 +236,9 @@ def test_roll_series_with_gap_nullable_types_with_nans(window_series_pd): ) expected_early_values = [np.nan, np.nan, 0, 0, 2, 2, 4] + list( - range(7 - gap, len(window_series_pd) - gap), + range(7 - gap, len(window_series) - gap), ) - for i in range(len(window_series_pd)): + for i in range(len(window_series)): actual = nullable_floats_rolling_max.iloc[i] expected = expected_early_values[i] @@ -256,12 +256,12 @@ def test_roll_series_with_gap_nullable_types_with_nans(window_series_pd): ("4d", "0d"), ], ) -def test_apply_roll_with_offset_gap(window_length, gap, window_series_pd): +def test_apply_roll_with_offset_gap(window_length, gap, window_series): def max_wrapper(sub_s): return apply_roll_with_offset_gap(sub_s, gap, max, min_periods=1) rolling_max_obj = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=1, @@ -272,19 +272,19 @@ def min_wrapper(sub_s): return apply_roll_with_offset_gap(sub_s, gap, min, min_periods=1) rolling_min_obj = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=1, ) rolling_min_series = rolling_min_obj.apply(min_wrapper) - assert len(rolling_max_series) == len(window_series_pd) - assert len(rolling_min_series) == len(window_series_pd) + assert len(rolling_max_series) == len(window_series) + assert len(rolling_min_series) == len(window_series) gap_num = get_number_from_offset(gap) window_length_num = get_number_from_offset(window_length) - for i in range(len(window_series_pd)): + for i in range(len(window_series)): start_idx = i - gap_num - window_length_num + 1 # Now that we have the _apply call, this acts as expected end_idx = i - gap_num @@ -308,7 +308,7 @@ def min_wrapper(sub_s): "min_periods", [1, 0, None], ) -def test_apply_roll_with_offset_gap_default_min_periods(min_periods, window_series_pd): +def test_apply_roll_with_offset_gap_default_min_periods(min_periods, window_series): window_length = "5d" window_length_num = 5 gap = "3d" @@ -318,7 +318,7 @@ def count_wrapper(sub_s): return apply_roll_with_offset_gap(sub_s, gap, len, min_periods=min_periods) rolling_count_obj = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=min_periods, @@ -342,7 +342,7 @@ def count_wrapper(sub_s): "min_periods", [2, 3, 4, 5], ) -def test_apply_roll_with_offset_gap_min_periods(min_periods, window_series_pd): +def test_apply_roll_with_offset_gap_min_periods(min_periods, window_series): window_length = "5d" window_length_num = 5 gap = "3d" @@ -352,7 +352,7 @@ def count_wrapper(sub_s): return apply_roll_with_offset_gap(sub_s, gap, len, 
min_periods=min_periods) rolling_count_obj = roll_series_with_gap( - window_series_pd, + window_series, window_length, gap=gap, min_periods=min_periods, @@ -468,7 +468,7 @@ def max_wrapper(sub_s): assert rolling_max_series.isna().sum() == (min_periods - 1) + (gap_num * 24) -def test_apply_roll_with_offset_data_min_periods_too_big(window_series_pd): +def test_apply_roll_with_offset_data_min_periods_too_big(window_series): window_length = "5D" gap = "2d" @@ -479,7 +479,7 @@ def max_wrapper(sub_s): return apply_roll_with_offset_gap(sub_s, gap, max, min_periods=min_periods) rolling_max_obj = roll_series_with_gap( - window_series_pd, + window_series, window_length, min_periods=min_periods, gap=gap, @@ -487,11 +487,11 @@ def max_wrapper(sub_s): rolling_max_series = rolling_max_obj.apply(max_wrapper) # The resulting series is comprised entirely of nans - assert rolling_max_series.isna().sum() == len(window_series_pd) + assert rolling_max_series.isna().sum() == len(window_series) def test_roll_series_with_gap_different_input_types_same_result_uniform( - window_series_pd, + window_series, ): # Offset inputs will only produce the same results as numeric inputs # when the data has a uniform frequency @@ -503,7 +503,7 @@ def test_roll_series_with_gap_different_input_types_same_result_uniform( # Rolling series' with matching input types expected_rolling_numeric = roll_series_with_gap( - window_series_pd, + window_series, window_length=int_window_length, gap=int_gap, min_periods=min_periods, @@ -518,7 +518,7 @@ def count_wrapper(sub_s): ) rolling_count_obj = roll_series_with_gap( - window_series_pd, + window_series, window_length=offset_window_length, gap=offset_gap, min_periods=min_periods, @@ -530,7 +530,7 @@ def count_wrapper(sub_s): # Rolling series' with mismatched input types mismatched_numeric_gap = roll_series_with_gap( - window_series_pd, + window_series, window_length=offset_window_length, gap=int_gap, min_periods=min_periods, @@ -539,12 +539,12 @@ def count_wrapper(sub_s): pd.testing.assert_series_equal(expected_rolling_numeric, mismatched_numeric_gap) -def test_roll_series_with_gap_incorrect_types(window_series_pd): +def test_roll_series_with_gap_incorrect_types(window_series): error = "Window length must be either an offset string or an integer." with pytest.raises(TypeError, match=error): ( roll_series_with_gap( - window_series_pd, + window_series, window_length=4.2, gap=4, min_periods=1, @@ -553,24 +553,24 @@ def test_roll_series_with_gap_incorrect_types(window_series_pd): error = "Gap must be either an offset string or an integer." with pytest.raises(TypeError, match=error): - roll_series_with_gap(window_series_pd, window_length=4, gap=4.2, min_periods=1) + roll_series_with_gap(window_series, window_length=4, gap=4.2, min_periods=1) -def test_roll_series_with_gap_negative_inputs(window_series_pd): +def test_roll_series_with_gap_negative_inputs(window_series): error = "Window length must be greater than zero." with pytest.raises(ValueError, match=error): - roll_series_with_gap(window_series_pd, window_length=-4, gap=4, min_periods=1) + roll_series_with_gap(window_series, window_length=-4, gap=4, min_periods=1) error = "Gap must be greater than or equal to zero." 
with pytest.raises(ValueError, match=error): - roll_series_with_gap(window_series_pd, window_length=4, gap=-4, min_periods=1) + roll_series_with_gap(window_series, window_length=4, gap=-4, min_periods=1) -def test_roll_series_with_non_offset_string_inputs(window_series_pd): +def test_roll_series_with_non_offset_string_inputs(window_series): error = "Cannot roll series. The specified gap, test, is not a valid offset alias." with pytest.raises(ValueError, match=error): roll_series_with_gap( - window_series_pd, + window_series, window_length="4D", gap="test", min_periods=1, @@ -579,7 +579,7 @@ def test_roll_series_with_non_offset_string_inputs(window_series_pd): error = "Cannot roll series. The specified window length, test, is not a valid offset alias." with pytest.raises(ValueError, match=error): roll_series_with_gap( - window_series_pd, + window_series, window_length="test", gap="7D", min_periods=1, @@ -593,7 +593,7 @@ def test_roll_series_with_non_offset_string_inputs(window_series_pd): ) with pytest.raises(TypeError, match=error): roll_series_with_gap( - window_series_pd, + window_series, window_length=7, gap="2d", min_periods=1, @@ -610,19 +610,19 @@ def test_roll_series_with_non_offset_string_inputs(window_series_pd): def test_no_call_to_apply_roll_with_offset_gap_with_numeric( mock_apply_roll, primitive, - window_series_pd, + window_series, ): assert not mock_apply_roll.called fully_numeric_primitive = primitive(window_length=3, gap=1) primitive_func = fully_numeric_primitive.get_function() if isinstance(fully_numeric_primitive, RollingCount): - pd.Series(primitive_func(window_series_pd.index)) + pd.Series(primitive_func(window_series.index)) else: pd.Series( primitive_func( - window_series_pd.index, - pd.Series(window_series_pd.values), + window_series.index, + pd.Series(window_series.values), ), ) @@ -631,12 +631,12 @@ def test_no_call_to_apply_roll_with_offset_gap_with_numeric( offset_window_primitive = primitive(window_length="3d", gap=1) primitive_func = offset_window_primitive.get_function() if isinstance(offset_window_primitive, RollingCount): - pd.Series(primitive_func(window_series_pd.index)) + pd.Series(primitive_func(window_series.index)) else: pd.Series( primitive_func( - window_series_pd.index, - pd.Series(window_series_pd.values), + window_series.index, + pd.Series(window_series.values), ), ) @@ -645,12 +645,12 @@ def test_no_call_to_apply_roll_with_offset_gap_with_numeric( no_gap_specified_primitive = primitive(window_length="3d") primitive_func = no_gap_specified_primitive.get_function() if isinstance(no_gap_specified_primitive, RollingCount): - pd.Series(primitive_func(window_series_pd.index)) + pd.Series(primitive_func(window_series.index)) else: pd.Series( primitive_func( - window_series_pd.index, - pd.Series(window_series_pd.values), + window_series.index, + pd.Series(window_series.values), ), ) @@ -659,12 +659,12 @@ def test_no_call_to_apply_roll_with_offset_gap_with_numeric( no_gap_specified_primitive = primitive(window_length="3d", gap="1d") primitive_func = no_gap_specified_primitive.get_function() if isinstance(no_gap_specified_primitive, RollingCount): - pd.Series(primitive_func(window_series_pd.index)) + pd.Series(primitive_func(window_series.index)) else: pd.Series( primitive_func( - window_series_pd.index, - pd.Series(window_series_pd.values), + window_series.index, + pd.Series(window_series.values), ), ) diff --git a/featuretools/tests/primitive_tests/test_transform_features.py b/featuretools/tests/primitive_tests/test_transform_features.py index 
478779bec9..ad6eb2867d 100644 --- a/featuretools/tests/primitive_tests/test_transform_features.py +++ b/featuretools/tests/primitive_tests/test_transform_features.py @@ -62,7 +62,6 @@ LessThanEqualToScalar, LessThanScalar, Longitude, - Min, Mode, MultiplyBoolean, MultiplyNumeric, @@ -84,11 +83,6 @@ get_transform_primitives, ) from featuretools.synthesis.deep_feature_synthesis import match -from featuretools.tests.testing_utils import to_pandas -from featuretools.utils.gen_utils import Library, import_or_none -from featuretools.utils.spark_utils import pd_to_spark_clean - -dd = import_or_none("dask.dataframe") def test_init_and_name(es): @@ -112,15 +106,6 @@ def test_init_and_name(es): assert getattr(attr, "name") is not None trans_primitives = get_transform_primitives().values() - # If Dask EntitySet use only Dask compatible primitives - if es.dataframe_type == Library.DASK: - trans_primitives = [ - prim for prim in trans_primitives if Library.DASK in prim.compatibility - ] - if es.dataframe_type == Library.SPARK: - trans_primitives = [ - prim for prim in trans_primitives if Library.SPARK in prim.compatibility - ] for transform_prim in trans_primitives: # skip automated testing if a few special cases @@ -182,13 +167,13 @@ def test_make_trans_feat(es): feature_set = FeatureSet([f]) calculator = FeatureSetCalculator(es, feature_set=feature_set) - df = to_pandas(calculator.run(np.array([0]))) + df = calculator.run(np.array([0])) v = df[f.get_name()][0] assert v == 10 @pytest.fixture -def pd_simple_es(): +def simple_es(): df = pd.DataFrame( { "id": range(4), @@ -212,72 +197,6 @@ def pd_simple_es(): return es -@pytest.fixture -def dd_simple_es(pd_simple_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - dask = pytest.importorskip("dask", reason="Dask not installed, skipping") - dask.config.set({"dataframe.convert-string": False}) - dataframes = {} - for df in pd_simple_es.dataframes: - dataframes[df.ww.name] = ( - dd.from_pandas(df.reset_index(drop=True), npartitions=4), - df.ww.index, - None, - df.ww.logical_types, - ) - - relationships = [ - ( - rel.parent_name, - rel._parent_column_name, - rel.child_name, - rel._child_column_name, - ) - for rel in pd_simple_es.relationships - ] - - return EntitySet( - id=pd_simple_es.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture -def spark_simple_es(pd_simple_es): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - dataframes = {} - for df in pd_simple_es.dataframes: - cleaned_df = pd_to_spark_clean(df).reset_index(drop=True) - dataframes[df.ww.name] = ( - ps.from_pandas(cleaned_df), - df.ww.index, - None, - df.ww.logical_types, - ) - - relationships = [ - ( - rel.parent_name, - rel._parent_column_name, - rel.child_name, - rel._child_column_name, - ) - for rel in pd_simple_es.relationships - ] - - return EntitySet( - id=pd_simple_es.id, - dataframes=dataframes, - relationships=relationships, - ) - - -@pytest.fixture(params=["pd_simple_es", "dd_simple_es", "spark_simple_es"]) -def simple_es(request): - return request.getfixturevalue(request.param) - - def test_equal_categorical(simple_es): f1 = Feature( [ @@ -288,12 +207,10 @@ def test_equal_categorical(simple_es): ) df = calculate_feature_matrix(entityset=simple_es, features=[f1]) - if simple_es.dataframe_type != Library.SPARK: - # Spark does not support categorical dtype - assert set(simple_es["values"]["value"].cat.categories) != set( - simple_es["values"]["value2"].cat.categories, - ) - assert 
to_pandas(df, index="id", sort_index=True)["value = value2"].to_list() == [ + assert set(simple_es["values"]["value"].cat.categories) != set( + simple_es["values"]["value2"].cat.categories, + ) + assert df["value = value2"].to_list() == [ True, False, False, @@ -320,12 +237,8 @@ def test_equal_different_dtypes(simple_es): # verify that equals works for different dtypes regardless of order df = calculate_feature_matrix(entityset=simple_es, features=[f1, f2]) - assert to_pandas(df, index="id", sort_index=True)[ - "object = datetime" - ].to_list() == [False, False, False, False] - assert to_pandas(df, index="id", sort_index=True)[ - "datetime = object" - ].to_list() == [False, False, False, False] + assert df["object = datetime"].to_list() == [False, False, False, False] + assert df["datetime = object"].to_list() == [False, False, False, False] def test_not_equal_categorical(simple_es): @@ -339,12 +252,10 @@ def test_not_equal_categorical(simple_es): df = calculate_feature_matrix(entityset=simple_es, features=[f1]) - if simple_es.dataframe_type != Library.SPARK: - # Spark does not support categorical dtype - assert set(simple_es["values"]["value"].cat.categories) != set( - simple_es["values"]["value2"].cat.categories, - ) - assert to_pandas(df, index="id", sort_index=True)["value != value2"].to_list() == [ + assert set(simple_es["values"]["value"].cat.categories) != set( + simple_es["values"]["value2"].cat.categories, + ) + assert df["value != value2"].to_list() == [ False, True, True, @@ -371,26 +282,22 @@ def test_not_equal_different_dtypes(simple_es): # verify that equals works for different dtypes regardless of order df = calculate_feature_matrix(entityset=simple_es, features=[f1, f2]) - assert to_pandas(df, index="id", sort_index=True)[ - "object != datetime" - ].to_list() == [True, True, True, True] - assert to_pandas(df, index="id", sort_index=True)[ - "datetime != object" - ].to_list() == [True, True, True, True] + assert df["object != datetime"].to_list() == [True, True, True, True] + assert df["datetime != object"].to_list() == [True, True, True, True] -def test_diff(pd_es): - value = Feature(pd_es["log"].ww["value"]) - customer_id_feat = Feature(pd_es["sessions"].ww["customer_id"], "log") +def test_diff(es): + value = Feature(es["log"].ww["value"]) + customer_id_feat = Feature(es["sessions"].ww["customer_id"], "log") diff1 = Feature( value, - groupby=Feature(pd_es["log"].ww["session_id"]), + groupby=Feature(es["log"].ww["session_id"]), primitive=Diff, ) diff2 = Feature(value, groupby=customer_id_feat, primitive=Diff) feature_set = FeatureSet([diff1, diff2]) - calculator = FeatureSetCalculator(pd_es, feature_set=feature_set) + calculator = FeatureSetCalculator(es, feature_set=feature_set) df = calculator.run(np.array(range(15))) val1 = df[diff1.get_name()].tolist() @@ -418,13 +325,13 @@ def test_diff(pd_es): np.testing.assert_equal(val2, correct_vals2) -def test_diff_shift(pd_es): - value = Feature(pd_es["log"].ww["value"]) - customer_id_feat = Feature(pd_es["sessions"].ww["customer_id"], "log") +def test_diff_shift(es): + value = Feature(es["log"].ww["value"]) + customer_id_feat = Feature(es["sessions"].ww["customer_id"], "log") diff_periods = Feature(value, groupby=customer_id_feat, primitive=Diff(periods=1)) feature_set = FeatureSet([diff_periods]) - calculator = FeatureSetCalculator(pd_es, feature_set=feature_set) + calculator = FeatureSetCalculator(es, feature_set=feature_set) df = calculator.run(np.array(range(15))) val3 = df[diff_periods.get_name()].tolist() @@ -432,52 
+339,52 @@ def test_diff_shift(pd_es): np.testing.assert_equal(val3, correct_vals3) -def test_diff_single_value(pd_es): +def test_diff_single_value(es): diff = Feature( - pd_es["stores"].ww["num_square_feet"], - groupby=Feature(pd_es["stores"].ww["région_id"]), + es["stores"].ww["num_square_feet"], + groupby=Feature(es["stores"].ww["région_id"]), primitive=Diff, ) feature_set = FeatureSet([diff]) - calculator = FeatureSetCalculator(pd_es, feature_set=feature_set) + calculator = FeatureSetCalculator(es, feature_set=feature_set) df = calculator.run(np.array([4])) assert df[diff.get_name()][4] == 6000.0 -def test_diff_reordered(pd_es): +def test_diff_reordered(es): sum_feat = Feature( - pd_es["log"].ww["value"], + es["log"].ww["value"], parent_dataframe_name="sessions", primitive=Sum, ) diff = Feature(sum_feat, primitive=Diff) feature_set = FeatureSet([diff]) - calculator = FeatureSetCalculator(pd_es, feature_set=feature_set) + calculator = FeatureSetCalculator(es, feature_set=feature_set) df = calculator.run(np.array([4, 2])) assert df[diff.get_name()][4] == 16 assert df[diff.get_name()][2] == -6 -def test_diff_single_value_is_nan(pd_es): +def test_diff_single_value_is_nan(es): diff = Feature( - pd_es["stores"].ww["num_square_feet"], - groupby=Feature(pd_es["stores"].ww["région_id"]), + es["stores"].ww["num_square_feet"], + groupby=Feature(es["stores"].ww["région_id"]), primitive=Diff, ) feature_set = FeatureSet([diff]) - calculator = FeatureSetCalculator(pd_es, feature_set=feature_set) + calculator = FeatureSetCalculator(es, feature_set=feature_set) df = calculator.run(np.array([5])) assert df.shape[0] == 1 assert df[diff.get_name()].dropna().shape[0] == 0 -def test_diff_datetime(pd_es): +def test_diff_datetime(es): diff = Feature( - pd_es["log"].ww["datetime"], + es["log"].ww["datetime"], primitive=DiffDatetime, ) feature_set = FeatureSet([diff]) - calculator = FeatureSetCalculator(pd_es, feature_set=feature_set) + calculator = FeatureSetCalculator(es, feature_set=feature_set) df = calculator.run(np.array(range(15))) vals = pd.Series(df[diff.get_name()].tolist()) expected_vals = pd.Series( @@ -502,13 +409,13 @@ def test_diff_datetime(pd_es): pd.testing.assert_series_equal(vals, expected_vals) -def test_diff_datetime_shift(pd_es): +def test_diff_datetime_shift(es): diff = Feature( - pd_es["log"].ww["datetime"], + es["log"].ww["datetime"], primitive=DiffDatetime(periods=1), ) feature_set = FeatureSet([diff]) - calculator = FeatureSetCalculator(pd_es, feature_set=feature_set) + calculator = FeatureSetCalculator(es, feature_set=feature_set) df = calculator.run(np.array(range(6))) vals = pd.Series(df[diff.get_name()].tolist()) expected_vals = pd.Series( @@ -538,14 +445,10 @@ def test_compare_of_identity(es): for test in to_test: features.append(Feature(es["log"].ww["value"], primitive=test[0](10))) - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=[0, 1, 2, 3], - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=[0, 1, 2, 3], ) for i, test in enumerate(to_test): @@ -573,7 +476,6 @@ def test_compare_of_direct(es): features=features, instance_ids=[0, 1, 2, 3], ) - df = to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].tolist() @@ -592,7 +494,6 @@ def test_compare_of_transform(es): features.append(Feature(day, primitive=test[0](10))) df = calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 14]) - df 
= to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].tolist() @@ -624,7 +525,6 @@ def test_compare_of_agg(es): features=features, instance_ids=[0, 1, 2, 3], ) - df = to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].tolist() @@ -632,20 +532,12 @@ def test_compare_of_agg(es): def test_compare_all_nans(es): - if es.dataframe_type != Library.PANDAS: - nan_feat = Feature( - es["log"].ww["value"], - parent_dataframe_name="sessions", - primitive=Min, - ) - compare = nan_feat == 0.0 - else: - nan_feat = Feature( - es["log"].ww["product_id"], - parent_dataframe_name="sessions", - primitive=Mode, - ) - compare = nan_feat == "brown bag" + nan_feat = Feature( + es["log"].ww["product_id"], + parent_dataframe_name="sessions", + primitive=Mode, + ) + compare = nan_feat == "brown bag" # before all data time_last = pd.Timestamp("1/1/1993") @@ -656,7 +548,6 @@ def test_compare_all_nans(es): instance_ids=[0, 1, 2], cutoff_time=time_last, ) - df = to_pandas(df, index="id", sort_index=True) assert df[nan_feat.get_name()].dropna().shape[0] == 0 assert not df[compare.get_name()].any() @@ -683,7 +574,6 @@ def test_arithmetic_of_val(es): features=features, instance_ids=[0, 1, 2, 3], ) - df = to_pandas(df, index="id", sort_index=True) for f, test in zip(features, to_test): v = df[f.get_name()].tolist() @@ -708,9 +598,6 @@ def test_arithmetic_of_identity(es): (MultiplyNumeric, [0, 10, 40, 90]), (DivideNumeric, [np.nan, 2.5, 2.5, 2.5]), ] - # SubtractNumeric not supported for Spark EntitySets - if es.dataframe_type == Library.SPARK: - to_test = to_test[:1] + to_test[2:] features = [] for test in to_test: @@ -729,7 +616,6 @@ def test_arithmetic_of_identity(es): features=features, instance_ids=[0, 1, 2, 3], ) - df = to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test[:-1]): v = df[features[i].get_name()].tolist() @@ -753,8 +639,6 @@ def test_arithmetic_of_direct(es): (MultiplyNumeric, [165, 132, 148.5, 148.5]), (DivideNumeric, [6.6, 8.25, 22.0 / 3, 22.0 / 3]), ] - if es.dataframe_type == Library.SPARK: - to_test = to_test[:1] + to_test[2:] features = [] for test in to_test: @@ -765,21 +649,14 @@ def test_arithmetic_of_direct(es): features=features, instance_ids=[0, 3, 5, 7], ) - df = to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].tolist() assert v == test[1] -# Spark EntitySets do not support boolean multiplication -@pytest.fixture(params=["pd_boolean_mult_es", "dask_boolean_mult_es"]) -def boolean_mult_es(request): - return request.getfixturevalue(request.param) - - @pytest.fixture -def pd_boolean_mult_es(): +def boolean_mult_es(): es = EntitySet() df = pd.DataFrame( { @@ -799,21 +676,6 @@ def pd_boolean_mult_es(): return es -@pytest.fixture -def dask_boolean_mult_es(pd_boolean_mult_es): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - dataframes = {} - for df in pd_boolean_mult_es.dataframes: - dataframes[df.ww.name] = ( - dd.from_pandas(df, npartitions=2), - df.ww.index, - None, - df.ww.logical_types, - ) - - return EntitySet(id=pd_boolean_mult_es.id, dataframes=dataframes) - - def test_boolean_multiply(boolean_mult_es): es = boolean_mult_es to_test = [ @@ -826,9 +688,9 @@ def test_boolean_multiply(boolean_mult_es): for row in to_test: features.append(Feature(es["test"].ww[row[0]]) * Feature(es["test"].ww[row[1]])) - fm = to_pandas(calculate_feature_matrix(entityset=es, 
features=features)) + fm = calculate_feature_matrix(entityset=es, features=features) - df = to_pandas(es["test"]) + df = es["test"] for row in to_test: col_name = "{} * {}".format(row[0], row[1]) @@ -838,10 +700,7 @@ def test_boolean_multiply(boolean_mult_es): assert fm[col_name].equals(df[row[0]] * df[row[1]]) -# TODO: rework test to be Dask and Spark compatible def test_arithmetic_of_transform(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Test uses Diff which is not supported in Dask or Spark") diff1 = Feature([Feature(es["log"].ww["value"])], primitive=Diff) diff2 = Feature([Feature(es["log"].ww["value_2"])], primitive=Diff) @@ -869,9 +728,7 @@ def test_arithmetic_of_transform(es): def test_not_feature(es): not_feat = Feature(es["customers"].ww["loves_ice_cream"], primitive=Not) features = [not_feat] - df = to_pandas( - calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1]), - ) + df = calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1]) v = df[not_feat.get_name()].values assert not v[0] assert v[1] @@ -896,9 +753,6 @@ def test_arithmetic_of_agg(es): (MultiplyNumeric, [9, 0]), (DivideNumeric, [1, 0]), ] - # Skip SubtractNumeric for Spark as it's unsupported - if es.dataframe_type == Library.SPARK: - to_test = to_test[:1] + to_test[2:] features = [] for test in to_test: @@ -906,7 +760,6 @@ def test_arithmetic_of_agg(es): ids = ["United States", "Mexico"] df = calculate_feature_matrix(entityset=es, features=features, instance_ids=ids) - df = to_pandas(df, index="id", sort_index=True) df = df.loc[ids] for i, test in enumerate(to_test): @@ -914,13 +767,13 @@ def test_arithmetic_of_agg(es): assert v == test[1] -def test_latlong(pd_es): - log_latlong_feat = Feature(pd_es["log"].ww["latlong"]) +def test_latlong(es): + log_latlong_feat = Feature(es["log"].ww["latlong"]) latitude = Feature(log_latlong_feat, primitive=Latitude) longitude = Feature(log_latlong_feat, primitive=Longitude) features = [latitude, longitude] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -942,18 +795,18 @@ def test_latlong(pd_es): assert v == lonvalues[i] -def test_latlong_with_nan(pd_es): - df = pd_es["log"] +def test_latlong_with_nan(es): + df = es["log"] df["latlong"][0] = np.nan df["latlong"][1] = (10, np.nan) df["latlong"][2] = (np.nan, 4) df["latlong"][3] = (np.nan, np.nan) - pd_es.replace_dataframe(dataframe_name="log", df=df) - log_latlong_feat = Feature(pd_es["log"].ww["latlong"]) + es.replace_dataframe(dataframe_name="log", df=df) + log_latlong_feat = Feature(es["log"].ww["latlong"]) latitude = Feature(log_latlong_feat, primitive=Latitude) longitude = Feature(log_latlong_feat, primitive=Longitude) features = [latitude, longitude] - fm = calculate_feature_matrix(entityset=pd_es, features=features) + fm = calculate_feature_matrix(entityset=es, features=features) latvalues = fm[latitude.get_name()].values lonvalues = fm[longitude.get_name()].values assert len(latvalues) == 17 @@ -1000,14 +853,14 @@ def test_latlong_with_nan(pd_es): assert np.allclose(lonvalues, real_lons, atol=0.0001, equal_nan=True) -def test_haversine(pd_es): - log_latlong_feat = Feature(pd_es["log"].ww["latlong"]) - log_latlong_feat2 = Feature(pd_es["log"].ww["latlong2"]) +def test_haversine(es): + log_latlong_feat = Feature(es["log"].ww["latlong"]) + log_latlong_feat2 = Feature(es["log"].ww["latlong2"]) haversine = Feature([log_latlong_feat, log_latlong_feat2], primitive=Haversine) features = [haversine] df = 
calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -1038,7 +891,7 @@ def test_haversine(pd_es): ) features = [haversine] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -1067,18 +920,18 @@ def test_haversine(pd_es): Haversine(unit="inches") -def test_haversine_with_nan(pd_es): +def test_haversine_with_nan(es): # Check some `nan` values - df = pd_es["log"] + df = es["log"] df["latlong"][0] = np.nan df["latlong"][1] = (10, np.nan) - pd_es.replace_dataframe(dataframe_name="log", df=df) - log_latlong_feat = Feature(pd_es["log"].ww["latlong"]) - log_latlong_feat2 = Feature(pd_es["log"].ww["latlong2"]) + es.replace_dataframe(dataframe_name="log", df=df) + log_latlong_feat = Feature(es["log"].ww["latlong"]) + log_latlong_feat2 = Feature(es["log"].ww["latlong2"]) haversine = Feature([log_latlong_feat, log_latlong_feat2], primitive=Haversine) features = [haversine] - df = calculate_feature_matrix(entityset=pd_es, features=features) + df = calculate_feature_matrix(entityset=es, features=features) values = df[haversine.get_name()].values real = [ np.nan, @@ -1103,17 +956,17 @@ def test_haversine_with_nan(pd_es): assert np.allclose(values, real, atol=0.0001, equal_nan=True) # Check all `nan` values - df = pd_es["log"] + df = es["log"] df["latlong2"] = np.nan - pd_es.replace_dataframe(dataframe_name="log", df=df) - log_latlong_feat = Feature(pd_es["log"].ww["latlong"]) - log_latlong_feat2 = Feature(pd_es["log"].ww["latlong2"]) + es.replace_dataframe(dataframe_name="log", df=df) + log_latlong_feat = Feature(es["log"].ww["latlong"]) + log_latlong_feat2 = Feature(es["log"].ww["latlong2"]) haversine = Feature([log_latlong_feat, log_latlong_feat2], primitive=Haversine) features = [haversine] - df = calculate_feature_matrix(entityset=pd_es, features=features) + df = calculate_feature_matrix(entityset=es, features=features) values = df[haversine.get_name()].values - real = [np.nan] * pd_es["log"].shape[0] + real = [np.nan] * es["log"].shape[0] assert np.allclose(values, real, atol=0.0001, equal_nan=True) @@ -1124,14 +977,10 @@ def test_text_primitives(es): features = [words, chars] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=range(15), - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=range(15), ) word_counts = [532, 3, 3, 653, 1306, 1305, 174, 173, 79, 246, 1253, 3, 3, 3, 3] @@ -1167,14 +1016,10 @@ def test_isin_feat(es): primitive=IsIn(list_of_outputs=["toothpaste", "coke zero"]), ) features = [isin] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=range(8), - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=range(8), ) true = [True, True, True, False, False, True, True, True] v = df[isin.get_name()].tolist() @@ -1184,14 +1029,10 @@ def test_isin_feat(es): def test_isin_feat_other_syntax(es): isin = Feature(es["log"].ww["product_id"]).isin(["toothpaste", "coke zero"]) features = [isin] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=range(8), - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=range(8), ) true = [True, True, True, False, False, True, True, True] v = df[isin.get_name()].tolist() @@ -1201,14 +1042,10 @@ def 
test_isin_feat_other_syntax(es): def test_isin_feat_other_syntax_int(es): isin = Feature(es["log"].ww["value"]).isin([5, 10]) features = [isin] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=range(8), - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=range(8), ) true = [False, True, True, False, False, False, False, False] v = df[isin.get_name()].tolist() @@ -1235,14 +1072,10 @@ def pd_is_in(array): primitive=CustomIsIn(list_of_outputs=["toothpaste", "coke zero"]), ) features = [isin] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=range(8), - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=range(8), ) true = [True, True, True, False, False, True, True, True] v = df[isin.get_name()].tolist() @@ -1250,14 +1083,10 @@ def pd_is_in(array): isin = Feature(es["log"].ww["product_id"]).isin(["toothpaste", "coke zero"]) features = [isin] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=range(8), - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=range(8), ) true = [True, True, True, False, False, True, True, True] v = df[isin.get_name()].tolist() @@ -1265,31 +1094,27 @@ def pd_is_in(array): isin = Feature(es["log"].ww["value"]).isin([5, 10]) features = [isin] - df = to_pandas( - calculate_feature_matrix( - entityset=es, - features=features, - instance_ids=range(8), - ), - index="id", - sort_index=True, + df = calculate_feature_matrix( + entityset=es, + features=features, + instance_ids=range(8), ) true = [False, True, True, False, False, False, False, False] v = df[isin.get_name()].tolist() assert true == v -def test_isnull_feat(pd_es): - value = Feature(pd_es["log"].ww["value"]) +def test_isnull_feat(es): + value = Feature(es["log"].ww["value"]) diff = Feature( value, - groupby=Feature(pd_es["log"].ww["session_id"]), + groupby=Feature(es["log"].ww["session_id"]), primitive=Diff, ) isnull = Feature(diff, primitive=IsNull) features = [isnull] df = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=features, instance_ids=range(15), ) @@ -1315,55 +1140,55 @@ def test_isnull_feat(pd_es): assert correct_vals == values -def test_percentile(pd_es): - v = Feature(pd_es["log"].ww["value"]) +def test_percentile(es): + v = Feature(es["log"].ww["value"]) p = Feature(v, primitive=Percentile) feature_set = FeatureSet([p]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array(range(10, 17))) - true = pd_es["log"][v.get_name()].rank(pct=True) + true = es["log"][v.get_name()].rank(pct=True) true = true.loc[range(10, 17)] for t, a in zip(true.values, df[p.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a -def test_dependent_percentile(pd_es): - v = Feature(pd_es["log"].ww["value"]) +def test_dependent_percentile(es): + v = Feature(es["log"].ww["value"]) p = Feature(v, primitive=Percentile) p2 = Feature(p - 1, primitive=Percentile) feature_set = FeatureSet([p, p2]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array(range(10, 17))) - true = pd_es["log"][v.get_name()].rank(pct=True) + true = es["log"][v.get_name()].rank(pct=True) true = 
true.loc[range(10, 17)] for t, a in zip(true.values, df[p.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a -def test_agg_percentile(pd_es): - v = Feature(pd_es["log"].ww["value"]) +def test_agg_percentile(es): + v = Feature(es["log"].ww["value"]) p = Feature(v, primitive=Percentile) agg = Feature(p, parent_dataframe_name="sessions", primitive=Sum) feature_set = FeatureSet([agg]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array([0, 1])) - log_vals = pd_es["log"][[v.get_name(), "session_id"]] + log_vals = es["log"][[v.get_name(), "session_id"]] log_vals["percentile"] = log_vals[v.get_name()].rank(pct=True) true_p = log_vals.groupby("session_id")["percentile"].sum()[[0, 1]] for t, a in zip(true_p.values, df[agg.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a -def test_percentile_agg_percentile(pd_es): - v = Feature(pd_es["log"].ww["value"]) +def test_percentile_agg_percentile(es): + v = Feature(es["log"].ww["value"]) p = Feature(v, primitive=Percentile) agg = Feature(p, parent_dataframe_name="sessions", primitive=Sum) pagg = Feature(agg, primitive=Percentile) feature_set = FeatureSet([pagg]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array([0, 1])) - log_vals = pd_es["log"][[v.get_name(), "session_id"]] + log_vals = es["log"][[v.get_name(), "session_id"]] log_vals["percentile"] = log_vals[v.get_name()].rank(pct=True) true_p = log_vals.groupby("session_id")["percentile"].sum().fillna(0) true_p = true_p.rank(pct=True)[[0, 1]] @@ -1372,15 +1197,15 @@ def test_percentile_agg_percentile(pd_es): assert (pd.isnull(t) and pd.isnull(a)) or t == a -def test_percentile_agg(pd_es): - v = Feature(pd_es["log"].ww["value"]) +def test_percentile_agg(es): + v = Feature(es["log"].ww["value"]) agg = Feature(v, parent_dataframe_name="sessions", primitive=Sum) pagg = Feature(agg, primitive=Percentile) feature_set = FeatureSet([pagg]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array([0, 1])) - log_vals = pd_es["log"][[v.get_name(), "session_id"]] + log_vals = es["log"][[v.get_name(), "session_id"]] true_p = log_vals.groupby("session_id")[v.get_name()].sum().fillna(0) true_p = true_p.rank(pct=True)[[0, 1]] @@ -1388,31 +1213,31 @@ def test_percentile_agg(pd_es): assert (pd.isnull(t) and pd.isnull(a)) or t == a -def test_direct_percentile(pd_es): - v = Feature(pd_es["customers"].ww["age"]) +def test_direct_percentile(es): + v = Feature(es["customers"].ww["age"]) p = Feature(v, primitive=Percentile) d = Feature(p, "sessions") feature_set = FeatureSet([d]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array([0, 1])) - cust_vals = pd_es["customers"][[v.get_name()]] + cust_vals = es["customers"][[v.get_name()]] cust_vals["percentile"] = cust_vals[v.get_name()].rank(pct=True) true_p = cust_vals["percentile"].loc[[0, 0]] for t, a in zip(true_p.values, df[d.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a -def test_direct_agg_percentile(pd_es): - v = Feature(pd_es["log"].ww["value"]) +def test_direct_agg_percentile(es): + v = Feature(es["log"].ww["value"]) p = Feature(v, primitive=Percentile) agg = Feature(p, parent_dataframe_name="customers", primitive=Sum) d = Feature(agg, "sessions") feature_set = 
FeatureSet([d]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array([0, 1])) - log_vals = pd_es["log"][[v.get_name(), "session_id"]] + log_vals = es["log"][[v.get_name(), "session_id"]] log_vals["percentile"] = log_vals[v.get_name()].rank(pct=True) log_vals["customer_id"] = [0] * 10 + [1] * 5 + [2] * 2 true_p = log_vals.groupby("customer_id")["percentile"].sum().fillna(0) @@ -1421,12 +1246,12 @@ def test_direct_agg_percentile(pd_es): assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3) -def test_percentile_with_cutoff(pd_es): - v = Feature(pd_es["log"].ww["value"]) +def test_percentile_with_cutoff(es): + v = Feature(es["log"].ww["value"]) p = Feature(v, primitive=Percentile) feature_set = FeatureSet([p]) calculator = FeatureSetCalculator( - pd_es, + es, feature_set, pd.Timestamp("2011/04/09 10:30:13"), ) @@ -1434,9 +1259,9 @@ def test_percentile_with_cutoff(pd_es): assert df[p.get_name()].tolist()[0] == 1.0 -def test_two_kinds_of_dependents(pd_es): - v = Feature(pd_es["log"].ww["value"]) - product = Feature(pd_es["log"].ww["product_id"]) +def test_two_kinds_of_dependents(es): + v = Feature(es["log"].ww["value"]) + product = Feature(es["log"].ww["product_id"]) agg = Feature( v, parent_dataframe_name="customers", @@ -1453,7 +1278,7 @@ def test_two_kinds_of_dependents(pd_es): ) agg3 = Feature(agg2, parent_dataframe_name="customers", primitive=Sum) feature_set = FeatureSet([p, g, agg3]) - calculator = FeatureSetCalculator(pd_es, feature_set) + calculator = FeatureSetCalculator(es, feature_set) df = calculator.run(np.array([0, 1])) assert df[p.get_name()].tolist() == [2.0 / 3, 1.0] assert df[g.get_name()].tolist() == [15, 26] @@ -1466,7 +1291,6 @@ class Mod4(TransformPrimitive): name = "mod4" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def get_function(self): filepath = self.get_filepath("featuretools_unit_test_example.csv") @@ -1484,7 +1308,6 @@ def _map(x): feat = Feature(es["log"].ww["value"], primitive=Mod4) df = calculate_feature_matrix(features=[feat], entityset=es, instance_ids=range(17)) - df = to_pandas(df, index="id") assert pd.isnull(df["MOD4(value)"][15]) assert df["MOD4(value)"][0] == 0 assert df["MOD4(value)"][14] == 2 @@ -1495,13 +1318,12 @@ def _map(x): agg_primitives=[], trans_primitives=[Mod4], ) - fm = to_pandas(fm, index="id") assert fm["MOD4(value)"][0] == 0 assert fm["MOD4(value)"][14] == 2 assert pd.isnull(fm["MOD4(value)"][15]) -def test_override_multi_feature_names(pd_es): +def test_override_multi_feature_names(es): def gen_custom_names(primitive, base_feature_names): return [ "Above18(%s)" % base_feature_names, @@ -1525,7 +1347,7 @@ def generate_names(primitive, base_feature_names): return gen_custom_names(primitive, base_feature_names) fm, features = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", instance_ids=[0, 1, 2], agg_primitives=[], @@ -1539,8 +1361,6 @@ def generate_names(primitive, base_feature_names): def test_time_since_primitive_matches_all_datetime_types(es): - if es.dataframe_type == Library.SPARK: - pytest.xfail("TimeSince transform primitive is incompatible with Spark") fm, fl = dfs( target_dataframe_name="customers", entityset=es, @@ -1560,9 +1380,9 @@ def test_time_since_primitive_matches_all_datetime_types(es): assert name in fm.columns -def 
test_cfm_with_numeric_lag_and_non_nullable_column(pd_es): +def test_cfm_with_numeric_lag_and_non_nullable_column(es): # fill nans so we can use non nullable numeric logical type in the EntitySet - new_log = pd_es["log"].copy() + new_log = es["log"].copy() new_log["value"] = new_log["value"].fillna(0) new_log.ww.init( logical_types={"value": "Integer", "product_id": "Categorical"}, @@ -1570,21 +1390,21 @@ def test_cfm_with_numeric_lag_and_non_nullable_column(pd_es): time_index="datetime", name="new_log", ) - pd_es.add_dataframe(new_log) + es.add_dataframe(new_log) rels = [ ("sessions", "id", "new_log", "session_id"), ("products", "id", "new_log", "product_id"), ] - pd_es = pd_es.add_relationships(rels) + es = es.add_relationships(rels) - assert isinstance(pd_es["new_log"].ww.logical_types["value"], Integer) + assert isinstance(es["new_log"].ww.logical_types["value"], Integer) periods = 5 lag_primitive = NumericLag(periods=periods) - cutoff_times = pd_es["new_log"][["id", "datetime"]] + cutoff_times = es["new_log"][["id", "datetime"]] fm, _ = dfs( target_dataframe_name="new_log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[lag_primitive], cutoff_time=cutoff_times, @@ -1603,9 +1423,9 @@ def test_cfm_with_numeric_lag_and_non_nullable_column(pd_es): ) -def test_cfm_with_lag_and_non_nullable_columns(pd_es): +def test_cfm_with_lag_and_non_nullable_columns(es): # fill nans so we can use non nullable numeric logical type in the EntitySet - new_log = pd_es["log"].copy() + new_log = es["log"].copy() new_log["value"] = new_log["value"].fillna(0) new_log["value_double"] = new_log["value"] new_log["purchased_with_nulls"] = new_log["purchased"] @@ -1622,21 +1442,21 @@ def test_cfm_with_lag_and_non_nullable_columns(pd_es): time_index="datetime", name="new_log", ) - pd_es.add_dataframe(new_log) + es.add_dataframe(new_log) rels = [ ("sessions", "id", "new_log", "session_id"), ("products", "id", "new_log", "product_id"), ] - pd_es = pd_es.add_relationships(rels) + es = es.add_relationships(rels) - assert isinstance(pd_es["new_log"].ww.logical_types["value"], Integer) + assert isinstance(es["new_log"].ww.logical_types["value"], Integer) periods = 5 lag_primitive = Lag(periods=periods) - cutoff_times = pd_es["new_log"][["id", "datetime"]] + cutoff_times = es["new_log"][["id", "datetime"]] fm, _ = dfs( target_dataframe_name="new_log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[lag_primitive], cutoff_time=cutoff_times, @@ -1695,20 +1515,20 @@ def test_cfm_with_lag_and_non_nullable_columns(pd_es): ) -def test_comparisons_with_ordinal_valid_inputs_that_dont_work_but_should(pd_es): +def test_comparisons_with_ordinal_valid_inputs_that_dont_work_but_should(es): # TODO: Remvoe this test once the correct behavior is implemented in CFM # The following test covers a scenario where an intermediate feature doesn't have the correct type # because Woodwork has not yet been initialized. This calculation should work and return valid True/False # values. This should be fixed in a future PR, but until a fix is implemented null values are returned to # prevent calculate_feature_matrix from raising an Error when calculating features generated by DFS. 
- priority_level = Feature(pd_es["log"].ww["priority_level"]) + priority_level = Feature(es["log"].ww["priority_level"]) first_priority = AggregationFeature( priority_level, parent_dataframe_name="customers", primitive=First, ) - engagement = Feature(pd_es["customers"].ww["engagement_level"]) + engagement = Feature(es["customers"].ww["engagement_level"]) invalid_but_should_be_valid = [ TransformFeature([engagement, first_priority], primitive=LessThan), TransformFeature([engagement, first_priority], primitive=LessThanEqualTo), @@ -1716,12 +1536,11 @@ def test_comparisons_with_ordinal_valid_inputs_that_dont_work_but_should(pd_es): TransformFeature([engagement, first_priority], primitive=GreaterThanEqualTo), ] fm = calculate_feature_matrix( - entityset=pd_es, + entityset=es, features=invalid_but_should_be_valid, ) feature_cols = [f.get_name() for f in invalid_but_should_be_valid] - fm = to_pandas(fm) for col in feature_cols: assert fm[col].isnull().all() diff --git a/featuretools/tests/primitive_tests/transform_primitive_tests/test_cumulative_time_since.py b/featuretools/tests/primitive_tests/transform_primitive_tests/test_cumulative_time_since.py index bcd6d221cc..f2194e89b4 100644 --- a/featuretools/tests/primitive_tests/transform_primitive_tests/test_cumulative_time_since.py +++ b/featuretools/tests/primitive_tests/transform_primitive_tests/test_cumulative_time_since.py @@ -82,11 +82,11 @@ def test_some_nans(self): given_answer = primitive_func(datetimes, booleans) assert given_answer.equals(answer) - def test_with_featuretools(self, pd_es): + def test_with_featuretools(self, es): transform, aggregation = find_applicable_primitives(self.primitive) primitive_instance = self.primitive() transform.append(primitive_instance) - valid_dfs(pd_es, aggregation, transform, self.primitive) + valid_dfs(es, aggregation, transform, self.primitive) class TestCumulativeTimeSinceLastFalse(PrimitiveTestBase): @@ -157,8 +157,8 @@ def test_some_nans(self): given_answer = primitive_func(datetimes, booleans) assert given_answer.equals(answer) - def test_with_featuretools(self, pd_es): + def test_with_featuretools(self, es): transform, aggregation = find_applicable_primitives(self.primitive) primitive_instance = self.primitive() transform.append(primitive_instance) - valid_dfs(pd_es, aggregation, transform, self.primitive) + valid_dfs(es, aggregation, transform, self.primitive) diff --git a/featuretools/tests/primitive_tests/transform_primitive_tests/test_expanding_primitives.py b/featuretools/tests/primitive_tests/transform_primitive_tests/test_expanding_primitives.py index 421735f310..9a65085e0a 100644 --- a/featuretools/tests/primitive_tests/transform_primitive_tests/test_expanding_primitives.py +++ b/featuretools/tests/primitive_tests/transform_primitive_tests/test_expanding_primitives.py @@ -24,13 +24,13 @@ (0, 0), ], ) -def test_expanding_count_series(window_series_pd, min_periods, gap): - test = window_series_pd.shift(gap) +def test_expanding_count_series(window_series, min_periods, gap): + test = window_series.shift(gap) expected = test.expanding(min_periods=min_periods).count() num_nans = gap + min_periods - 1 expected[range(num_nans)] = np.nan primitive_instance = ExpandingCount(min_periods=min_periods, gap=gap).get_function() - actual = primitive_instance(window_series_pd.index) + actual = primitive_instance(window_series.index) pd.testing.assert_series_equal(pd.Series(actual), expected) @@ -43,13 +43,13 @@ def test_expanding_count_series(window_series_pd, min_periods, gap): (0, 1), ], ) -def 
test_expanding_count_date_range(window_date_range_pd, min_periods, gap): - test = _apply_gap_for_expanding_primitives(gap=gap, x=window_date_range_pd) +def test_expanding_count_date_range(window_date_range, min_periods, gap): + test = _apply_gap_for_expanding_primitives(gap=gap, x=window_date_range) expected = test.expanding(min_periods=min_periods).count() num_nans = gap + min_periods - 1 expected[range(num_nans)] = np.nan primitive_instance = ExpandingCount(min_periods=min_periods, gap=gap).get_function() - actual = primitive_instance(window_date_range_pd) + actual = primitive_instance(window_date_range) pd.testing.assert_series_equal(pd.Series(actual), expected) @@ -62,13 +62,13 @@ def test_expanding_count_date_range(window_date_range_pd, min_periods, gap): (0, 1), ], ) -def test_expanding_min(window_series_pd, min_periods, gap): - test = window_series_pd.shift(gap) +def test_expanding_min(window_series, min_periods, gap): + test = window_series.shift(gap) expected = test.expanding(min_periods=min_periods).min().values primitive_instance = ExpandingMin(min_periods=min_periods, gap=gap).get_function() actual = primitive_instance( - numeric=window_series_pd, - datetime=window_series_pd.index, + numeric=window_series, + datetime=window_series.index, ) pd.testing.assert_series_equal(pd.Series(actual), pd.Series(expected)) @@ -82,13 +82,13 @@ def test_expanding_min(window_series_pd, min_periods, gap): (0, 1), ], ) -def test_expanding_max(window_series_pd, min_periods, gap): - test = window_series_pd.shift(gap) +def test_expanding_max(window_series, min_periods, gap): + test = window_series.shift(gap) expected = test.expanding(min_periods=min_periods).max().values primitive_instance = ExpandingMax(min_periods=min_periods, gap=gap).get_function() actual = primitive_instance( - numeric=window_series_pd, - datetime=window_series_pd.index, + numeric=window_series, + datetime=window_series.index, ) pd.testing.assert_series_equal(pd.Series(actual), pd.Series(expected)) @@ -102,13 +102,13 @@ def test_expanding_max(window_series_pd, min_periods, gap): (0, 1), ], ) -def test_expanding_std(window_series_pd, min_periods, gap): - test = window_series_pd.shift(gap) +def test_expanding_std(window_series, min_periods, gap): + test = window_series.shift(gap) expected = test.expanding(min_periods=min_periods).std().values primitive_instance = ExpandingSTD(min_periods=min_periods, gap=gap).get_function() actual = primitive_instance( - numeric=window_series_pd, - datetime=window_series_pd.index, + numeric=window_series, + datetime=window_series.index, ) pd.testing.assert_series_equal(pd.Series(actual), pd.Series(expected)) @@ -122,13 +122,13 @@ def test_expanding_std(window_series_pd, min_periods, gap): (0, 1), ], ) -def test_expanding_mean(window_series_pd, min_periods, gap): - test = window_series_pd.shift(gap) +def test_expanding_mean(window_series, min_periods, gap): + test = window_series.shift(gap) expected = test.expanding(min_periods=min_periods).mean().values primitive_instance = ExpandingMean(min_periods=min_periods, gap=gap).get_function() actual = primitive_instance( - numeric=window_series_pd, - datetime=window_series_pd.index, + numeric=window_series, + datetime=window_series.index, ) pd.testing.assert_series_equal(pd.Series(actual), pd.Series(expected)) @@ -142,13 +142,13 @@ def test_expanding_mean(window_series_pd, min_periods, gap): (0, 1), ], ) -def test_expanding_trend(window_series_pd, min_periods, gap): - test = window_series_pd.shift(gap) +def test_expanding_trend(window_series, 
min_periods, gap): + test = window_series.shift(gap) expected = test.expanding(min_periods=min_periods).aggregate(calculate_trend).values primitive_instance = ExpandingTrend(min_periods=min_periods, gap=gap).get_function() actual = primitive_instance( - numeric=window_series_pd, - datetime=window_series_pd.index, + numeric=window_series, + datetime=window_series.index, ) pd.testing.assert_series_equal(pd.Series(actual), pd.Series(expected)) @@ -164,7 +164,7 @@ def test_expanding_trend(window_series_pd, min_periods, gap): ], ) def test_expanding_primitives_throw_error_when_given_string_offset( - window_series_pd, + window_series, primitive, ): error_msg = ( @@ -172,19 +172,19 @@ def test_expanding_primitives_throw_error_when_given_string_offset( ) with pytest.raises(TypeError, match=error_msg): primitive(gap="2H").get_function()( - numeric=window_series_pd, - datetime=window_series_pd.index, + numeric=window_series, + datetime=window_series.index, ) def test_apply_gap_for_expanding_primitives_throws_error_when_given_string_offset( - window_series_pd, + window_series, ): error_msg = ( "String offsets are not supported for the gap parameter in Expanding primitives" ) with pytest.raises(TypeError, match=error_msg): - _apply_gap_for_expanding_primitives(window_series_pd, gap="2H") + _apply_gap_for_expanding_primitives(window_series, gap="2H") @pytest.mark.parametrize( @@ -196,9 +196,9 @@ def test_apply_gap_for_expanding_primitives_throws_error_when_given_string_offse 0, ], ) -def test_apply_gap_for_expanding_primitives(window_series_pd, gap): - actual = _apply_gap_for_expanding_primitives(window_series_pd, gap).values - expected = window_series_pd.shift(gap).values +def test_apply_gap_for_expanding_primitives(window_series, gap): + actual = _apply_gap_for_expanding_primitives(window_series, gap).values + expected = window_series.shift(gap).values pd.testing.assert_series_equal(pd.Series(actual), pd.Series(expected)) @@ -212,11 +212,11 @@ def test_apply_gap_for_expanding_primitives(window_series_pd, gap): ], ) def test_apply_gap_for_expanding_primitives_handles_date_range( - window_date_range_pd, + window_date_range, gap, ): actual = pd.Series( - _apply_gap_for_expanding_primitives(window_date_range_pd, gap).values, + _apply_gap_for_expanding_primitives(window_date_range, gap).values, ) - expected = pd.Series(window_date_range_pd.to_series().shift(gap).values) + expected = pd.Series(window_date_range.to_series().shift(gap).values) pd.testing.assert_series_equal(actual, expected) diff --git a/featuretools/tests/primitive_tests/transform_primitive_tests/test_full_name_primitives.py b/featuretools/tests/primitive_tests/transform_primitive_tests/test_full_name_primitives.py index 9544fc5bd0..192a0e9006 100644 --- a/featuretools/tests/primitive_tests/transform_primitive_tests/test_full_name_primitives.py +++ b/featuretools/tests/primitive_tests/transform_primitive_tests/test_full_name_primitives.py @@ -74,11 +74,11 @@ def test_nan(self): answer = pd.Series(["James", np.nan, np.nan]) pd.testing.assert_series_equal(primitive_func(names), answer, check_names=False) - def test_with_featuretools(self, pd_es): + def test_with_featuretools(self, es): transform, aggregation = find_applicable_primitives(self.primitive) primitive_instance = self.primitive() transform.append(primitive_instance) - valid_dfs(pd_es, aggregation, transform, self.primitive) + valid_dfs(es, aggregation, transform, self.primitive) class TestFullNameToLastName(PrimitiveTestBase): @@ -140,11 +140,11 @@ def test_nan(self): answer = 
pd.Series(["Brown", np.nan, np.nan]) pd.testing.assert_series_equal(primitive_func(names), answer, check_names=False) - def test_with_featuretools(self, pd_es): + def test_with_featuretools(self, es): transform, aggregation = find_applicable_primitives(self.primitive) primitive_instance = self.primitive() transform.append(primitive_instance) - valid_dfs(pd_es, aggregation, transform, self.primitive) + valid_dfs(es, aggregation, transform, self.primitive) class TestFullNameToTitle(PrimitiveTestBase): @@ -184,8 +184,8 @@ def test_nan(self): answer = pd.Series(["Mr", np.nan, np.nan]) pd.testing.assert_series_equal(primitive_func(names), answer, check_names=False) - def test_with_featuretools(self, pd_es): + def test_with_featuretools(self, es): transform, aggregation = find_applicable_primitives(self.primitive) primitive_instance = self.primitive() transform.append(primitive_instance) - valid_dfs(pd_es, aggregation, transform, self.primitive) + valid_dfs(es, aggregation, transform, self.primitive) diff --git a/featuretools/tests/primitive_tests/transform_primitive_tests/test_percent_change.py b/featuretools/tests/primitive_tests/transform_primitive_tests/test_percent_change.py index 3308c41a7f..bdd59cc13c 100644 --- a/featuretools/tests/primitive_tests/transform_primitive_tests/test_percent_change.py +++ b/featuretools/tests/primitive_tests/transform_primitive_tests/test_percent_change.py @@ -69,8 +69,8 @@ def test_freq(self): given_answer = primtive_func(data) np.testing.assert_array_equal(given_answer, answer) - def test_with_featuretools(self, pd_es): + def test_with_featuretools(self, es): transform, aggregation = find_applicable_primitives(self.primitive) primitive_instantiate = self.primitive transform.append(primitive_instantiate) - valid_dfs(pd_es, aggregation, transform, self.primitive) + valid_dfs(es, aggregation, transform, self.primitive) diff --git a/featuretools/tests/primitive_tests/transform_primitive_tests/test_postal_primitives.py b/featuretools/tests/primitive_tests/transform_primitive_tests/test_postal_primitives.py index 295d40da7c..99d211c3e3 100644 --- a/featuretools/tests/primitive_tests/transform_primitive_tests/test_postal_primitives.py +++ b/featuretools/tests/primitive_tests/transform_primitive_tests/test_postal_primitives.py @@ -4,17 +4,14 @@ OneDigitPostalCode, TwoDigitPostalCode, ) -from featuretools.tests.testing_utils.es_utils import to_pandas def test_one_digit_postal_code(postal_code_dataframe): primitive = OneDigitPostalCode().get_function() for x in postal_code_dataframe: series = postal_code_dataframe[x] - actual = to_pandas(primitive(series)) - expected = to_pandas( - series.apply(lambda t: str(t)[0] if pd.notna(t) else pd.NA), - ) + actual = primitive(series) + expected = series.apply(lambda t: str(t)[0] if pd.notna(t) else pd.NA) pd.testing.assert_series_equal(actual, expected) @@ -22,8 +19,6 @@ def test_two_digit_postal_code(postal_code_dataframe): primitive = TwoDigitPostalCode().get_function() for x in postal_code_dataframe: series = postal_code_dataframe[x] - actual = to_pandas(primitive(series)) - expected = to_pandas( - series.apply(lambda t: str(t)[:2] if pd.notna(t) else pd.NA), - ) + actual = primitive(series) + expected = series.apply(lambda t: str(t)[:2] if pd.notna(t) else pd.NA) pd.testing.assert_series_equal(actual, expected) diff --git a/featuretools/tests/requirement_files/latest_requirements.txt b/featuretools/tests/requirement_files/latest_requirements.txt index 2de23e6eab..c00d4ecee2 100644 --- 
a/featuretools/tests/requirement_files/latest_requirements.txt +++ b/featuretools/tests/requirement_files/latest_requirements.txt @@ -5,7 +5,6 @@ holidays==0.46 numpy==1.26.4 pandas==2.2.1 psutil==5.9.8 -pyspark==3.5.1 scipy==1.13.0 tqdm==4.66.2 woodwork==0.29.0 diff --git a/featuretools/tests/requirement_files/minimum_dask_requirements.txt b/featuretools/tests/requirement_files/minimum_dask_requirements.txt index 9a8279947a..5041c3a7b3 100644 --- a/featuretools/tests/requirement_files/minimum_dask_requirements.txt +++ b/featuretools/tests/requirement_files/minimum_dask_requirements.txt @@ -9,4 +9,3 @@ psutil==5.7.0 scipy==1.10.0 tqdm==4.32.0 woodwork==0.28.0 -woodwork[dask]==0.28.0 diff --git a/featuretools/tests/requirement_files/minimum_spark_requirements.txt b/featuretools/tests/requirement_files/minimum_spark_requirements.txt deleted file mode 100644 index 546dd911d5..0000000000 --- a/featuretools/tests/requirement_files/minimum_spark_requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -cloudpickle==1.5.0 -holidays==0.17 -numpy==1.25.0 -packaging==20.0 -pandas==2.0.0 -psutil==5.7.0 -pyarrow==14.0.1 -pyspark==3.5.0 -scipy==1.10.0 -tqdm==4.32.0 -woodwork==0.28.0 -woodwork[spark]==0.28.0 diff --git a/featuretools/tests/selection/test_selection.py b/featuretools/tests/selection/test_selection.py index d3f20e82aa..337485cd96 100644 --- a/featuretools/tests/selection/test_selection.py +++ b/featuretools/tests/selection/test_selection.py @@ -31,12 +31,11 @@ def feature_matrix(): @pytest.fixture -def test_es(pd_es, feature_matrix): - pd_es.add_dataframe(dataframe_name="test", dataframe=feature_matrix, index="test") - return pd_es +def test_es(es, feature_matrix): + es.add_dataframe(dataframe_name="test", dataframe=feature_matrix, index="test") + return es -# remove low information features not supported in Dask def test_remove_low_information_feature_names(feature_matrix): feature_matrix = remove_low_information_features(feature_matrix) assert feature_matrix.shape == (3, 5) @@ -44,7 +43,6 @@ def test_remove_low_information_feature_names(feature_matrix): assert "all_null" not in feature_matrix.columns -# remove low information features not supported in Dask def test_remove_low_information_features(test_es, feature_matrix): features = [Feature(test_es["test"].ww[col]) for col in test_es["test"].columns] feature_matrix, features = remove_low_information_features(feature_matrix, features) diff --git a/featuretools/tests/synthesis/test_dask_dfs.py b/featuretools/tests/synthesis/test_dask_dfs.py deleted file mode 100644 index 1464f5a00d..0000000000 --- a/featuretools/tests/synthesis/test_dask_dfs.py +++ /dev/null @@ -1,512 +0,0 @@ -import pandas as pd -import pytest -from woodwork.logical_types import ( - Datetime, - Double, - Integer, - IntegerNullable, - NaturalLanguage, -) - -from featuretools import dfs -from featuretools.entityset import EntitySet -from featuretools.utils.gen_utils import import_or_none - -dd = import_or_none("dask.dataframe") - - -@pytest.mark.skipif("not dd") -def test_single_table_dask_entityset(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - dask_es = EntitySet(id="dask_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - values_dd = dd.from_pandas(df, npartitions=2) - 
ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage} - dask_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - ) - - # Use the same columns and make sure both indexes are sorted the same - # update the type of the future index column so it doesn't conflict with the pandas fm - dask_fm = dask_fm.compute().astype({"id": "int64"}) - dask_computed_fm = dask_fm.set_index("id").loc[fm.index][fm.columns] - pd.testing.assert_frame_equal(fm, dask_computed_fm, check_dtype=False) - - -@pytest.mark.skipif("not dd") -def test_single_table_dask_entityset_ids_not_sorted(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - dask_es = EntitySet(id="dask_es") - df = pd.DataFrame( - { - "id": [2, 0, 1, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - values_dd = dd.from_pandas(df, npartitions=2) - ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage} - dask_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - ) - - # Make sure both indexes are sorted the same - dask_fm = dask_fm.compute().astype({"id": "int64"}) - pd.testing.assert_frame_equal( - fm, - dask_fm.set_index("id").loc[fm.index], - check_dtype=False, - ) - - -@pytest.mark.skipif("not dd") -def test_single_table_dask_entityset_with_instance_ids(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - instance_ids = [0, 1, 3] - - dask_es = EntitySet(id="dask_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - - values_dd = dd.from_pandas(df, npartitions=2) - ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage} - dask_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - instance_ids=instance_ids, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - instance_ids=instance_ids, - ) - - # Make sure both indexes are sorted the same - dask_fm = 
dask_fm.compute().astype({"id": "int64"}) - pd.testing.assert_frame_equal( - fm, - dask_fm.set_index("id").loc[fm.index], - check_dtype=False, - ) - - -@pytest.mark.skipif("not dd") -def test_single_table_dask_entityset_single_cutoff_time(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - dask_es = EntitySet(id="dask_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - values_dd = dd.from_pandas(df, npartitions=2) - ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage} - dask_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=pd.Timestamp("2019-01-05 04:00"), - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=pd.Timestamp("2019-01-05 04:00"), - ) - - # Make sure both indexes are sorted the same - dask_fm = dask_fm.compute().astype({"id": "int64"}) - pd.testing.assert_frame_equal( - fm, - dask_fm.set_index("id").loc[fm.index], - check_dtype=False, - ) - - -@pytest.mark.skipif("not dd") -def test_single_table_dask_entityset_cutoff_time_df(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - dask_es = EntitySet(id="dask_es") - df = pd.DataFrame( - { - "id": [0, 1, 2], - "values": [1, 12, -34], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - ], - "strings": ["I am a string", "23", "abcdef ghijk"], - }, - ) - values_dd = dd.from_pandas(df, npartitions=2) - ltypes = {"values": IntegerNullable, "dates": Datetime, "strings": NaturalLanguage} - dask_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - ids = [0, 1, 2, 0] - times = [ - pd.Timestamp("2019-01-05 04:00"), - pd.Timestamp("2019-01-05 04:00"), - pd.Timestamp("2019-01-05 04:00"), - pd.Timestamp("2019-01-15 04:00"), - ] - labels = [True, False, True, False] - cutoff_times = pd.DataFrame( - {"id": ids, "time": times, "labels": labels}, - columns=["id", "time", "labels"], - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=cutoff_times, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=cutoff_times, - ) - # Because row ordering with Dask is not guaranteed, we need to sort on two columns to make sure that values - # for instance id 0 are compared correctly. Also, make sure the index column has the same dtype. 
- fm = fm.sort_values(["id", "labels"]) - dask_fm = dask_fm.compute().astype({"id": "int64"}) - dask_fm = dask_fm.set_index("id").sort_values(["id", "labels"]) - pd.testing.assert_frame_equal(fm, dask_fm, check_dtype=False) - - -@pytest.mark.skipif("not dd") -def test_single_table_dask_entityset_dates_not_sorted(): - dask_es = EntitySet(id="dask_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - }, - ) - - primitives_list = ["absolute", "is_weekend", "year", "day"] - values_dd = dd.from_pandas(df, npartitions=1) - ltypes = { - "values": Integer, - "dates": Datetime, - } - dask_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - max_depth=1, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - max_depth=1, - ) - - dask_fm = dask_fm.compute().astype({"id": "int64"}) - pd.testing.assert_frame_equal( - fm, - dask_fm.set_index("id").loc[fm.index], - check_dtype=False, - ) - - -@pytest.mark.skipif("not dd") -def test_dask_entityset_secondary_time_index(): - log_df = pd.DataFrame() - log_df["id"] = [0, 1, 2, 3] - log_df["scheduled_time"] = pd.to_datetime( - ["2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01"], - ) - log_df["departure_time"] = pd.to_datetime( - [ - "2019-02-01 09:00", - "2019-02-06 10:00", - "2019-02-12 10:00", - "2019-03-01 11:30", - ], - ) - log_df["arrival_time"] = pd.to_datetime( - [ - "2019-02-01 11:23", - "2019-02-06 12:45", - "2019-02-12 13:53", - "2019-03-01 14:07", - ], - ) - log_df["delay"] = [-2, 10, 60, 0] - log_df["flight_id"] = [0, 1, 0, 1] - log_dask = dd.from_pandas(log_df, npartitions=2) - - flights_df = pd.DataFrame() - flights_df["id"] = [0, 1, 2, 3] - flights_df["origin"] = ["BOS", "LAX", "BOS", "LAX"] - flights_dask = dd.from_pandas(flights_df, npartitions=2) - - pd_es = EntitySet("flights") - dask_es = EntitySet("flights_dask") - - log_ltypes = { - "scheduled_time": Datetime, - "departure_time": Datetime, - "arrival_time": Datetime, - "delay": Double, - } - - pd_es.add_dataframe( - dataframe_name="logs", - dataframe=log_df, - index="id", - time_index="scheduled_time", - secondary_time_index={"arrival_time": ["departure_time", "delay"]}, - logical_types=log_ltypes, - ) - - dask_es.add_dataframe( - dataframe_name="logs", - dataframe=log_dask, - index="id", - logical_types=log_ltypes, - semantic_tags={"flight_id": "foreign_key"}, - time_index="scheduled_time", - secondary_time_index={"arrival_time": ["departure_time", "delay"]}, - ) - - pd_es.add_dataframe(dataframe_name="flights", dataframe=flights_df, index="id") - flights_ltypes = pd_es["flights"].ww.logical_types - dask_es.add_dataframe( - dataframe_name="flights", - dataframe=flights_dask, - index="id", - logical_types=flights_ltypes, - ) - - pd_es.add_relationship("flights", "id", "logs", "flight_id") - dask_es.add_relationship("flights", "id", "logs", "flight_id") - - cutoff_df = pd.DataFrame() - cutoff_df["id"] = [0, 1, 1] - cutoff_df["time"] = pd.to_datetime(["2019-02-02", "2019-02-02", "2019-02-20"]) - - fm, _ = dfs( - 
entityset=pd_es, - target_dataframe_name="logs", - cutoff_time=cutoff_df, - agg_primitives=["max"], - trans_primitives=["month"], - ) - - dask_fm, _ = dfs( - entityset=dask_es, - target_dataframe_name="logs", - cutoff_time=cutoff_df, - agg_primitives=["max"], - trans_primitives=["month"], - ) - - # Make sure both matrixes are sorted the same - # Also need to account for index differences - dask_fm_computed = dask_fm.compute().astype({"id": "int64"}).set_index("id") - pd.testing.assert_frame_equal( - fm.sort_values("delay"), - dask_fm_computed.sort_values("delay"), - check_dtype=False, - ) diff --git a/featuretools/tests/synthesis/test_deep_feature_synthesis.py b/featuretools/tests/synthesis/test_deep_feature_synthesis.py index 680cd520ee..f80cfbffca 100644 --- a/featuretools/tests/synthesis/test_deep_feature_synthesis.py +++ b/featuretools/tests/synthesis/test_deep_feature_synthesis.py @@ -58,7 +58,6 @@ make_ecommerce_entityset, number_of_features_with_name_like, ) -from featuretools.utils.gen_utils import Library def test_makes_agg_features_from_str(es): @@ -128,37 +127,6 @@ def find_other_agg_features(features): assert len(other_agg_features) == 0 -def test_errors_unsupported_primitives(es): - bad_trans_prim = CumSum() - bad_agg_prim = NumUnique() - bad_trans_prim.compatibility, bad_agg_prim.compatibility = [], [] - library = es.dataframe_type - error_text = "Selected primitives are incompatible with {} EntitySets: cum_sum, num_unique".format( - library.value, - ) - with pytest.raises(ValueError, match=error_text): - DeepFeatureSynthesis( - target_dataframe_name="sessions", - entityset=es, - agg_primitives=[bad_agg_prim], - trans_primitives=[bad_trans_prim], - ) - - -def test_errors_unsupported_primitives_spark(spark_es): - bad_trans_prim = CumSum() - bad_agg_prim = NumUnique() - bad_trans_prim.spark_compatible, bad_agg_prim.spark_compatible = False, False - error_text = "Selected primitives are incompatible with Spark EntitySets: cum_sum" - with pytest.raises(ValueError, match=error_text): - DeepFeatureSynthesis( - target_dataframe_name="sessions", - entityset=spark_es, - agg_primitives=[bad_agg_prim], - trans_primitives=[bad_trans_prim], - ) - - def test_error_for_missing_target_dataframe(es): error_text = ( "Provided target dataframe missing_dataframe does not exist in ecommerce" @@ -285,10 +253,10 @@ def test_makes_trans_feat(es): assert feature_with_name(features, "HOUR(datetime)") -def test_handles_diff_dataframe_groupby(pd_es): +def test_handles_diff_dataframe_groupby(es): dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], groupby_trans_primitives=[Diff], ) @@ -298,10 +266,10 @@ def test_handles_diff_dataframe_groupby(pd_es): assert feature_with_name(features, "DIFF(value) by product_id") -def test_handles_time_since_previous_dataframe_groupby(pd_es): +def test_handles_time_since_previous_dataframe_groupby(es): dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], groupby_trans_primitives=[TimeSincePrevious], ) @@ -311,9 +279,9 @@ def test_handles_time_since_previous_dataframe_groupby(pd_es): # M TODO -# def test_handles_cumsum_dataframe_groupby(pd_es): +# def test_handles_cumsum_dataframe_groupby(es): # dfs_obj = DeepFeatureSynthesis(target_dataframe_name='sessions', -# entityset=pd_es, +# entityset=es, # agg_primitives=[], # trans_primitives=[CumMean]) @@ -371,9 +339,6 @@ def test_makes_agg_features_of_trans_primitives(es): def 
test_makes_agg_features_with_where(es): - # TODO: Update to work with Dask and Spark `es` fixture when issue #978 is closed - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask EntitySets do not support add_interesting_values") es.add_interesting_values() dfs_obj = DeepFeatureSynthesis( @@ -391,10 +356,10 @@ def test_makes_agg_features_with_where(es): assert feature_with_name(features, "COUNT(log WHERE products.department = food)") -def test_make_groupby_features(pd_es): +def test_make_groupby_features(es): dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[], groupby_trans_primitives=["cum_sum"], @@ -403,10 +368,10 @@ def test_make_groupby_features(pd_es): assert feature_with_name(features, "CUM_SUM(value) by session_id") -def test_make_indirect_groupby_features(pd_es): +def test_make_indirect_groupby_features(es): dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[], groupby_trans_primitives=["cum_sum"], @@ -415,15 +380,15 @@ def test_make_indirect_groupby_features(pd_es): assert feature_with_name(features, "CUM_SUM(products.rating) by session_id") -def test_make_groupby_features_with_id(pd_es): +def test_make_groupby_features_with_id(es): # Need to convert customer_id to categorical column in order to build desired feature - pd_es["sessions"].ww.set_types( + es["sessions"].ww.set_types( logical_types={"customer_id": "Categorical"}, semantic_tags={"customer_id": "foreign_key"}, ) dfs_obj = DeepFeatureSynthesis( target_dataframe_name="sessions", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[], groupby_trans_primitives=["cum_count"], @@ -433,15 +398,15 @@ def test_make_groupby_features_with_id(pd_es): assert feature_with_name(features, "CUM_COUNT(customer_id) by customer_id") -def test_make_groupby_features_with_diff_id(pd_es): +def test_make_groupby_features_with_diff_id(es): # Need to convert cohort to categorical column in order to build desired feature - pd_es["customers"].ww.set_types( + es["customers"].ww.set_types( logical_types={"cohort": "Categorical"}, semantic_tags={"cohort": "foreign_key"}, ) dfs_obj = DeepFeatureSynthesis( target_dataframe_name="customers", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[], groupby_trans_primitives=["cum_count"], @@ -452,10 +417,10 @@ def test_make_groupby_features_with_diff_id(pd_es): assert feature_with_name(features, groupby_with_diff_id) -def test_make_groupby_features_with_agg(pd_es): +def test_make_groupby_features_with_agg(es): dfs_obj = DeepFeatureSynthesis( target_dataframe_name="cohorts", - entityset=pd_es, + entityset=es, agg_primitives=["sum"], trans_primitives=[], groupby_trans_primitives=["cum_sum"], @@ -498,7 +463,7 @@ def test_bad_groupby_feature(es): ("7d", "3d"), ], ) -def test_make_rolling_features(window_length, gap, rolling_primitive, pd_es): +def test_make_rolling_features(window_length, gap, rolling_primitive, es): rolling_primitive_obj = rolling_primitive( window_length=window_length, gap=gap, @@ -506,7 +471,7 @@ def test_make_rolling_features(window_length, gap, rolling_primitive, pd_es): ) dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[rolling_primitive_obj], ) @@ -522,11 +487,11 @@ def test_make_rolling_features(window_length, gap, rolling_primitive, pd_es): ("7d", "3d"), ], ) -def 
test_make_rolling_count_off_datetime_feature(window_length, gap, pd_es): +def test_make_rolling_count_off_datetime_feature(window_length, gap, es): rolling_count = RollingCount(window_length=window_length, min_periods=gap) dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[rolling_count], ) @@ -665,10 +630,6 @@ def test_seed_features(es): def test_does_not_make_agg_of_direct_of_target_dataframe(es): - # TODO: Update to work with Dask and Spark supported primitive - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask EntitySets do not support the Last primitive") - count_sessions = Feature( es["sessions"].ww["id"], parent_dataframe_name="customers", @@ -690,10 +651,6 @@ def test_does_not_make_agg_of_direct_of_target_dataframe(es): def test_dfs_builds_on_seed_features_more_than_max_depth(es): - # TODO: Update to work with Dask and Spark supported primitive - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask EntitySets do not support the Last and Mode primitives") - seed_feature_sessions = Feature( es["log"].ww["id"], parent_dataframe_name="sessions", @@ -752,10 +709,6 @@ def test_dfs_includes_seed_features_greater_than_max_depth(es): def test_allowed_paths(es): - # TODO: Update to work with Dask and Spark supported primitive - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask EntitySets do not support the Last primitive") - kwargs = dict( target_dataframe_name="customers", entityset=es, @@ -858,9 +811,6 @@ def test_where_primitives(es): def test_stacking_where_primitives(es): - # TODO: Update to work with Dask supported primitive - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask and Spark EntitySets do not support the Last primitive") es = copy.deepcopy(es) es.add_interesting_values(dataframe_name="sessions", values={"device_type": [0]}) es.add_interesting_values( @@ -948,9 +898,6 @@ def test_where_different_base_feats(es): def test_dfeats_where(es): - # TODO: Update to work with Dask `es` fixture when issue #978 is closed - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask and Spark EntitySets do not support add_interesting_values") es.add_interesting_values() dfs_obj = DeepFeatureSynthesis( @@ -1026,9 +973,6 @@ def test_transform_consistency(transform_es): def test_transform_no_stack_agg(es): - # TODO: Update to work with Dask and Spark supported primitives - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask EntitySets do not support the NMostCommon primitive") dfs_obj = DeepFeatureSynthesis( target_dataframe_name="customers", entityset=es, @@ -1059,9 +1003,6 @@ def test_initialized_trans_prim(es): def test_initialized_agg_prim(es): - # TODO: Update to work with Dask and Spark supported primitives - if es.dataframe_type != Library.PANDAS: - pytest.xfail("Dask EntitySets do not support the NMostCommon primitive") ThreeMost = NMostCommon(n=3) dfs_obj = DeepFeatureSynthesis( target_dataframe_name="sessions", @@ -1075,11 +1016,6 @@ def test_initialized_agg_prim(es): def test_return_types(es): - # TODO: Update to work with Dask and Spark supported primitive - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Dask and Spark EntitySets do not support the NMostCommon primitive", - ) dfs_obj = DeepFeatureSynthesis( target_dataframe_name="sessions", entityset=es, @@ -1182,12 +1118,6 @@ def test_makes_direct_features_through_multiple_relationships(games_es): def test_stacks_multioutput_features(es): - # TODO: Update to work with Dask and Spark 
supported primitive - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Dask EntitySets do not support the NumUnique and NMostCommon primitives", - ) - class TestTime(TransformPrimitive): name = "test_time" input_types = [ColumnSchema(logical_type=Datetime)] @@ -1217,11 +1147,6 @@ def test_f(x): def test_seed_multi_output_feature_stacking(es): - # TODO: Update to work with Dask and Spark supported primitive - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Dask EntitySets do not support the NMostCommon and NumUnique primitives", - ) threecommon = NMostCommon(3) tc = Feature( es["log"].ww["product_id"], @@ -1566,7 +1491,7 @@ def test_primitive_options_with_globals(es): ) -def test_primitive_options_groupbys(pd_es): +def test_primitive_options_groupbys(es): options = { "cum_count": {"include_groupby_dataframes": ["log", "customers"]}, "cum_sum": {"ignore_groupby_dataframes": ["sessions"]}, @@ -1583,7 +1508,7 @@ def test_primitive_options_groupbys(pd_es): dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[], max_depth=3, @@ -1621,10 +1546,6 @@ def test_primitive_options_groupbys(pd_es): def test_primitive_options_multiple_inputs(es): - if es.dataframe_type != Library.PANDAS: - pytest.xfail( - "Dask and Spark EntitySets do not support various primitives used in this test", - ) too_many_options = { "mode": [{"include_dataframes": ["logs"]}, {"ignore_dataframes": ["sessions"]}], } @@ -1782,7 +1703,6 @@ class AddThree(TransformPrimitive): ] return_type = ColumnSchema(semantic_tags={"numeric"}) commutative = True - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] def generate_name(self, base_feature_names): return "%s + %s + %s" % ( @@ -1975,8 +1895,6 @@ def test_does_not_build_features_on_last_time_index_col(es): def test_builds_features_using_all_input_types(es): - if es.dataframe_type == Library.SPARK: - pytest.skip("NumTrue primitive not compatible with Spark") new_log_df = es["log"] new_log_df.ww["purchased_nullable"] = es["log"]["purchased"] new_log_df.ww.set_types(logical_types={"purchased_nullable": "boolean_nullable"}) @@ -2016,12 +1934,12 @@ def test_builds_features_using_all_input_types(es): assert feature_with_name(agg_features, "NUM_TRUE(log.purchased_nullable)") -def test_make_groupby_features_with_depth_none(pd_es): +def test_make_groupby_features_with_depth_none(es): # If max_depth is set to -1, it sets it to None internally, so this # test validates code paths that have a None max_depth dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[], trans_primitives=[], groupby_trans_primitives=["cum_sum"], @@ -2031,14 +1949,14 @@ def test_make_groupby_features_with_depth_none(pd_es): assert feature_with_name(features, "CUM_SUM(value) by session_id") -def test_check_stacking_when_building_transform_features(pd_es): +def test_check_stacking_when_building_transform_features(es): class NewMean(Mean): name = "NEW_MEAN" base_of_exclude = [Absolute] dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[NewMean, "mean"], trans_primitives=["absolute"], max_depth=-1, @@ -2048,14 +1966,14 @@ class NewMean(Mean): assert number_of_features_with_name_like(features, "ABSOLUTE(NEW_MEAN") == 0 -def test_check_stacking_when_building_groupby_features(pd_es): +def test_check_stacking_when_building_groupby_features(es): class NewMean(Mean): name = "NEW_MEAN" base_of_exclude = 
[CumSum] dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=[NewMean, "mean"], groupby_trans_primitives=["cum_sum"], max_depth=5, @@ -2065,14 +1983,14 @@ class NewMean(Mean): assert number_of_features_with_name_like(features, "CUM_SUM(NEW_MEAN") == 0 -def test_check_stacking_when_building_agg_features(pd_es): +def test_check_stacking_when_building_agg_features(es): class NewAbsolute(Absolute): name = "NEW_ABSOLUTE" base_of_exclude = [Mean] dfs_obj = DeepFeatureSynthesis( target_dataframe_name="log", - entityset=pd_es, + entityset=es, agg_primitives=["mean"], trans_primitives=[NewAbsolute, "absolute"], max_depth=5, diff --git a/featuretools/tests/synthesis/test_dfs_method.py b/featuretools/tests/synthesis/test_dfs_method.py index ed7056bc87..042c2d83d3 100644 --- a/featuretools/tests/synthesis/test_dfs_method.py +++ b/featuretools/tests/synthesis/test_dfs_method.py @@ -18,10 +18,6 @@ from featuretools.primitives.base import AggregationPrimitive, TransformPrimitive from featuretools.synthesis import dfs from featuretools.synthesis.deep_feature_synthesis import DeepFeatureSynthesis -from featuretools.tests.testing_utils import to_pandas -from featuretools.utils.gen_utils import Library, import_or_none, is_instance - -dd = import_or_none("dask.dataframe") @pytest.fixture @@ -140,28 +136,11 @@ def test_accepts_cutoff_time_df(dataframes, relationships): target_dataframe_name="transactions", cutoff_time=cutoff_times_df, ) - feature_matrix = to_pandas(feature_matrix, index="id", sort_index=True) + feature_matrix = feature_matrix assert len(feature_matrix.index) == 3 assert len(feature_matrix.columns) == len(features) -@pytest.mark.skipif("not dd") -def test_warns_cutoff_time_dask(dataframes, relationships): - cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3], "time": [10, 12, 15]}) - cutoff_times_df = dd.from_pandas(cutoff_times_df, npartitions=2) - match = ( - "cutoff_time should be a Pandas DataFrame: " - "computing cutoff_time, this may take a while" - ) - with pytest.warns(UserWarning, match=match): - dfs( - dataframes=dataframes, - relationships=relationships, - target_dataframe_name="transactions", - cutoff_time=cutoff_times_df, - ) - - def test_accepts_cutoff_time_compose(dataframes, relationships): def fraud_occured(df): return df["fraud"].any() @@ -178,7 +157,7 @@ def fraud_occured(df): lm = cp.LabelMaker(**kwargs) - transactions_df = to_pandas(dataframes["transactions"][0]) + transactions_df = dataframes["transactions"][0] labels = lm.search(transactions_df, num_examples_per_instance=-1) @@ -191,7 +170,6 @@ def fraud_occured(df): target_dataframe_name="cards", cutoff_time=labels, ) - feature_matrix = to_pandas(feature_matrix, index="id") assert len(feature_matrix.index) == 6 assert len(feature_matrix.columns) == len(features) + 1 @@ -203,7 +181,6 @@ def test_accepts_single_cutoff_time(dataframes, relationships): target_dataframe_name="transactions", cutoff_time=20, ) - feature_matrix = to_pandas(feature_matrix, index="id") assert len(feature_matrix.index) == 5 assert len(feature_matrix.columns) == len(features) @@ -215,7 +192,6 @@ def test_accepts_no_cutoff_time(dataframes, relationships): target_dataframe_name="transactions", instance_ids=[1, 2, 3, 5, 6], ) - feature_matrix = to_pandas(feature_matrix, index="id") assert len(feature_matrix.index) == 5 assert len(feature_matrix.columns) == len(features) @@ -230,20 +206,18 @@ def test_ignores_instance_ids_if_cutoff_df(dataframes, relationships): cutoff_time=cutoff_times_df, 
instance_ids=instance_ids, ) - feature_matrix = to_pandas(feature_matrix, index="id") assert len(feature_matrix.index) == 3 assert len(feature_matrix.columns) == len(features) -def test_approximate_features(pd_dataframes, relationships): - # TODO: Update to use Dask dataframes when issue #985 is closed +def test_approximate_features(dataframes, relationships): cutoff_times_df = pd.DataFrame( {"instance_id": [1, 3, 1, 5, 3, 6], "time": [11, 16, 16, 26, 17, 22]}, ) # force column to BooleanNullable - pd_dataframes["transactions"] += ({"fraud": "BooleanNullable"},) + dataframes["transactions"] += ({"fraud": "BooleanNullable"},) feature_matrix, features = dfs( - dataframes=pd_dataframes, + dataframes=dataframes, relationships=relationships, target_dataframe_name="transactions", cutoff_time=cutoff_times_df, @@ -259,10 +233,10 @@ def test_approximate_features(pd_dataframes, relationships): assert (feature_matrix[direct_agg_feat_name] == truth_values.values).all() -def test_all_columns(pd_dataframes, relationships): +def test_all_columns(dataframes, relationships): cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3], "time": [10, 12, 15]}) feature_matrix, features = dfs( - dataframes=pd_dataframes, + dataframes=dataframes, relationships=relationships, target_dataframe_name="transactions", cutoff_time=cutoff_times_df, @@ -291,20 +265,11 @@ def test_features_only(dataframes, relationships): features_only=True, ) - # pandas creates 11 features - # dask creates 10 features (no skew) - # spark creates 9 features (no skew, no percent_true) - if isinstance(dataframes["transactions"][0], pd.DataFrame): - expected_features = 11 - elif is_instance(dataframes["transactions"][0], dd, "DataFrame"): - expected_features = 10 - else: - expected_features = 9 + expected_features = 11 assert len(features) == expected_features def test_accepts_relative_training_window(datetime_es): - # TODO: Update to use Dask dataframes when issue #882 is closed feature_matrix, _ = dfs(entityset=datetime_es, target_dataframe_name="transactions") feature_matrix_2, _ = dfs( @@ -353,7 +318,6 @@ def test_accepts_relative_training_window(datetime_es): def test_accepts_pd_timedelta_training_window(datetime_es): - # TODO: Update to use Dask dataframes when issue #882 is closed feature_matrix, _ = dfs( entityset=datetime_es, target_dataframe_name="transactions", @@ -365,7 +329,6 @@ def test_accepts_pd_timedelta_training_window(datetime_es): def test_accepts_pd_dateoffset_training_window(datetime_es): - # TODO: Update to use Dask dataframes when issue #882 is closed feature_matrix, _ = dfs( entityset=datetime_es, target_dataframe_name="transactions", @@ -425,8 +388,6 @@ def test_handles_pandas_overflow_error(datetime_es): def test_warns_with_unused_primitives(es): - if es.dataframe_type == Library.SPARK: - pytest.skip("Spark throws extra warnings") trans_primitives = ["num_characters", "num_words", "add_numeric"] agg_primitives = [Max, "min"] @@ -489,11 +450,11 @@ def test_no_warns_with_camel_and_title_case(es): ) -def test_does_not_warn_with_stacking_feature(pd_es): +def test_does_not_warn_with_stacking_feature(es): with warnings.catch_warnings(): warnings.simplefilter("error") dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="régions", agg_primitives=["percent_true"], trans_primitives=[GreaterThanScalar(5)], @@ -505,8 +466,6 @@ def test_does_not_warn_with_stacking_feature(pd_es): def test_warns_with_unused_where_primitives(es): - if es.dataframe_type == Library.SPARK: - pytest.skip("Spark throws extra warnings") 
warning_text = ( "Some specified primitives were not used during DFS:\n" + " where_primitives: ['count', 'sum']\n" @@ -528,7 +487,7 @@ def test_warns_with_unused_where_primitives(es): assert record[0].message.args[0] == warning_text -def test_warns_with_unused_groupby_primitives(pd_es): +def test_warns_with_unused_groupby_primitives(es): warning_text = ( "Some specified primitives were not used during DFS:\n" + " groupby_trans_primitives: ['cum_sum']\n" @@ -539,7 +498,7 @@ def test_warns_with_unused_groupby_primitives(pd_es): with pytest.warns(UnusedPrimitiveWarning) as record: dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="sessions", groupby_trans_primitives=["cum_sum"], max_depth=1, @@ -552,7 +511,7 @@ def test_warns_with_unused_groupby_primitives(pd_es): with warnings.catch_warnings(): warnings.simplefilter("error") dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", groupby_trans_primitives=["cum_sum"], max_depth=1, @@ -560,7 +519,7 @@ def test_warns_with_unused_groupby_primitives(pd_es): ) -def test_warns_with_unused_custom_primitives(pd_es): +def test_warns_with_unused_custom_primitives(es): class AboveTen(TransformPrimitive): name = "above_ten" input_types = [ColumnSchema(semantic_tags={"numeric"})] @@ -578,7 +537,7 @@ class AboveTen(TransformPrimitive): with pytest.warns(UnusedPrimitiveWarning) as record: dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="sessions", trans_primitives=trans_primitives, max_depth=1, @@ -591,7 +550,7 @@ class AboveTen(TransformPrimitive): with warnings.catch_warnings(): warnings.simplefilter("error") dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="customers", trans_primitives=trans_primitives, max_depth=1, @@ -615,7 +574,7 @@ class MaxAboveTen(AggregationPrimitive): with pytest.warns(UnusedPrimitiveWarning) as record: dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="stores", agg_primitives=agg_primitives, max_depth=1, @@ -628,7 +587,7 @@ class MaxAboveTen(AggregationPrimitive): with warnings.catch_warnings(): warnings.simplefilter("error") dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="sessions", agg_primitives=agg_primitives, max_depth=1, @@ -666,7 +625,7 @@ def __call__(self, update, progress_percent, time_elapsed): assert np.isclose(mock_progress_callback.total_progress_percent, 100.0) -def test_calls_progress_callback_cluster(pd_dataframes, relationships, dask_cluster): +def test_calls_progress_callback_cluster(dataframes, relationships, dask_cluster): class MockProgressCallback: def __init__(self): self.progress_history = [] @@ -682,7 +641,7 @@ def __call__(self, update, progress_percent, time_elapsed): dkwargs = {"cluster": dask_cluster.scheduler.address} dfs( - dataframes=pd_dataframes, + dataframes=dataframes, relationships=relationships, target_dataframe_name="transactions", progress_callback=mock_progress_callback, @@ -693,10 +652,10 @@ def __call__(self, update, progress_percent, time_elapsed): assert np.isclose(mock_progress_callback.total_progress_percent, 100.0) -def test_dask_kwargs(pd_dataframes, relationships, dask_cluster): +def test_dask_kwargs(dataframes, relationships, dask_cluster): cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3], "time": [10, 12, 15]}) feature_matrix, features = dfs( - dataframes=pd_dataframes, + dataframes=dataframes, relationships=relationships, target_dataframe_name="transactions", cutoff_time=cutoff_times_df, @@ -704,7 +663,7 @@ def test_dask_kwargs(pd_dataframes, relationships, dask_cluster): dask_kwargs 
= {"cluster": dask_cluster.scheduler.address} feature_matrix_2, features_2 = dfs( - dataframes=pd_dataframes, + dataframes=dataframes, relationships=relationships, target_dataframe_name="transactions", cutoff_time=cutoff_times_df, diff --git a/featuretools/tests/synthesis/test_encode_features.py b/featuretools/tests/synthesis/test_encode_features.py index 420759269e..635d10d3f0 100644 --- a/featuretools/tests/synthesis/test_encode_features.py +++ b/featuretools/tests/synthesis/test_encode_features.py @@ -7,15 +7,15 @@ from featuretools.synthesis import encode_features -def test_encodes_features(pd_es): - f1 = IdentityFeature(pd_es["log"].ww["product_id"]) - f2 = IdentityFeature(pd_es["log"].ww["purchased"]) - f3 = IdentityFeature(pd_es["log"].ww["value"]) +def test_encodes_features(es): + f1 = IdentityFeature(es["log"].ww["product_id"]) + f2 = IdentityFeature(es["log"].ww["purchased"]) + f3 = IdentityFeature(es["log"].ww["value"]) features = [f1, f2, f3] feature_matrix = calculate_feature_matrix( features, - pd_es, + es, instance_ids=[0, 1, 2, 3, 4, 5], ) @@ -33,30 +33,13 @@ def test_encodes_features(pd_es): assert len(features_encoded) == 5 -def test_dask_errors_encode_features(dask_es): - f1 = IdentityFeature(dask_es["log"].ww["product_id"]) - f2 = IdentityFeature(dask_es["log"].ww["purchased"]) - f3 = IdentityFeature(dask_es["log"].ww["value"]) - - features = [f1, f2, f3] - feature_matrix = calculate_feature_matrix( - features, - dask_es, - instance_ids=[0, 1, 2, 3, 4, 5], - ) - error_text = "feature_matrix must be a Pandas DataFrame" - - with pytest.raises(TypeError, match=error_text): - encode_features(feature_matrix, features) - - -def test_inplace_encodes_features(pd_es): - f1 = IdentityFeature(pd_es["log"].ww["product_id"]) +def test_inplace_encodes_features(es): + f1 = IdentityFeature(es["log"].ww["product_id"]) features = [f1] feature_matrix = calculate_feature_matrix( features, - pd_es, + es, instance_ids=[0, 1, 2, 3, 4, 5], ) @@ -70,15 +53,15 @@ def test_inplace_encodes_features(pd_es): assert feature_matrix_encoded.shape == feature_matrix.shape -def test_to_encode_features(pd_es): - f1 = IdentityFeature(pd_es["log"].ww["product_id"]) - f2 = IdentityFeature(pd_es["log"].ww["value"]) - f3 = IdentityFeature(pd_es["log"].ww["datetime"]) +def test_to_encode_features(es): + f1 = IdentityFeature(es["log"].ww["product_id"]) + f2 = IdentityFeature(es["log"].ww["value"]) + f3 = IdentityFeature(es["log"].ww["datetime"]) features = [f1, f2, f3] feature_matrix = calculate_feature_matrix( features, - pd_es, + es, instance_ids=[0, 1, 2, 3, 4, 5], ) @@ -108,20 +91,20 @@ def test_to_encode_features(pd_es): assert feature_matrix_encoded["product_id"].dtype == "category" -def test_encode_features_handles_pass_columns(pd_es): - f1 = IdentityFeature(pd_es["log"].ww["product_id"]) - f2 = IdentityFeature(pd_es["log"].ww["value"]) +def test_encode_features_handles_pass_columns(es): + f1 = IdentityFeature(es["log"].ww["product_id"]) + f2 = IdentityFeature(es["log"].ww["value"]) features = [f1, f2] cutoff_time = pd.DataFrame( { "instance_id": range(6), - "time": pd_es["log"]["datetime"][0:6], + "time": es["log"]["datetime"][0:6], "label": [i % 2 for i in range(6)], }, columns=["instance_id", "time", "label"], ) - feature_matrix = calculate_feature_matrix(features, pd_es, cutoff_time) + feature_matrix = calculate_feature_matrix(features, es, cutoff_time) assert "label" in feature_matrix.columns @@ -148,21 +131,21 @@ def test_encode_features_handles_pass_columns(pd_es): assert "label" in 
feature_matrix_encoded.columns -def test_encode_features_catches_features_mismatch(pd_es): - f1 = IdentityFeature(pd_es["log"].ww["product_id"]) - f2 = IdentityFeature(pd_es["log"].ww["value"]) - f3 = IdentityFeature(pd_es["log"].ww["session_id"]) +def test_encode_features_catches_features_mismatch(es): + f1 = IdentityFeature(es["log"].ww["product_id"]) + f2 = IdentityFeature(es["log"].ww["value"]) + f3 = IdentityFeature(es["log"].ww["session_id"]) features = [f1, f2] cutoff_time = pd.DataFrame( { "instance_id": range(6), - "time": pd_es["log"]["datetime"][0:6], + "time": es["log"]["datetime"][0:6], "label": [i % 2 for i in range(6)], }, columns=["instance_id", "time", "label"], ) - feature_matrix = calculate_feature_matrix(features, pd_es, cutoff_time) + feature_matrix = calculate_feature_matrix(features, es, cutoff_time) assert "label" in feature_matrix.columns @@ -177,15 +160,15 @@ def test_encode_unknown_features(): {"category": "category"}, ) - pd_es = EntitySet("test") - pd_es.add_dataframe( + es = EntitySet("test") + es.add_dataframe( dataframe_name="a", dataframe=df, index="index", make_index=True, ) features, feature_defs = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="a", max_depth=1, ) @@ -202,14 +185,14 @@ def test_encode_unknown_features(): ] -def test_encode_features_topn(pd_es): +def test_encode_features_topn(es): topn = Feature( - Feature(pd_es["log"].ww["product_id"]), + Feature(es["log"].ww["product_id"]), parent_dataframe_name="customers", primitive=NMostCommon(n=3), ) features, feature_defs = dfs( - entityset=pd_es, + entityset=es, instance_ids=[0, 1, 2], target_dataframe_name="customers", agg_primitives=[NMostCommon(n=3)], @@ -229,15 +212,15 @@ def test_encode_features_drop_first(): df = pd.DataFrame({"category": ["ao", "b", "c", "d", "e"]}).astype( {"category": "category"}, ) - pd_es = EntitySet("test") - pd_es.add_dataframe( + es = EntitySet("test") + es.add_dataframe( dataframe_name="a", dataframe=df, index="index", make_index=True, ) features, feature_defs = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="a", max_depth=1, ) @@ -260,13 +243,13 @@ def test_encode_features_drop_first(): assert len(features_enc.columns) == 2 -def test_encode_features_handles_dictionary_input(pd_es): - f1 = IdentityFeature(pd_es["log"].ww["product_id"]) - f2 = IdentityFeature(pd_es["log"].ww["purchased"]) - f3 = IdentityFeature(pd_es["log"].ww["session_id"]) +def test_encode_features_handles_dictionary_input(es): + f1 = IdentityFeature(es["log"].ww["product_id"]) + f2 = IdentityFeature(es["log"].ww["purchased"]) + f3 = IdentityFeature(es["log"].ww["session_id"]) features = [f1, f2, f3] - feature_matrix = calculate_feature_matrix(features, pd_es, instance_ids=range(16)) + feature_matrix = calculate_feature_matrix(features, es, instance_ids=range(16)) feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features) true_values = [ "product_id = coke zero", @@ -347,15 +330,15 @@ def test_encode_features_matches_calculate_feature_matrix(): {"category": "category"}, ) - pd_es = EntitySet("test") - pd_es.add_dataframe( + es = EntitySet("test") + es.add_dataframe( dataframe_name="a", dataframe=df, index="index", make_index=True, ) features, feature_defs = dfs( - entityset=pd_es, + entityset=es, target_dataframe_name="a", max_depth=1, ) @@ -366,7 +349,7 @@ def test_encode_features_matches_calculate_feature_matrix(): to_encode=["category"], ) - features_calc = calculate_feature_matrix(feature_defs_enc, entityset=pd_es) + features_calc = 
calculate_feature_matrix(feature_defs_enc, entityset=es) pd.testing.assert_frame_equal(features_enc, features_calc) assert features_calc.ww._schema == features_enc.ww._schema diff --git a/featuretools/tests/synthesis/test_get_valid_primitives.py b/featuretools/tests/synthesis/test_get_valid_primitives.py index 6bf5b73036..98e51e6470 100644 --- a/featuretools/tests/synthesis/test_get_valid_primitives.py +++ b/featuretools/tests/synthesis/test_get_valid_primitives.py @@ -11,7 +11,6 @@ TransformPrimitive, ) from featuretools.synthesis.get_valid_primitives import get_valid_primitives -from featuretools.utils.gen_utils import Library def test_get_valid_primitives_selected_primitives(es): @@ -61,14 +60,14 @@ def test_invalid_primitive(es): ) msg = ( - "Selected primitive " + "Selected primitive " "is not an AggregationPrimitive, TransformPrimitive, or str" ) with pytest.raises(ValueError, match=msg): get_valid_primitives( es, target_dataframe_name="log", - selected_primitives=[Library], + selected_primitives=[ColumnSchema], ) @@ -78,14 +77,10 @@ def test_primitive_compatibility(es): "customers", selected_primitives=[TimeSincePrevious], ) + assert len(trans_prims) == 1 - if es.dataframe_type != Library.PANDAS: - assert len(trans_prims) == 0 - else: - assert len(trans_prims) == 1 - -def test_get_valid_primitives_custom_primitives(pd_es): +def test_get_valid_primitives_custom_primitives(es): class ThreeMostCommonCat(AggregationPrimitive): name = "n_most_common_categorical" input_types = [ColumnSchema(semantic_tags={"category"})] @@ -101,9 +96,8 @@ class AddThree(TransformPrimitive): ] return_type = ColumnSchema(semantic_tags="numeric") commutative = True - compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] - agg_prims, trans_prims = get_valid_primitives(pd_es, "log") + agg_prims, trans_prims = get_valid_primitives(es, "log") assert ThreeMostCommonCat not in agg_prims assert AddThree not in trans_prims @@ -112,7 +106,7 @@ class AddThree(TransformPrimitive): match="'add_three' is not a recognized primitive name", ): agg_prims, trans_prims = get_valid_primitives( - pd_es, + es, "log", 2, [ThreeMostCommonCat, "add_three"], diff --git a/featuretools/tests/synthesis/test_spark_dfs.py b/featuretools/tests/synthesis/test_spark_dfs.py deleted file mode 100644 index c39bdf8201..0000000000 --- a/featuretools/tests/synthesis/test_spark_dfs.py +++ /dev/null @@ -1,534 +0,0 @@ -import pandas as pd -import pytest -from woodwork.logical_types import ( - Datetime, - Double, - Integer, - IntegerNullable, - NaturalLanguage, -) - -from featuretools import dfs -from featuretools.entityset import EntitySet -from featuretools.utils.gen_utils import import_or_none - -ps = import_or_none("pyspark.pandas") - - -@pytest.mark.skipif("not ps") -def test_single_table_spark_entityset(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - spark_es = EntitySet(id="spark_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - values_dd = ps.from_pandas(df) - ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage} - spark_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - spark_fm, _ = dfs( - entityset=spark_es, - target_dataframe_name="data", - 
trans_primitives=primitives_list, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - ) - - spark_fm = spark_fm.to_pandas().astype({"id": "int64"}) - spark_computed_fm = spark_fm.set_index("id").loc[fm.index][fm.columns] - # Spark dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing - pd.testing.assert_frame_equal( - fm.astype(spark_computed_fm.dtypes), - spark_computed_fm, - ) - - -@pytest.mark.skipif("not ps") -def test_single_table_spark_entityset_ids_not_sorted(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - spark_es = EntitySet(id="spark_es") - df = pd.DataFrame( - { - "id": [2, 0, 1, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - values_dd = ps.from_pandas(df) - ltypes = { - "values": Integer, - "dates": Datetime, - "strings": NaturalLanguage, - } - spark_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - spark_fm, _ = dfs( - entityset=spark_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - ) - - spark_fm = spark_fm.to_pandas().astype({"id": "int64"}) - spark_computed_fm = spark_fm.set_index("id").loc[fm.index] - # Spark dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing - pd.testing.assert_frame_equal( - fm.astype(spark_computed_fm.dtypes), - spark_computed_fm, - ) - - -@pytest.mark.skipif("not ps") -def test_single_table_spark_entityset_with_instance_ids(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - instance_ids = [0, 1, 3] - - spark_es = EntitySet(id="spark_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - - values_dd = ps.from_pandas(df) - ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage} - spark_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - spark_fm, _ = dfs( - entityset=spark_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - instance_ids=instance_ids, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - instance_ids=instance_ids, - ) - - spark_fm = spark_fm.to_pandas().astype({"id": "int64"}) - spark_computed_fm = spark_fm.set_index("id").loc[fm.index] - # Spark dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing - 
pd.testing.assert_frame_equal( - fm.astype(spark_computed_fm.dtypes), - spark_computed_fm, - ) - - -@pytest.mark.skipif("not ps") -def test_single_table_spark_entityset_single_cutoff_time(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - spark_es = EntitySet(id="spark_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - "strings": ["I am a string", "23", "abcdef ghijk", ""], - }, - ) - values_dd = ps.from_pandas(df) - ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage} - spark_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - logical_types=ltypes, - ) - - spark_fm, _ = dfs( - entityset=spark_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=pd.Timestamp("2019-01-05 04:00"), - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=pd.Timestamp("2019-01-05 04:00"), - ) - - spark_fm = spark_fm.to_pandas().astype({"id": "int64"}) - spark_computed_fm = spark_fm.set_index("id").loc[fm.index] - # Spark dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing - pd.testing.assert_frame_equal( - fm.astype(spark_computed_fm.dtypes), - spark_computed_fm, - ) - - -@pytest.mark.skipif("not ps") -def test_single_table_spark_entityset_cutoff_time_df(): - primitives_list = [ - "absolute", - "is_weekend", - "year", - "day", - "num_characters", - "num_words", - ] - - spark_es = EntitySet(id="spark_es") - df = pd.DataFrame( - { - "id": [0, 1, 2], - "values": [1, 12, -34], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - ], - "strings": ["I am a string", "23", "abcdef ghijk"], - }, - ) - values_dd = ps.from_pandas(df) - ltypes = {"values": IntegerNullable, "dates": Datetime, "strings": NaturalLanguage} - spark_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - ids = [0, 1, 2, 0] - times = [ - pd.Timestamp("2019-01-05 04:00"), - pd.Timestamp("2019-01-05 04:00"), - pd.Timestamp("2019-01-05 04:00"), - pd.Timestamp("2019-01-15 04:00"), - ] - labels = [True, False, True, False] - cutoff_times = pd.DataFrame( - {"id": ids, "time": times, "labels": labels}, - columns=["id", "time", "labels"], - ) - - spark_fm, _ = dfs( - entityset=spark_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=cutoff_times, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - cutoff_time=cutoff_times, - ) - # Because row ordering with spark is not guaranteed, `we need to sort on two columns to make sure that values - # for instance id 0 are compared correctly. Also, make sure the index column has the same dtype. 
- fm = fm.sort_values(["id", "labels"]) - spark_fm = spark_fm.to_pandas().astype({"id": "int64"}) - spark_fm = spark_fm.set_index("id").sort_values(["id", "labels"]) - - for column in fm.columns: - if fm[column].dtype.name == "category": - fm[column] = fm[column].astype("Int64").astype("string") - - pd.testing.assert_frame_equal( - fm.astype(spark_fm.dtypes), - spark_fm, - check_dtype=False, - ) - - -@pytest.mark.skipif("not ps") -def test_single_table_spark_entityset_dates_not_sorted(): - spark_es = EntitySet(id="spark_es") - df = pd.DataFrame( - { - "id": [0, 1, 2, 3], - "values": [1, 12, -34, 27], - "dates": [ - pd.to_datetime("2019-01-10"), - pd.to_datetime("2019-02-03"), - pd.to_datetime("2019-01-01"), - pd.to_datetime("2017-08-25"), - ], - }, - ) - - primitives_list = ["absolute", "is_weekend", "year", "day"] - values_dd = ps.from_pandas(df) - ltypes = { - "values": Integer, - "dates": Datetime, - } - spark_es.add_dataframe( - dataframe_name="data", - dataframe=values_dd, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - spark_fm, _ = dfs( - entityset=spark_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - max_depth=1, - ) - - pd_es = EntitySet(id="pd_es") - pd_es.add_dataframe( - dataframe_name="data", - dataframe=df, - index="id", - time_index="dates", - logical_types=ltypes, - ) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="data", - trans_primitives=primitives_list, - max_depth=1, - ) - - spark_fm = spark_fm.to_pandas().astype({"id": "int64"}) - spark_fm = spark_fm.set_index("id").loc[fm.index] - pd.testing.assert_frame_equal(fm.astype(spark_fm.dtypes), spark_fm) - - -@pytest.mark.skipif("not ps") -def test_spark_entityset_secondary_time_index(): - log_df = pd.DataFrame() - log_df["id"] = [0, 1, 2, 3] - log_df["scheduled_time"] = pd.to_datetime( - ["2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01"], - ) - log_df["departure_time"] = pd.to_datetime( - [ - "2019-02-01 09:00", - "2019-02-06 10:00", - "2019-02-12 10:00", - "2019-03-01 11:30", - ], - ) - log_df["arrival_time"] = pd.to_datetime( - [ - "2019-02-01 11:23", - "2019-02-06 12:45", - "2019-02-12 13:53", - "2019-03-01 14:07", - ], - ) - log_df["delay"] = [-2, 10, 60, 0] - log_df["flight_id"] = [0, 1, 0, 1] - log_spark = ps.from_pandas(log_df) - - flights_df = pd.DataFrame() - flights_df["id"] = [0, 1, 2, 3] - flights_df["origin"] = ["BOS", "LAX", "BOS", "LAX"] - flights_spark = ps.from_pandas(flights_df) - - pd_es = EntitySet("flights") - spark_es = EntitySet("flights_spark") - - log_ltypes = { - "scheduled_time": Datetime, - "departure_time": Datetime, - "arrival_time": Datetime, - "delay": Double, - } - pd_es.add_dataframe( - dataframe_name="logs", - dataframe=log_df, - index="id", - logical_types=log_ltypes, - semantic_tags={"flight_id": "foreign_key"}, - time_index="scheduled_time", - secondary_time_index={"arrival_time": ["departure_time", "delay"]}, - ) - - spark_es.add_dataframe( - dataframe_name="logs", - dataframe=log_spark, - index="id", - logical_types=log_ltypes, - semantic_tags={"flight_id": "foreign_key"}, - time_index="scheduled_time", - secondary_time_index={"arrival_time": ["departure_time", "delay"]}, - ) - - pd_es.add_dataframe(dataframe_name="flights", dataframe=flights_df, index="id") - flights_ltypes = pd_es["flights"].ww.logical_types - spark_es.add_dataframe( - dataframe_name="flights", - dataframe=flights_spark, - index="id", - logical_types=flights_ltypes, - ) - - pd_es.add_relationship("flights", "id", "logs", "flight_id") - 
spark_es.add_relationship("flights", "id", "logs", "flight_id") - - cutoff_df = pd.DataFrame() - cutoff_df["id"] = [0, 1, 1] - cutoff_df["time"] = pd.to_datetime(["2019-02-02", "2019-02-02", "2019-02-20"]) - - fm, _ = dfs( - entityset=pd_es, - target_dataframe_name="logs", - cutoff_time=cutoff_df, - agg_primitives=["max"], - trans_primitives=["month"], - ) - - spark_fm, _ = dfs( - entityset=spark_es, - target_dataframe_name="logs", - cutoff_time=cutoff_df, - agg_primitives=["max"], - trans_primitives=["month"], - ) - - # Make sure both matrices are sorted the same - # Also make sure index has same dtype - spark_fm = spark_fm.to_pandas().astype({"id": "int64"}) - spark_fm = spark_fm.set_index("id").sort_values("delay") - fm = fm.sort_values("delay") - - # Spark output for MONTH columns will be of string type without decimal points, - # while pandas will contain decimals - we need to convert before comparing - for column in fm.columns: - if fm[column].dtype.name == "category": - fm[column] = fm[column].astype("Int64").astype("string") - - pd.testing.assert_frame_equal( - fm, - spark_fm, - check_categorical=False, - check_dtype=False, - ) diff --git a/featuretools/tests/testing_utils/__init__.py b/featuretools/tests/testing_utils/__init__.py index d20b3220eb..11a7c80322 100644 --- a/featuretools/tests/testing_utils/__init__.py +++ b/featuretools/tests/testing_utils/__init__.py @@ -4,7 +4,7 @@ mock_cluster, get_mock_client_cluster, ) -from featuretools.tests.testing_utils.es_utils import get_df_tags, to_pandas +from featuretools.tests.testing_utils.es_utils import get_df_tags from featuretools.tests.testing_utils.features import ( feature_with_name, number_of_features_with_name_like, diff --git a/featuretools/tests/testing_utils/es_utils.py b/featuretools/tests/testing_utils/es_utils.py index 82ccdc96a2..652fd217a1 100644 --- a/featuretools/tests/testing_utils/es_utils.py +++ b/featuretools/tests/testing_utils/es_utils.py @@ -1,41 +1,3 @@ -import pandas as pd - -from featuretools.utils.gen_utils import import_or_none, is_instance - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - - -def to_pandas(df, index=None, sort_index=False, int_index=False): - """ - Testing util to convert dataframes to pandas. If a pandas dataframe is passed in, just returns the dataframe. 
- - Args: - index (str, optional): column name to set as index, defaults to None - sort_index (bool, optional): whether to sort the dataframe on the index after setting it, defaults to False - int_index (bool, optional): Converts computed dask index to Int64Index to avoid errors, defaults to False - - Returns: - Pandas DataFrame - """ - if isinstance(df, (pd.DataFrame, pd.Series)): - return df - - if is_instance(df, (dd, dd), ("DataFrame", "Series")): - pd_df = df.compute() - if is_instance(df, (ps, ps), ("DataFrame", "Series")): - pd_df = df.to_pandas() - - if index: - pd_df = pd_df.set_index(index) - if sort_index: - pd_df = pd_df.sort_index() - if int_index and is_instance(df, dd, "DataFrame"): - pd_df.index = pd.Index(pd_df.index, dtype="Int64") - - return pd_df - - def get_df_tags(df): """Gets a DataFrame's semantic tags without index or time index tags for Woodwork init""" semantic_tags = {} diff --git a/featuretools/tests/testing_utils/generate_fake_dataframe.py b/featuretools/tests/testing_utils/generate_fake_dataframe.py index 90814a7c15..5b39ecf6d7 100644 --- a/featuretools/tests/testing_utils/generate_fake_dataframe.py +++ b/featuretools/tests/testing_utils/generate_fake_dataframe.py @@ -2,7 +2,6 @@ from datetime import datetime as dt import pandas as pd -import pytest import woodwork.type_sys.type_system as ww_type_system from woodwork import logical_types @@ -41,9 +40,6 @@ def generate_fake_dataframe( n_rows=10, df_name="df", ): - dask = pytest.importorskip("dask", reason="Dask not installed, skipping") - dask.config.set({"dataframe.convert-string": False}) - def randomize(values_): random.seed(10) values = values_.copy() diff --git a/featuretools/tests/testing_utils/mock_ds.py b/featuretools/tests/testing_utils/mock_ds.py index 8f0285cc2d..ff768127d7 100644 --- a/featuretools/tests/testing_utils/mock_ds.py +++ b/featuretools/tests/testing_utils/mock_ds.py @@ -23,9 +23,6 @@ ) from featuretools.entityset import EntitySet -from featuretools.utils.gen_utils import import_or_none - -dask = import_or_none("dask") def make_ecommerce_entityset(with_integer_time_index=False): @@ -39,9 +36,6 @@ def make_ecommerce_entityset(with_integer_time_index=False): \\ / . 
L Log """ - if dask: - dask.config.set({"dataframe.convert-string": False}) - dataframes = make_ecommerce_dataframes( with_integer_time_index=with_integer_time_index, ) diff --git a/featuretools/tests/utils_tests/test_entry_point.py b/featuretools/tests/utils_tests/test_entry_point.py index a2cd3a4356..35f0c86d82 100644 --- a/featuretools/tests/utils_tests/test_entry_point.py +++ b/featuretools/tests/utils_tests/test_entry_point.py @@ -5,7 +5,7 @@ @pytest.fixture -def pd_entry_point_dfs(): +def entry_points_dfs(): cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]}) transactions_df = pd.DataFrame( { @@ -18,19 +18,6 @@ def pd_entry_point_dfs(): return cards_df, transactions_df -@pytest.fixture -def dask_entry_point_dfs(pd_entry_point_dfs): - dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping") - cards_df = dd.from_pandas(pd_entry_point_dfs[0], npartitions=2) - transactions_df = dd.from_pandas(pd_entry_point_dfs[1], npartitions=2) - return cards_df, transactions_df - - -@pytest.fixture(params=["pd_entry_point_dfs", "dask_entry_point_dfs"]) -def entry_points_dfs(request): - return request.getfixturevalue(request.param) - - class MockEntryPoint(object): def on_call(self, kwargs): self.kwargs = kwargs diff --git a/featuretools/tests/utils_tests/test_gen_utils.py b/featuretools/tests/utils_tests/test_gen_utils.py index c1a2aa50ae..a4d0813a23 100644 --- a/featuretools/tests/utils_tests/test_gen_utils.py +++ b/featuretools/tests/utils_tests/test_gen_utils.py @@ -8,11 +8,8 @@ camel_and_title_to_snake, import_or_none, import_or_raise, - is_instance, ) -dd = import_or_none("dask.dataframe") - def test_import_or_raise_errors(): with pytest.raises(ImportError, match="error message"): @@ -37,31 +34,6 @@ def df(): return pd.DataFrame({"id": range(5)}) -def test_is_instance_single_module(df): - assert is_instance(df, pd, "DataFrame") - - -@pytest.mark.skipif("not dd") -def test_is_instance_multiple_modules(df): - df2 = dd.from_pandas(df, npartitions=2) - assert is_instance(df, (dd, pd), "DataFrame") - assert is_instance(df2, (dd, pd), "DataFrame") - assert is_instance(df2["id"], (dd, pd), ("Series", "DataFrame")) - assert not is_instance(df2["id"], (dd, pd), ("DataFrame", "Series")) - - -def test_is_instance_errors_mismatch(): - msg = "Number of modules does not match number of classnames" - with pytest.raises(ValueError, match=msg): - is_instance("abc", pd, ("DataFrame", "Series")) - - -def test_is_instance_none_module(df): - assert not is_instance(df, None, "DataFrame") - assert is_instance(df, (None, pd), "DataFrame") - assert is_instance(df, (None, pd), ("Series", "DataFrame")) - - def test_list_logical_types(): ft_ltypes = list_logical_types() ww_ltypes = ww_list_logical_types() diff --git a/featuretools/utils/gen_utils.py b/featuretools/utils/gen_utils.py index 04e00f739b..87f052310c 100644 --- a/featuretools/utils/gen_utils.py +++ b/featuretools/utils/gen_utils.py @@ -2,7 +2,6 @@ import logging import re import sys -from enum import Enum from tqdm import tqdm @@ -67,42 +66,7 @@ def import_or_none(library): return None -def is_instance(obj, modules, classnames): - """ - Check if the given object is an instance of classname in module(s). Module - can be None (i.e. not installed) - - Args: - obj (obj): object to test - modules (module or tuple[module]): module to check, can be also be None (will be ignored) - classnames (str or tuple[str]): classname from module to check. If multiple values are - provided, they should match with a single module in order. 
- If a single value is provided, will be used for all modules. - Returns: - bool: True if object is an instance of classname from corresponding module, otherwise False. - Also returns False if the module is None (i.e. module is not installed) - """ - if type(modules) is not tuple: - modules = (modules,) - if type(classnames) is not tuple: - classnames = (classnames,) * len(modules) - if len(modules) != len(classnames): - raise ValueError("Number of modules does not match number of classnames") - to_check = tuple( - getattr(mod, classname, mod) - for mod, classname in zip(modules, classnames) - if mod - ) - return isinstance(obj, to_check) - - def camel_and_title_to_snake(name): name = re.sub(r"([^_\d]+)(\d+)", r"\1_\2", name) name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() - - -class Library(str, Enum): - PANDAS = "pandas" - DASK = "Dask" - SPARK = "Spark" diff --git a/featuretools/utils/spark_utils.py b/featuretools/utils/spark_utils.py deleted file mode 100644 index 7f803dd943..0000000000 --- a/featuretools/utils/spark_utils.py +++ /dev/null @@ -1,49 +0,0 @@ -import pandas as pd - - -def replace_tuple_columns(pdf): - new_df = pd.DataFrame() - for c in pdf.columns: - if isinstance(pdf[c].iloc[0], tuple): - new_df[c] = pdf[c].map(lambda x: list(x) if isinstance(x, tuple) else x) - else: - new_df[c] = pdf[c] - return new_df - - -def replace_nan_with_None(df): - new_df = pd.DataFrame() - - def replace_val(val): - if isinstance(val, (tuple, list)): - return list([None if pd.isna(x) else x for x in val]) - elif pd.isna(val): - return None - else: - return val - - for c in df.columns: - new_df[c] = df[c].apply(replace_val) - new_df[c] = new_df[c].astype(df[c].dtype) - - return new_df - - -def replace_categorical_columns(pdf): - new_df = pd.DataFrame() - for c in pdf.columns: - col = pdf[c] - if col.dtype.name == "category": - new_df[c] = col.astype("string") - else: - new_df[c] = pdf[c] - return new_df - - -def pd_to_spark_clean(pdf): - steps = [replace_tuple_columns, replace_nan_with_None, replace_categorical_columns] - intermediate_df = pdf - for f in steps: - intermediate_df = f(intermediate_df) - - return intermediate_df diff --git a/pyproject.toml b/pyproject.toml index 51a2bb362e..03869f837f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,20 +62,12 @@ test = [ "pytest-xdist >= 2.5.0", "smart-open >= 5.0.0", "urllib3 >= 1.26.18", - "pytest-timeout >= 2.1.0" + "pytest-timeout >= 2.1.0", + "featuretools[dask]" ] dask = [ - "dask[dataframe] >= 2023.2.0, <2024.3.0", - "distributed >= 2023.2.0, <2024.3.0", - "woodwork[dask] >= 0.28.0", -] -spark = [ - "woodwork[spark] >= 0.28.0", - "pyspark >= 3.5.0", - "pyarrow >= 14.0.1", -] -updater = [ - "alteryx-open-src-update-checker >= 2.1.0" + "dask[dataframe] >= 2023.2.0", + "distributed >= 2023.2.0", ] tsfresh = [ "featuretools-tsfresh-primitives >= 1.0.0", @@ -85,10 +77,17 @@ autonormalize = [ ] sql = [ "featuretools_sql >= 0.0.1", + "psycopg2-binary >= 2.9.3", ] sklearn = [ "featuretools-sklearn-transformer >= 1.0.0", ] +premium = [ + "premium-primitives >= 0.0.3", +] +nlp = [ + "nlp-primitives >= 2.12.0", +] docs = [ "ipython == 8.4.0", "jupyter == 1.0.0", @@ -103,16 +102,16 @@ docs = [ "myst-parser == 0.18.0", "autonormalize >= 2.0.1", "click >= 7.0.0", - "featuretools[dask,spark,test]", + "featuretools[dask,test]", ] dev = [ "ruff >= 0.1.6", "black[jupyter] >= 23.1.0", "pre-commit >= 2.20.0", - "featuretools[docs,dask,spark,test]", + "featuretools[docs,dask,test]", ] complete = 
[ - "featuretools[autonormalize,sklearn,dask,spark,sql,updater]", + "featuretools[premium,nlp,dask]", ] [tool.setuptools] @@ -158,8 +157,8 @@ filterwarnings = [ [tool.ruff] line-length = 88 target-version = "py311" -ignore = ["E501"] -select = [ +lint.ignore = ["E501"] +lint.select = [ # Pyflakes "F", # Pycodestyle @@ -170,10 +169,10 @@ select = [ ] src = ["featuretools"] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401", "I001", "E501"] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["featuretools"] [tool.coverage.run]
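
For reference, a pandas-only sketch of the single-table DFS round trip that the deleted Spark comparison tests above exercised. This is a hypothetical test, not part of the patch; it assumes only the EntitySet/dfs API and the Woodwork logical types (Integer, Datetime, NaturalLanguage) already used in the pandas halves of those tests.

    # Hypothetical pandas-only sanity check; mirrors the deleted Spark comparison
    # tests but skips the dtype coercion and row re-sorting they needed.
    import pandas as pd
    from woodwork.logical_types import Datetime, Integer, NaturalLanguage

    from featuretools import EntitySet, dfs


    def test_single_table_pandas_entityset():
        df = pd.DataFrame(
            {
                "id": [0, 1, 2, 3],
                "values": [1, 12, -34, 27],
                "dates": pd.to_datetime(
                    ["2019-01-10", "2019-02-03", "2019-01-01", "2017-08-25"],
                ),
                "strings": ["I am a string", "23", "abcdef ghijk", ""],
            },
        )
        es = EntitySet(id="es")
        es.add_dataframe(
            dataframe_name="data",
            dataframe=df,
            index="id",
            logical_types={
                "values": Integer,
                "dates": Datetime,
                "strings": NaturalLanguage,
            },
        )
        fm, _ = dfs(
            entityset=es,
            target_dataframe_name="data",
            trans_primitives=[
                "absolute",
                "is_weekend",
                "year",
                "day",
                "num_characters",
                "num_words",
            ],
        )
        # One feature matrix row per input row, indexed by the dataframe's index column.
        assert fm.index.name == "id"
        assert len(fm) == len(df)

With only pandas inputs, the feature matrix comes back already indexed and typed, which is presumably why the to_pandas conversion helper and the Spark/Dask dtype workarounds removed in this patch are no longer needed.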