Less strict requirements (#333)
* bump a few requirements; increase the lower bound of the h3 version range; add pyarrow as a dev dependency

* fix type repr for Spark types; fix broken tests (PySpark 3.4)

---------

Co-authored-by: Ralph Rassweiler <[email protected]>
lecardozo and ralphrass authored Aug 16, 2023
1 parent 3a73ed8 commit 6b78a50
Showing 8 changed files with 29 additions and 26 deletions.
6 changes: 3 additions & 3 deletions Makefile
@@ -9,8 +9,8 @@ VERSION := $(shell grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d
 .PHONY: environment
 ## create virtual environment for butterfree
 environment:
-	@pyenv install -s 3.7.6
-	@pyenv virtualenv 3.7.6 butterfree
+	@pyenv install -s 3.7.13
+	@pyenv virtualenv 3.7.13 butterfree
 	@pyenv local butterfree
 	@PYTHONPATH=. python -m pip install --upgrade pip

@@ -221,4 +221,4 @@ help:
 	} \
 	printf "\n"; \
 	}' \
-	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
\ No newline at end of file
+	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
2 changes: 1 addition & 1 deletion butterfree/configs/db/cassandra_config.py
@@ -246,7 +246,7 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         cassandra_schema.append(
             {
                 "column_name": features["column_name"],
-                "type": cassandra_mapping[str(features["type"])],
+                "type": cassandra_mapping[str(features["type"]).replace("()", "")],
                 "primary_key": features["primary_key"],
             }
         )
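For context on the one-line change above (and its twin edits in butterfree/reports/metadata.py below): PySpark 3.4 changed the string form of data type instances to an eval-able spelling, so str(LongType()) now yields "LongType()" where earlier versions produced "LongType", which broke lookups into the Cassandra type mapping. A minimal sketch of the normalization; the two-entry mapping is illustrative, not butterfree's actual cassandra_mapping:

from pyspark.sql.types import LongType, StringType

# Illustrative subset of a Spark-to-Cassandra type mapping, keyed on the
# parenthesis-free spelling used before PySpark 3.4.
cassandra_mapping = {"LongType": "bigint", "StringType": "text"}

for spark_type in (LongType(), StringType()):
    # str(LongType()) is "LongType()" on PySpark 3.4 and "LongType" on older
    # versions; stripping "()" gives one canonical key for both.
    key = str(spark_type).replace("()", "")
    print(key, "->", cassandra_mapping[key])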
4 changes: 2 additions & 2 deletions butterfree/reports/metadata.py
@@ -162,7 +162,7 @@ def to_json(self) -> Any:
             "features": [
                 {
                     "column_name": c["column_name"],
-                    "data_type": str(c["type"]),
+                    "data_type": str(c["type"]).replace("()", ""),
                     "description": desc,
                 }
                 for c, desc in params._features
@@ -208,7 +208,7 @@ def to_markdown(self) -> Any:

         features = ["Column name", "Data type", "Description"]
         for c, desc in params._features:
-            features.extend([c["column_name"], str(c["type"]), desc])
+            features.extend([c["column_name"], str(c["type"]).replace("()", ""), desc])

         count_rows = len(features) // 3

8 changes: 3 additions & 5 deletions requirements.dev.txt
@@ -1,11 +1,9 @@
-cmake==3.18.4
-h3==3.7.0
-pyarrow==0.15.1
+h3==3.7.4
 jupyter==1.0.0
 twine==3.1.1
 mypy==0.790
-pyspark-stubs==3.0.0
 sphinx==3.5.4
 sphinxemoji==0.1.8
 sphinx-rtd-theme==0.5.2
-recommonmark==0.7.1
\ No newline at end of file
+recommonmark==0.7.1
+pyarrow>=1.0.0
7 changes: 3 additions & 4 deletions requirements.txt
@@ -1,9 +1,8 @@
 cassandra-driver>=3.22.0,<4.0
 mdutils>=1.2.2,<2.0
-pandas>=0.24,<1.1
+pandas>=0.24,<2.0
 parameters-validation>=1.1.5,<2.0
 pyspark==3.*
 typer>=0.3,<0.4
-setuptools>=41,<42
-typing-extensions==3.7.4.3
-boto3==1.17.*
\ No newline at end of file
+typing-extensions>3.7.4,<5
+boto3==1.17.*
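These relaxed ranges can be spot-checked against an installed environment with importlib.metadata and packaging; a quick sketch (packaging is assumed available, so install it explicitly if it is not):

from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# Mirror a few of the pins above and report whether the installed versions
# fall inside the allowed ranges.
requirements = {
    "pandas": SpecifierSet(">=0.24,<2.0"),
    "typing-extensions": SpecifierSet(">3.7.4,<5"),
    "cassandra-driver": SpecifierSet(">=3.22.0,<4.0"),
}

for name, spec in requirements.items():
    installed = version(name)
    status = "ok" if installed in spec else "outside range"
    print(f"{name} {installed}: {status}")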
1 change: 1 addition & 0 deletions setup.cfg
@@ -24,6 +24,7 @@ spark_options =
     spark.sql.session.timeZone: UTC
     spark.driver.bindAddress: 127.0.0.1
     spark.sql.legacy.timeParserPolicy: LEGACY
+    spark.sql.legacy.createHiveTableByDefault: false

 [mypy]
 # suppress errors about unsatisfied imports
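For context on the added option: with spark.sql.legacy.createHiveTableByDefault set to false, CREATE TABLE without a USING clause creates a native data source table (using spark.sql.sources.default, typically parquet) rather than a Hive SerDe table; presumably the integration tests hit the Hive path under PySpark 3.4 without it. A small sketch of the same settings applied to a standalone local session, using only the standard SparkSession builder:

from pyspark.sql import SparkSession

# Mirror the pytest-spark options above in a plain local session.
spark = (
    SparkSession.builder.master("local[1]")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.legacy.createHiveTableByDefault", "false")
    .getOrCreate()
)

# With the legacy flag off, a bare CREATE TABLE produces a native data source
# table instead of attempting a Hive SerDe table.
spark.sql("CREATE TABLE example_table (id INT, name STRING)")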
2 changes: 1 addition & 1 deletion setup.py
@@ -34,7 +34,7 @@
     license="Copyright",
     author="QuintoAndar",
     install_requires=requirements,
-    extras_require={"h3": ["cmake==3.16.3", "h3==3.4.2"]},
+    extras_require={"h3": ["h3>=3.7.4,<4"]},
     python_requires=">=3.7, <4",
     entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]},
     include_package_data=True,
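With the relaxed extra, pip install "butterfree[h3]" now accepts any h3 release from 3.7.4 up to (but not including) 4.0, and no longer pins cmake; presumably the cmake pin became unnecessary once h3 versions in that range shipped prebuilt wheels.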
25 changes: 15 additions & 10 deletions tests/integration/butterfree/pipelines/test_feature_set_pipeline.py
@@ -77,9 +77,11 @@ def test_feature_set_pipeline(
         self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe,
     ):
         # arrange
+
         table_reader_id = "a_source"
         table_reader_table = "table"
         table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")
+
         create_temp_view(dataframe=mocked_df, name=table_reader_id)
         create_db_and_table(
             spark=spark_session,
@@ -88,14 +90,16 @@ def test_feature_set_pipeline(
             table_reader_table=table_reader_table,
         )

-        dbconfig = Mock()
-        dbconfig.mode = "overwrite"
-        dbconfig.format_ = "parquet"
+        path = "test_folder/historical/entity/feature_set"
+
+        dbconfig = MetastoreConfig()
         dbconfig.get_options = Mock(
-            return_value={"path": "test_folder/historical/entity/feature_set"}
+            return_value={"mode": "overwrite", "format_": "parquet", "path": path}
         )

-        historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig)
+        historical_writer = HistoricalFeatureStoreWriter(
+            db_config=dbconfig, debug_mode=True
+        )

         # act
         test_pipeline = FeatureSetPipeline(
@@ -151,9 +155,13 @@ def test_feature_set_pipeline(
         )
         test_pipeline.run()

+        # act and assert
+        dbconfig.get_path_with_partitions = Mock(
+            return_value=["historical/entity/feature_set"]
+        )
+
         # assert
-        path = dbconfig.get_options("historical/entity/feature_set").get("path")
-        df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)
+        df = spark_session.sql("select * from historical_feature_store__feature_set")

         target_df = fixed_windows_output_feature_set_dataframe.orderBy(
             test_pipeline.feature_set.timestamp_column
@@ -162,9 +170,6 @@ def test_feature_set_pipeline(
         # assert
         assert_dataframe_equality(df, target_df)

-        # tear down
-        shutil.rmtree("test_folder")
-
     def test_feature_set_pipeline_with_dates(
         self,
         mocked_date_df,
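The tear-down block can go because, with debug_mode=True, the writer no longer produces files under test_folder/; the pipeline output lands in the Spark temporary view the assertion queries (historical_feature_store__feature_set). A stand-in sketch of that mechanism, with a hand-built DataFrame in place of the real pipeline output:

from pyspark.sql import SparkSession

spark_session = SparkSession.builder.master("local[1]").getOrCreate()

# What a debug-mode write boils down to: register the output DataFrame as a
# temp view instead of writing parquet, so the test reads it back with SQL
# and leaves nothing on disk to clean up.
output_df = spark_session.createDataFrame([(1, "a")], ["id", "feature"])
output_df.createOrReplaceTempView("historical_feature_store__feature_set")

df = spark_session.sql("select * from historical_feature_store__feature_set")
df.show()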

1 comment on commit 6b78a50

@chip-n-dale (bot) commented on 6b78a50 Aug 16, 2023


Hi @ralphrass!

The GitLeaks SecTool reported some possibly exposed credentials/secrets; how about giving them a look?

GitLeaks Alert Sync
[
  {
    "line": "    webhook: REDACTED",
    "lineNumber": 141,
    "offender": "REDACTED",
    "offenderEntropy": -1,
    "commit": "b6a5daf28abc035f74b9685aab573d384680b9d1",
    "repo": "butterfree",
    "repoURL": "",
    "leakURL": "",
    "rule": "Slack Webhook",
    "commitMessage": "initial commit\n",
    "author": "Alvaro",
    "email": "[email protected]",
    "file": ".drone.yml",
    "date": "2020-01-03T14:21:51-03:00",
    "tags": "key, slack"
  },
  {
    "line": "    webhook: REDACTED",
    "lineNumber": 159,
    "offender": "REDACTED",
    "offenderEntropy": -1,
    "commit": "b6697aa708fec0c5a9e3af0b2713cee6f45ff675",
    "repo": "butterfree",
    "repoURL": "",
    "leakURL": "",
    "rule": "Slack Webhook",
    "commitMessage": "hail to the butterfree\n",
    "author": "Alvaro",
    "email": "[email protected]",
    "file": ".drone.yml",
    "date": "2020-01-03T11:07:44-03:00",
    "tags": "key, slack"
  }
]

In case of false positives, more information is available in the GitLeaks FAQ.
If you have any other problem or question during this process, be sure to contact us in the Security space on GChat!
