Release/1.3.0 (#358)
* feat(mlop-2269): bump versions (#355)

* fix: bump versions adjust tests

* add checklist

* chore: bump python

* bump pyspark

* chore: java version all steps modified

* fix: sphinx version (#356)
ralphrass authored Jun 5, 2024
1 parent 788ea75 commit 99662f6
Showing 63 changed files with 579 additions and 236 deletions.
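The headline change across this diff is the CI toolchain bump: the workflows drop the old Python 3.7 container in favor of Python 3.9, Java 11 (Microsoft distribution), and Spark 3.5.1 with Hadoop 3. A minimal sketch for checking that a local environment matches the new CI matrix; the script is illustrative and not part of the commit:

```python
import subprocess
import sys

import pyspark  # assumes pyspark is already installed (CI installs it via `make ci-install`)

# Python: the workflows now pin 3.9.
assert sys.version_info[:2] == (3, 9), f"expected Python 3.9, got {sys.version_info[:2]}"

# Spark: the workflows pin 3.5.1.
assert pyspark.__version__.startswith("3.5"), f"expected Spark 3.5.x, got {pyspark.__version__}"

# Java: CI installs Java 11; note that `java -version` writes to stderr.
java_banner = subprocess.run(["java", "-version"], capture_output=True, text=True).stderr
print(java_banner.splitlines()[0])
```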
30 changes: 30 additions & 0 deletions .checklist.yaml
@@ -0,0 +1,30 @@
apiVersion: quintoandar.com.br/checklist/v2
kind: ServiceChecklist
metadata:
  name: butterfree
spec:
  description: >-
    A solution for Feature Stores.
  costCenter: C055
  department: engineering
  lifecycle: production
  docs: true

  ownership:
    team: data_products_mlops
    line: tech_platform
    owner: [email protected]

  libraries:
    - name: butterfree
      type: common-usage
      path: https://quintoandar.github.io/python-package-server/
      description: A lib to build Feature Stores.
      registries:
        - github-packages
      tier: T0

  channels:
    squad: 'mlops'
    alerts: 'data-products-reports'
13 changes: 13 additions & 0 deletions .github/workflows/publish.yml
@@ -14,6 +14,19 @@ jobs:

 steps:
   - uses: actions/checkout@v2
+  - uses: actions/setup-python@v5
+    with:
+      python-version: '3.9'
+
+  - uses: actions/setup-java@v4
+    with:
+      java-version: '11'
+      distribution: microsoft
+
+  - uses: vemonet/setup-spark@v1
+    with:
+      spark-version: '3.5.1'
+      hadoop-version: '3'
+
   - name: Install dependencies
     run: make ci-install
17 changes: 17 additions & 0 deletions .github/workflows/skip_lint.yml
@@ -0,0 +1,17 @@
# This job exists only because we want to mark the runner-linter check as required
# for PRs to develop, but not for the merge queue that merges into develop;
# GitHub does not offer this distinction yet.

name: 'Skip github-actions/runner-linter check at merge queue'

on:
  merge_group:

jobs:
  empty_job:
    name: 'github-actions/runner-linter'
    runs-on: github-actions-developers-runner
    steps:
      - name: Skip github-actions/runner-linter check at merge queue
        run: |
          echo "Done"
16 changes: 14 additions & 2 deletions .github/workflows/staging.yml
@@ -8,11 +8,23 @@ jobs:
 Pipeline:
   if: github.ref == 'refs/heads/staging'

-  runs-on: ubuntu-22.04
-  container: quintoandar/python-3-7-java
+  runs-on: ubuntu-latest

   steps:
     - uses: actions/checkout@v2
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.9'
+
+    - uses: actions/setup-java@v4
+      with:
+        java-version: '11'
+        distribution: microsoft
+
+    - uses: vemonet/setup-spark@v1
+      with:
+        spark-version: '3.5.1'
+        hadoop-version: '3'
+
     - name: Install dependencies
       run: make ci-install
16 changes: 14 additions & 2 deletions .github/workflows/test.yml
@@ -9,11 +9,23 @@ on:

 jobs:
   Pipeline:
-    runs-on: ubuntu-22.04
-    container: quintoandar/python-3-7-java
+    runs-on: ubuntu-latest

     steps:
       - uses: actions/checkout@v2
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.9'
+
+      - uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: microsoft
+
+      - uses: vemonet/setup-spark@v1
+        with:
+          spark-version: '3.5.1'
+          hadoop-version: '3'
+
       - name: Install dependencies
         run: make ci-install
1 change: 1 addition & 0 deletions .gitignore
@@ -66,6 +66,7 @@ instance/

 # PyBuilder
 target/
+pip/

 # Jupyter Notebook
 .ipynb_checkpoints
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,10 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each

 ## [Unreleased]

+## [1.3.0](https://github.com/quintoandar/butterfree/releases/tag/1.3.0)
+* Bump versions ([#355](https://github.com/quintoandar/butterfree/pull/355))
+* Sphinx version ([#356](https://github.com/quintoandar/butterfree/pull/356))
+
 ## [1.2.4](https://github.com/quintoandar/butterfree/releases/tag/1.2.4)
 * Auto create feature sets ([#351](https://github.com/quintoandar/butterfree/pull/351))

8 changes: 4 additions & 4 deletions Makefile
@@ -76,7 +76,7 @@ style-check:
@echo "Code Style"
@echo "=========="
@echo ""
-	@python -m black --check -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . && echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1)
+	@python -m black --check -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . && echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1)

.PHONY: quality-check
## run code quality checks with flake8
@@ -104,8 +104,8 @@ checks: style-check quality-check type-check
.PHONY: apply-style
## fix stylistic errors with black
apply-style:
-	@python -m black -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" .
-	@python -m isort -rc --atomic butterfree/ tests/
+	@python -m black -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" .
+	@python -m isort --atomic butterfree/ tests/

.PHONY: clean
## clean unused artifacts
@@ -152,7 +152,7 @@ package:
## update Butterfree API docs
update-docs:
cd ./docs; rm -rf source/butterfree.*
-	cd ./docs; sphinx-apidoc -T -E -o source/ ../butterfree
+	cd ./docs; sphinx-apidoc -o source/ ../butterfree
cd ./docs; make coverage

.PHONY: docs
12 changes: 8 additions & 4 deletions butterfree/_cli/migrate.py
@@ -46,13 +46,13 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]:
logger.error(f"Path: {path} not found!")
return set()

logger.info(f"Importing modules...")
logger.info("Importing modules...")
package = ".".join(path.strip("/").split("/"))
imported = set(
importlib.import_module(f".{name}", package=package) for name in modules
)

logger.info(f"Scanning modules...")
logger.info("Scanning modules...")
content = {
module: set(
filter(
@@ -93,7 +93,8 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]:


PATH = typer.Argument(
-    ..., help="Full or relative path to where feature set pipelines are being defined.",
+    ...,
+    help="Full or relative path to where feature set pipelines are being defined.",
)

GENERATE_LOGS = typer.Option(
@@ -113,7 +114,10 @@ class Migrate:
pipelines: list of Feature Set Pipelines to use to migration.
"""

-    def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None:
+    def __init__(
+        self,
+        pipelines: Set[FeatureSetPipeline],
+    ) -> None:
self.pipelines = pipelines

def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None:
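`PATH` above follows typer's declarative style for CLI parameters. A minimal, self-contained sketch of the same pattern; the command below and the `GENERATE_LOGS` default and help text are illustrative assumptions, not butterfree's actual CLI surface:

```python
import typer

app = typer.Typer()

PATH = typer.Argument(
    ...,
    help="Full or relative path to where feature set pipelines are being defined.",
)
# The real GENERATE_LOGS definition is truncated in the hunk above;
# this default and help text are assumptions.
GENERATE_LOGS = typer.Option(False, help="Whether to upload migration logs.")


@app.command()
def migrate(path: str = PATH, generate_logs: bool = GENERATE_LOGS) -> None:
    """Hypothetical command showing how the declared parameters are consumed."""
    typer.echo(f"Scanning {path} (generate_logs={generate_logs})")


if __name__ == "__main__":
    app()
```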
4 changes: 3 additions & 1 deletion butterfree/clients/cassandra_client.py
@@ -129,7 +129,9 @@ def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]:
return response

def _get_create_table_query(
-        self, columns: List[CassandraColumn], table: str,
+        self,
+        columns: List[CassandraColumn],
+        table: str,
) -> str:
"""Creates CQL statement to create a table."""
parsed_columns = []
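`_get_create_table_query` assembles a CQL CREATE TABLE statement from parsed columns. A standalone sketch of the general idea; the helper below is hypothetical and simpler than butterfree's actual query builder:

```python
from typing import Dict


def create_table_cql(table: str, columns: Dict[str, str], primary_key: str = "id") -> str:
    """Build a CREATE TABLE statement; columns maps column name -> CQL type."""
    cols = ", ".join(f"{name} {ctype}" for name, ctype in columns.items())
    return f"CREATE TABLE IF NOT EXISTS {table} ({cols}, PRIMARY KEY ({primary_key}));"


print(create_table_cql("feature_set", {"id": "int", "timestamp": "timestamp", "feature": "float"}))
```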
6 changes: 3 additions & 3 deletions butterfree/clients/spark_client.py
@@ -61,9 +61,9 @@ def read(
if path and not isinstance(path, (str, list)):
raise ValueError("path needs to be a string or a list of string")

-        df_reader: Union[
-            DataStreamReader, DataFrameReader
-        ] = self.conn.readStream if stream else self.conn.read
+        df_reader: Union[DataStreamReader, DataFrameReader] = (
+            self.conn.readStream if stream else self.conn.read
+        )

df_reader = df_reader.schema(schema) if schema else df_reader

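The change above is formatting only (black's new line-split style), but the underlying pattern is worth noting: the client picks a streaming or batch reader off the same Spark session. A plain-PySpark sketch of that pattern, outside butterfree's wrapper:

```python
from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()


def load(path: str, fmt: str = "parquet", stream: bool = False) -> DataFrame:
    # spark.readStream yields a DataStreamReader, spark.read a DataFrameReader;
    # both expose the same format(...).load(...) chain used here.
    reader = spark.readStream if stream else spark.read
    return reader.format(fmt).load(path)
```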
5 changes: 4 additions & 1 deletion butterfree/extract/source.py
@@ -58,7 +58,10 @@ class Source(HookableComponent):
"""

def __init__(
-        self, readers: List[Reader], query: str, eager_evaluation: bool = True,
+        self,
+        readers: List[Reader],
+        query: str,
+        eager_evaluation: bool = True,
) -> None:
super().__init__()
self.enable_pre_hooks = False
5 changes: 4 additions & 1 deletion butterfree/load/writers/historical_feature_store_writer.py
@@ -130,7 +130,10 @@ def __init__(
self.check_schema_hook = check_schema_hook

def write(
-        self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient,
+        self,
+        feature_set: FeatureSet,
+        dataframe: DataFrame,
+        spark_client: SparkClient,
) -> None:
"""Loads the data from a feature set into the Historical Feature Store.
10 changes: 8 additions & 2 deletions butterfree/load/writers/online_feature_store_writer.py
@@ -116,7 +116,10 @@ def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame:

window = Window.partitionBy(*id_columns).orderBy(col(TIMESTAMP_COLUMN).desc())
return (
-        dataframe.select(col("*"), row_number().over(window).alias("rn"),)
+        dataframe.select(
+            col("*"),
+            row_number().over(window).alias("rn"),
+        )
.filter(col("rn") == 1)
.drop("rn")
)
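Again the edit is cosmetic; the logic is the classic latest-row-per-key dedup with a window function. A runnable PySpark illustration of the same pattern, with made-up column names:

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [(1, "2024-01-01", 10.0), (1, "2024-02-01", 12.0), (2, "2024-01-15", 7.0)],
    ["id", "timestamp", "feature"],
)

# Rank rows within each id by timestamp, newest first, and keep only rank 1.
window = Window.partitionBy("id").orderBy(col("timestamp").desc())
latest = (
    df.select(col("*"), row_number().over(window).alias("rn"))
    .filter(col("rn") == 1)
    .drop("rn")
)
latest.show()  # one row per id: (1, 2024-02-01, 12.0) and (2, 2024-01-15, 7.0)
```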
@@ -162,7 +165,10 @@ def _write_in_debug_mode(
)

def write(
-        self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient,
+        self,
+        feature_set: FeatureSet,
+        dataframe: DataFrame,
+        spark_client: SparkClient,
) -> Union[StreamingQuery, None]:
"""Loads the latest data from a feature set into the Feature Store.
5 changes: 4 additions & 1 deletion butterfree/load/writers/writer.py
@@ -72,7 +72,10 @@ def _apply_transformations(self, df: DataFrame) -> DataFrame:

@abstractmethod
def write(
-        self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient,
+        self,
+        feature_set: FeatureSet,
+        dataframe: DataFrame,
+        spark_client: SparkClient,
) -> Any:
"""Loads the data from a feature set into the Feature Store.
@@ -180,7 +180,8 @@ def create_query(

@staticmethod
def _get_diff(
-        fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]],
+        fs_schema: List[Dict[str, Any]],
+        db_schema: List[Dict[str, Any]],
) -> Set[Diff]:
"""Gets schema difference between feature set and the table of a given db.
@@ -296,7 +297,7 @@ def apply_migration(
logger.info(f"Applying this query: {q} ...")
self._client.sql(q)

logger.info(f"Feature Set migration finished successfully.")
logger.info("Feature Set migration finished successfully.")

# inform in drone console which feature set was migrated
print(f"The {feature_set.name} feature set was migrated.")
@@ -30,7 +30,10 @@ class MetastoreMigration(DatabaseMigration):
data is being loaded into an entity table, then users can drop columns manually.
"""

-    def __init__(self, database: str = None,) -> None:
+    def __init__(
+        self,
+        database: str = None,
+    ) -> None:
self._db_config = MetastoreConfig()
self.database = database or environment.get_variable(
"FEATURE_STORE_HISTORICAL_DATABASE"
4 changes: 3 additions & 1 deletion butterfree/transform/aggregated_feature_set.py
@@ -412,7 +412,9 @@ def _aggregate(
# repartition to have all rows for each group at the same partition
# by doing that, we won't have to shuffle data on grouping by id
dataframe = repartition_df(
-            dataframe, partition_by=groupby, num_processors=num_processors,
+            dataframe,
+            partition_by=groupby,
+            num_processors=num_processors,
)
grouped_data = dataframe.groupby(*groupby)

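As the retained comment explains, repartitioning by the grouping keys co-locates each group's rows so the following `groupby` avoids an extra shuffle. In plain PySpark, butterfree's `repartition_df` helper boils down to something like this sketch (not the helper's actual implementation):

```python
from typing import List

from pyspark.sql import DataFrame


def repartition_by_keys(df: DataFrame, keys: List[str], num_partitions: int) -> DataFrame:
    # Put all rows sharing the same key values into the same partition,
    # so the groupby that follows does not trigger another shuffle.
    return df.repartition(num_partitions, *keys)


# grouped_data = repartition_by_keys(dataframe, ["id"], 200).groupby("id")
```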
6 changes: 5 additions & 1 deletion butterfree/transform/transformations/aggregated_transform.py
@@ -76,7 +76,11 @@ def aggregations(self) -> List[Tuple]:
Function = namedtuple("Function", ["function", "data_type"])

return [
-            Function(f.func(expression), f.data_type.spark,) for f in self.functions
+            Function(
+                f.func(expression),
+                f.data_type.spark,
+            )
+            for f in self.functions
]

def _get_output_name(self, function: object) -> str:
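Each `Function` pairs a Spark aggregation expression with its output type. A self-contained sketch of the idea using stock PySpark functions; the column name and the particular aggregations are illustrative:

```python
from collections import namedtuple

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

Function = namedtuple("Function", ["function", "data_type"])

aggregations = [
    Function(F.avg("feature"), DoubleType()),
    Function(F.stddev_pop("feature"), DoubleType()),
]

# A grouped DataFrame can then apply them in a single pass:
# grouped_data.agg(*[agg.function for agg in aggregations])
```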
4 changes: 3 additions & 1 deletion butterfree/transform/transformations/custom_transform.py
@@ -89,6 +89,8 @@ def transform(self, dataframe: DataFrame) -> DataFrame:
"""
dataframe = self.transformer(
-            dataframe, self.parent, **self.transformer__kwargs,
+            dataframe,
+            self.parent,
+            **self.transformer__kwargs,
)
return dataframe
5 changes: 4 additions & 1 deletion butterfree/transform/transformations/h3_transform.py
@@ -84,7 +84,10 @@ class H3HashTransform(TransformComponent):
"""

def __init__(
-        self, h3_resolutions: List[int], lat_column: str, lng_column: str,
+        self,
+        h3_resolutions: List[int],
+        lat_column: str,
+        lng_column: str,
):
super().__init__()
self.h3_resolutions = h3_resolutions
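For reference, the H3 hashing this transform wraps maps a latitude/longitude pair to a hexagonal cell index at each requested resolution. With the h3 3.x API (docs/requirements.txt pins h3==3.7.0), that looks like the following; the coordinates are arbitrary:

```python
import h3  # h3 3.x API; h3 4.x renamed geo_to_h3 to latlng_to_cell

lat, lng = -23.5505, -46.6333  # arbitrary coordinates (São Paulo)
for resolution in (6, 7, 8):
    # Higher resolutions produce smaller hexagonal cells.
    print(resolution, h3.geo_to_h3(lat, lng, resolution))
```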
@@ -54,7 +54,8 @@ class SQLExpressionTransform(TransformComponent):
"""

def __init__(
-        self, expression: str,
+        self,
+        expression: str,
):
super().__init__()
self.expression = expression
3 changes: 1 addition & 2 deletions docs/requirements.txt
@@ -4,5 +4,4 @@ sphinxemoji==0.1.6
typing-extensions==3.7.4.2
cmake==3.18.4
h3==3.7.0
-pyarrow==0.15.1
+pyarrow==16.1.0
4 changes: 2 additions & 2 deletions examples/test_examples.py
@@ -36,9 +36,9 @@
_, error = p.communicate()
if p.returncode != 0:
errors.append({"notebook": path, "error": error})
print(f" >>> Error in execution!\n")
print(" >>> Error in execution!\n")
else:
print(f" >>> Successful execution\n")
print(" >>> Successful execution\n")

if errors:
print(">>> Errors in the following notebooks:")
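The surrounding loop shells out once per example notebook and collects failures. A self-contained sketch of that pattern; the hunk does not show the actual command, so the `jupyter nbconvert` invocation and the notebook path below are assumptions:

```python
import subprocess

errors = []
for path in ["examples/simple_feature_set.ipynb"]:  # hypothetical notebook list
    p = subprocess.Popen(
        ["jupyter", "nbconvert", "--to", "notebook", "--execute", path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    _, error = p.communicate()
    if p.returncode != 0:
        errors.append({"notebook": path, "error": error})
        print(" >>> Error in execution!\n")
    else:
        print(" >>> Successful execution\n")
```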