Skip to content

Commit

Permalink
feat(updating-auto-fs-creation): add docstring and auto-infer by df
Browse files Browse the repository at this point in the history
  • Loading branch information
albjoaov committed Mar 8, 2024
1 parent d7701d1 commit 2293aaa
Showing 1 changed file with 31 additions and 3 deletions.
34 changes: 31 additions & 3 deletions butterfree/automated/feature_set_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ def _get_tables_with_regex(self, sql_query: str) -> Tuple[List[Table], str]:
return tables, modified_sql_query

def get_readers(self, sql_query: str) -> str:
"""
Extracts table readers from a SQL query and formats them as a string.
Args:
sql_query (str): The SQL query from which to extract table readers.
Returns:
str: A formatted string containing the table readers.
"""
tables, modified_sql_query = self._get_tables_with_regex(sql_query.lower())
readers = []
for table in tables:
Expand All @@ -122,6 +131,7 @@ def get_readers(self, sql_query: str) -> str:
),
"""
readers.append(table_reader_string)

final_string = """
source=Source(
readers=[
Expand All @@ -139,16 +149,34 @@ def get_readers(self, sql_query: str) -> str:

return final_string

def get_features(self, sql_query: str, df: Optional[DataFrame]) -> str:
def get_features(self, sql_query: str, df: Optional[DataFrame] = None) -> str:
"""
Extract features from a SQL query and return them formatted as a string.
Args:
sql_query (str): The SQL query used to extract features.
df (Optional[DataFrame], optional): Optional DataFrame used to infer data types. Defaults to None.
Returns:
str: A formatted string containing the extracted features.
        This should be used on Databricks, especially if you want automatic
        type inference without passing a reference dataframe. The utility will
        only work in an environment where a Spark session is available.
"""

features = self._get_features_with_regex(sql_query)
features_formatted = []
for feature in features:
description = feature.replace("__", " ").replace("_", " ").capitalize()

data_type = "."

if df and isinstance(df, DataFrame):
data_type = self._get_data_type(feature, df)
if df is None:
df = spark.sql(sql_query)

data_type = self._get_data_type(feature, df)

feature_string = f"""
Feature(
Expand Down

0 comments on commit 2293aaa

Please sign in to comment.