Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor: split the original generate_features.py module #29

Merged
merged 2 commits into from
Oct 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions nlp_profiler/generate_features/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pandas as pd
import swifter # noqa

from nlp_profiler.constants import DEFAULT_PARALLEL_METHOD, SWIFTER_METHOD
from nlp_profiler.generate_features.parallelisation_methods \
import get_progress_bar, using_joblib_parallel, using_swifter


def generate_features(main_header: str,
                      high_level_features_steps: list,
                      new_dataframe: pd.DataFrame,
                      parallelisation_method: str = DEFAULT_PARALLEL_METHOD):
    """Populate ``new_dataframe`` with one derived column per feature step.

    Args:
        main_header: Label placed at the start of the progress-bar
            description.
        high_level_features_steps: Iterable of
            ``(new_column, source_column, transformation_function)`` triples.
        new_dataframe: Frame that is read from and written to in place.
        parallelisation_method: When equal to ``SWIFTER_METHOD`` the swifter
            strategy is used; any other value selects the joblib strategy.

    Returns:
        None. The results are assigned as new columns on ``new_dataframe``.
    """
    # swifter only when explicitly requested, joblib Parallel otherwise.
    apply_in_parallel = (
        using_swifter
        if parallelisation_method == SWIFTER_METHOD
        else using_joblib_parallel
    )

    steps_with_progress = get_progress_bar(high_level_features_steps)
    for new_column, source_column, transformation_function in \
            steps_with_progress:
        column_values = new_dataframe[source_column]
        steps_with_progress.set_description(
            f'{main_header}: {source_column} => {new_column}'
        )

        new_dataframe[new_column] = apply_in_parallel(
            column_values, transformation_function, new_column
        )
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,11 @@
from joblib import Memory, Parallel, delayed
from tqdm.auto import tqdm

from nlp_profiler.constants import DEFAULT_PARALLEL_METHOD, SWIFTER_METHOD

# joblib Memory cache located under the system temp directory
# (tempfile.gettempdir()); run_task uses it to cache task functions.
memory = Memory(tempfile.gettempdir(), compress=9, verbose=0)


def is_running_from_ipython():
    """Report whether the last command-line argument ends with ``'json'``.

    NOTE(review): presumably a heuristic for Jupyter/IPython kernels, which
    appear to be launched with a ``*.json`` connection file as their final
    argument -- confirm against the supported front ends.

    Returns:
        bool: ``True`` when ``sys.argv`` is non-empty and its last entry ends
        with ``'json'``; ``False`` otherwise.
    """
    # Indexing [-1] on an empty argv would raise IndexError, so an empty
    # list is treated as "not running from IPython".
    return bool(sys.argv) and sys.argv[-1].endswith('json')


# Width handed to tqdm as `ncols` by get_progress_bar: 900 when
# is_running_from_ipython() is true, otherwise None.
PROGRESS_BAR_WIDTH = 900 if is_running_from_ipython() else None
Expand All @@ -24,39 +21,14 @@ def get_progress_bar(values: list) -> tqdm:
return cached_tqdm(values, ncols=PROGRESS_BAR_WIDTH)


def generate_features(main_header: str,
                      high_level_features_steps: list,
                      new_dataframe: pd.DataFrame,
                      parallelisation_method: str = DEFAULT_PARALLEL_METHOD):
    """Derive new columns on ``new_dataframe``, one per feature step.

    Args:
        main_header: Prefix shown in the progress-bar description.
        high_level_features_steps: Iterable of
            ``(new_column, source_column, transformation_function)`` triples.
        new_dataframe: Frame whose source columns are read and onto which
            the new columns are assigned (modified in place).
        parallelisation_method: ``SWIFTER_METHOD`` selects ``using_swifter``;
            every other value selects ``using_joblib_parallel``.

    Returns:
        None.
    """
    # Using swifter or Using joblib Parallel and delay method:
    if parallelisation_method == SWIFTER_METHOD:
        run_in_parallel = using_swifter
    else:
        run_in_parallel = using_joblib_parallel

    feature_steps = get_progress_bar(high_level_features_steps)
    for new_column, source_column, transformation_function in feature_steps:
        values_to_transform = new_dataframe[source_column]
        description = f'{main_header}: {source_column} => {new_column}'
        feature_steps.set_description(description)

        new_dataframe[new_column] = run_in_parallel(
            values_to_transform, transformation_function,
            source_column, new_column
        )


def run_task(task_function, value: str):  # pragma: no cover
    """Call ``task_function(value)`` through the module-level joblib cache.

    Args:
        task_function: Callable to wrap with ``memory.cache``.
        value: Single argument forwarded to the cached callable.

    Returns:
        Whatever the cached ``task_function`` returns for ``value``.
    """
    # pragma: no cover => multiprocessing leads to loss of test coverage info
    return memory.cache(task_function)(value)


def using_joblib_parallel(
source_field, apply_function,
source_column: str, new_column: str,
source_field, apply_function, new_column: str,
) -> pd.DataFrame:
source_values_to_transform = get_progress_bar(source_field.values)
source_values_to_transform.set_description(new_column)
Expand All @@ -71,8 +43,7 @@ def using_joblib_parallel(


def using_swifter(
source_field, apply_function,
source_column: str = None, new_column: str = None
source_field, apply_function, new_column: str = None
) -> pd.DataFrame:
return source_field \
.swifter \
Expand Down