From 592e08774c67cdd732028fd66e6661964f0ba644 Mon Sep 17 00:00:00 2001
From: bskrlj
Date: Fri, 5 Jan 2024 11:23:30 +0100
Subject: [PATCH 1/4] version

---
 outrank/__main__.py     | 15 ++++---
 outrank/core_ranking.py | 98 +++++++++++++++++++++++++----------------
 outrank/task_ranking.py |  6 +--
 setup.py                |  2 +-
 4 files changed, 72 insertions(+), 49 deletions(-)

diff --git a/outrank/__main__.py b/outrank/__main__.py
index 33abfeb..d9848f9 100644
--- a/outrank/__main__.py
+++ b/outrank/__main__.py
@@ -183,13 +183,6 @@ def main():
         help="Which ';'-separated features should be one-hot encoded into n new features (coverage analysis)",
     )

-    parser.add_argument(
-        '--silent',
-        type=str,
-        default='False',
-        help='Suppress the logo and tips.',
-    )
-
     parser.add_argument(
         '--subfeature_mapping',
         type=str,
@@ -225,6 +218,14 @@ def main():
         help='Relevant for task data_generator -- name of the folder that contains generated data.',
     )

+    parser.add_argument(
+        '--verbosity_level',
+        type=int,
+        default=1,
+        help='Verbosity level: 0 (quiet) or 1 (show progress).',
+    )
+
+
     args = parser.parse_args()

     if args.task == 'selftest':
diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
index 31cc17e..6a8fac4 100644
--- a/outrank/core_ranking.py
+++ b/outrank/core_ranking.py
@@ -105,7 +105,8 @@ def mixed_rank_graph(
     out_time_struct = {}

     # Handle cont. types prior to interaction evaluation
-    pbar.set_description('Encoding columns')
+    if args.verbosity_level > 0:
+        pbar.set_description('Encoding columns')
     start_enc_timer = timer()
     tmp_df = pd.DataFrame({k : tmp_df[k].cat.codes for k in all_columns})

@@ -126,7 +127,8 @@ def mixed_rank_graph(
         return BatchRankingSummary(final_constant_imp, out_time_struct)

     # Map the scoring calls to the worker pool
-    pbar.set_description('Allocating thread pool')
+    if args.verbosity_level > 0:
+        pbar.set_description('Allocating thread pool')

     # starmap is an alternative that is slower unfortunately (but nicer)
     def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
@@ -134,7 +136,8 @@ def get_grounded_importances_estimate(combination: tuple[str]) -> Any:

     start_enc_timer = timer()
     with cpu_pool as p:
-        pbar.set_description(f'Computing (#ftr={len(combinations)})')
+        if args.verbosity_level > 0:
+            pbar.set_description(f'Computing (#ftr={len(combinations)})')
         results = p.amap(get_grounded_importances_estimate, combinations)
         while not results.ready():
             time.sleep(4)
@@ -144,7 +147,8 @@ def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
         start_enc_timer

     # Gather the final triplets
-    pbar.set_description('Aggregation of ranking results')
+    if args.verbosity_level > 0:
+        pbar.set_description('Aggregation of ranking results')
     final_triplets = []
     for triplet in triplets:
         inv = (triplet[1], triplet[0], triplet[2])
@@ -152,7 +156,8 @@ def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
         final_triplets.append(triplet)
     triplets = final_triplets

-    pbar.set_description('Proceeding to the next batch of data')
+    if args.verbosity_level > 0:
+        pbar.set_description('Proceeding to the next batch of data')
     return BatchRankingSummary(triplets, out_time_struct)


@@ -200,9 +205,10 @@ def compute_combined_features(
     com_counter = 0
     new_feature_hash = {}
     for new_combination in full_combination_space:
-        pbar.set_description(
-            f'Created {com_counter}/{len(full_combination_space)}',
-        )
+        if args.verbosity_level > 0:
+            pbar.set_description(
+                f'Created {com_counter}/{len(full_combination_space)}',
+            )
         combined_feature: list[str] = [str(0)] * input_dataframe.shape[0]
         for feature in new_combination:
            tmp_feature = input_dataframe[feature].tolist()
@@ -216,7 +222,8 @@ def compute_combined_features(
         new_feature_hash[ftr_name] = combined_feature
         com_counter += 1
     tmp_df = pd.DataFrame(new_feature_hash)
-    pbar.set_description('Concatenating into final frame ..')
+    if args.verbosity_level > 0:
+        pbar.set_description('Concatenating into final frame ..')
     input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
     del tmp_df

@@ -415,7 +422,7 @@ def compute_value_counts(input_dataframe: pd.DataFrame, args: Any):
             del GLOBAL_RARE_VALUE_STORAGE[to_remove_val]


-def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any) -> None:
+def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any, args: Any) -> None:
     """Compute cardinalities of features, incrementally"""

     global GLOBAL_CARDINALITY_STORAGE
@@ -432,9 +439,10 @@ def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any) -> None:
             GLOBAL_CARDINALITY_STORAGE[column].add(
                 internal_hash(unique_value),
             )
-        pbar.set_description(
-            f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
-        )
+        if args.verbosity_level > 0:
+            pbar.set_description(
+                f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
+            )


 def compute_bounds_increment(
@@ -482,7 +490,8 @@ def compute_batch_ranking(
     input_dataframe = pd.DataFrame(line_tmp_storage)
     input_dataframe.columns = column_descriptions

-    pbar.set_description('Control features')
+    if args.verbosity_level > 0:
+        pbar.set_description('Control features')

     if args.feature_set_focus:
         if args.feature_set_focus == '_all_from_reference_JSON':
@@ -498,49 +507,56 @@
         input_dataframe = input_dataframe[list(focus_set)]

     if args.transformers != 'none':
-        pbar.set_description('Adding transformations')
+        if args.verbosity_level > 0:
+            pbar.set_description('Adding transformations')
         input_dataframe = enrich_with_transformations(
             input_dataframe, numeric_column_types, logger, args,
         )

     if args.explode_multivalue_features != 'False':
-        pbar.set_description('Constructing new features from multivalue ones')
+        if args.verbosity_level > 0:
+            pbar.set_description('Constructing new features from multivalue ones')
         input_dataframe = compute_expanded_multivalue_features(
             input_dataframe, logger, args, pbar,
         )

     if args.subfeature_mapping != 'False':
-        pbar.set_description('Constructing new (sub)features')
+        if args.verbosity_level > 0:
+            pbar.set_description('Constructing new (sub)features')
         input_dataframe = compute_subfeatures(
             input_dataframe, logger, args, pbar,
         )

     if args.interaction_order > 1:
-        pbar.set_description('Constructing new features')
+        if args.verbosity_level > 0:
+            pbar.set_description('Constructing new features')
         input_dataframe = compute_combined_features(
             input_dataframe, logger, args, pbar,
         )

     # in case of 3mr we compute the score of combinations against the target
     if '3mr' in args.heuristic:
-        pbar.set_description(
-            'Constructing features for computing relations in 3mr',
-        )
+        if args.verbosity_level > 0:
+            pbar.set_description(
+                'Constructing features for computing relations in 3mr',
+            )
         input_dataframe = compute_combined_features(
             input_dataframe, logger, args, pbar, True,
         )

     if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
-        pbar.set_description('Computing baseline features')
+        if args.verbosity_level > 0:
+            pbar.set_description('Computing baseline features')
         input_dataframe = include_noisy_features(input_dataframe, logger, args)

     # Compute incremental statistic useful for data inspection/transformer generation
-    pbar.set_description('Computing coverage')
+    if args.verbosity_level > 0:
+        pbar.set_description('Computing coverage')
     coverage_storage = compute_coverage(input_dataframe, args)
     feature_memory_consumption = compute_feature_memory_consumption(
         input_dataframe, args,
     )
-    compute_cardinalities(input_dataframe, pbar)
+    compute_cardinalities(input_dataframe, pbar, args)

     if args.task == 'identify_rare_values':
         compute_value_counts(input_dataframe, args)
@@ -548,10 +564,10 @@ def compute_batch_ranking(
     bounds_storage = compute_bounds_increment(
         input_dataframe, numeric_column_types,
     )
-
-    pbar.set_description(
-        f'Computing ranks for {input_dataframe.shape[1]} features',
-    )
+    if args.verbosity_level > 0:
+        pbar.set_description(
+            f'Computing ranks for {input_dataframe.shape[1]} features',
+        )

     return (
         mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
@@ -627,9 +643,12 @@ def estimate_importances_minibatches(
     step_timing_checkpoints = []

     local_coverage_object = defaultdict(list)
-    local_pbar = tqdm.tqdm(
-        total=get_num_of_instances(input_file) - 1, position=0,
-    )
+    if args.verbosity_level > 0:
+        local_pbar = tqdm.tqdm(
+            total=get_num_of_instances(input_file) - 1, position=0,
+        )
+    else:
+        local_pbar = None

     file_name, file_extension = os.path.splitext(input_file)

@@ -641,10 +660,12 @@ def estimate_importances_minibatches(

         file_stream.readline()

-        local_pbar.set_description('Starting ranking computation')
+        if args.verbosity_level > 0:
+            local_pbar.set_description('Starting ranking computation')
         for line in file_stream:
             line_counter += 1
-            local_pbar.update(1)
+            if args.verbosity_level > 0:
+                local_pbar.update(1)

             if line_counter % args.subsampling != 0:
                 continue
@@ -685,12 +706,14 @@ def estimate_importances_minibatches(
             importances_df += importances_batch.triplet_scores

     if args.heuristic != 'Constant':
-        local_pbar.set_description('Creating checkpoint')
+        if args.verbosity_level > 0:
+            local_pbar.set_description('Creating checkpoint')
         checkpoint_importances_df(importances_df)

     file_stream.close()

-    local_pbar.set_description('Parsing the remainder')
+    if args.verbosity_level > 0:
+        local_pbar.set_description('Parsing the remainder')
     if invalid_lines > 0:
         logger.info(
             f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
        )
@@ -726,8 +749,9 @@ def estimate_importances_minibatches(
             bounds_storage_batch.append(bounds_storage)
     checkpoint_importances_df(importances_df)

-    local_pbar.set_description('Wrapping up')
-    local_pbar.close()
+    if args.verbosity_level > 0:
+        local_pbar.set_description('Wrapping up')
+        local_pbar.close()

     return (
         step_timing_checkpoints,
diff --git a/outrank/task_ranking.py b/outrank/task_ranking.py
index 1840bce..15f1c50 100644
--- a/outrank/task_ranking.py
+++ b/outrank/task_ranking.py
@@ -32,14 +32,12 @@
 )


-def outrank_task_conduct_ranking(args: Any):
+def outrank_task_conduct_ranking(args: Any) -> None:
     # Data source = folder structure + relevant file specifications
-
-    # No need for full-blown ranking in this case
     if args.task in ['identify_rare_values', 'feature_summary_transformers']:
         args.heuristic = 'Constant'

-    if args.silent != 'True':
+    if args.verbosity_level > 0:
         display_tool_name()
         display_random_tip()
diff --git a/setup.py b/setup.py
index 5d148e0..012d397 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ def _read_description():
 packages = [x for x in setuptools.find_packages() if x != 'test']
 setuptools.setup(
     name='outrank',
-    version='0.95.3',
+    version='0.95.4',
     description='OutRank: Feature ranking for massive sparse data sets.',
     long_description=_read_description(),
     long_description_content_type='text/markdown',
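The first patch establishes the pattern the rest of the series iterates on: the progress bar is only constructed when `--verbosity_level` is positive, and every interaction with it is wrapped in the same guard. A minimal, self-contained sketch of that pattern (the flag name and `position=0` come from the patch; the loop and total are invented for illustration):

```python
import argparse

import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--verbosity_level', type=int, default=1)
args = parser.parse_args()

# Patch 1 style: build the bar only when verbose, otherwise keep None,
# and repeat the same guard at every call site that touches the bar.
local_pbar = tqdm.tqdm(total=100, position=0) if args.verbosity_level > 0 else None

for step in range(100):
    if args.verbosity_level > 0:
        local_pbar.set_description(f'Processing step {step}')
        local_pbar.update(1)

if args.verbosity_level > 0:
    local_pbar.close()
```

The cost of this approach is visible in the diff above: the guard has to be repeated around every `set_description`, `update`, and `close`, which is exactly the duplication the third patch removes.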
From dcf4a704666954bed02546ae77941203eeb3402c Mon Sep 17 00:00:00 2001
From: bskrlj
Date: Fri, 5 Jan 2024 11:36:18 +0100
Subject: [PATCH 2/4] test

---
 tests/ranking_module_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/ranking_module_test.py b/tests/ranking_module_test.py
index 5e96fbf..b0aa9cb 100644
--- a/tests/ranking_module_test.py
+++ b/tests/ranking_module_test.py
@@ -33,6 +33,7 @@ class args:
     target_ranking_only: str = 'True'
     interaction_order: int = 3
     combination_number_upper_bound: int = 1024
+    verbosity_level: int = 1


 class CompareStrategiesTest(unittest.TestCase):
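The accompanying test change works because the suite fakes the argparse namespace with a bare `args` class whose class attributes stand in for parsed flags, so every new CLI option has to be mirrored there. A hypothetical, heavily abbreviated sketch of that arrangement (attribute values are illustrative, not the real test fixture):

```python
class args:
    # Stand-in for argparse.Namespace: code under test only performs
    # attribute lookups, so a plain class object is a sufficient fake.
    interaction_order: int = 3
    combination_number_upper_bound: int = 1024
    verbosity_level: int = 1  # mirrors the flag added in PATCH 1/4


def ranking_step(args) -> None:
    # Any function written against the real namespace accepts the fake.
    if args.verbosity_level > 0:
        print(f'ranking with interaction order {args.interaction_order}')


ranking_step(args)
```

The risk is that the fake and the real parser drift apart in name or type, which is why the third patch has to update this stub in lockstep.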
From d5fc1c28fd9cca77bd43d884f92ade03f9e746b4 Mon Sep 17 00:00:00 2001
From: bskrlj
Date: Fri, 5 Jan 2024 13:47:49 +0100
Subject: [PATCH 3/4] refactorings

---
 outrank/__main__.py          |  8 +--
 outrank/core_ranking.py      | 96 ++++++++++++++----------------------
 outrank/task_ranking.py      |  2 +-
 tests/ranking_module_test.py |  2 +-
 4 files changed, 43 insertions(+), 65 deletions(-)

diff --git a/outrank/__main__.py b/outrank/__main__.py
index d9848f9..4363c9b 100644
--- a/outrank/__main__.py
+++ b/outrank/__main__.py
@@ -219,10 +219,10 @@ def main():
     )

     parser.add_argument(
-        '--verbosity_level',
-        type=int,
-        default=1,
-        help='Verbosity level: 0 (quiet) or 1 (show progress).',
+        '--disable_tqdm',
+        default='False',
+        choices=['False', 'True'],
+        help='Disable the tqdm progress bar (True or False).',
     )


diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
index 6a8fac4..2cab7bd 100644
--- a/outrank/core_ranking.py
+++ b/outrank/core_ranking.py
@@ -105,8 +105,7 @@ def mixed_rank_graph(
     out_time_struct = {}

     # Handle cont. types prior to interaction evaluation
-    if args.verbosity_level > 0:
-        pbar.set_description('Encoding columns')
+    pbar.set_description('Encoding columns')
     start_enc_timer = timer()
     tmp_df = pd.DataFrame({k : tmp_df[k].cat.codes for k in all_columns})

@@ -127,8 +126,7 @@ def mixed_rank_graph(
         return BatchRankingSummary(final_constant_imp, out_time_struct)

     # Map the scoring calls to the worker pool
-    if args.verbosity_level > 0:
-        pbar.set_description('Allocating thread pool')
+    pbar.set_description('Allocating thread pool')

     # starmap is an alternative that is slower unfortunately (but nicer)
     def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
@@ -136,8 +134,7 @@ def get_grounded_importances_estimate(combination: tuple[str]) -> Any:

     start_enc_timer = timer()
     with cpu_pool as p:
-        if args.verbosity_level > 0:
-            pbar.set_description(f'Computing (#ftr={len(combinations)})')
+        pbar.set_description(f'Computing (#ftr={len(combinations)})')
         results = p.amap(get_grounded_importances_estimate, combinations)
         while not results.ready():
             time.sleep(4)
@@ -147,8 +144,7 @@ def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
         start_enc_timer

     # Gather the final triplets
-    if args.verbosity_level > 0:
-        pbar.set_description('Aggregation of ranking results')
+    pbar.set_description('Aggregation of ranking results')
     final_triplets = []
     for triplet in triplets:
         inv = (triplet[1], triplet[0], triplet[2])
@@ -156,8 +152,7 @@ def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
         final_triplets.append(triplet)
     triplets = final_triplets

-    if args.verbosity_level > 0:
-        pbar.set_description('Proceeding to the next batch of data')
+    pbar.set_description('Proceeding to the next batch of data')
     return BatchRankingSummary(triplets, out_time_struct)


@@ -205,10 +200,9 @@ def compute_combined_features(
     com_counter = 0
     new_feature_hash = {}
     for new_combination in full_combination_space:
-        if args.verbosity_level > 0:
-            pbar.set_description(
-                f'Created {com_counter}/{len(full_combination_space)}',
-            )
+        pbar.set_description(
+            f'Created {com_counter}/{len(full_combination_space)}',
+        )
         combined_feature: list[str] = [str(0)] * input_dataframe.shape[0]
         for feature in new_combination:
             tmp_feature = input_dataframe[feature].tolist()
@@ -222,8 +216,7 @@ def compute_combined_features(
         new_feature_hash[ftr_name] = combined_feature
         com_counter += 1
     tmp_df = pd.DataFrame(new_feature_hash)
-    if args.verbosity_level > 0:
-        pbar.set_description('Concatenating into final frame ..')
+    pbar.set_description('Concatenating into final frame ..')
     input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
     del tmp_df

@@ -439,10 +432,10 @@ def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any, args: Any) -> None:
             GLOBAL_CARDINALITY_STORAGE[column].add(
                 internal_hash(unique_value),
             )
-        if args.verbosity_level > 0:
-            pbar.set_description(
-                f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
-            )
+
+        pbar.set_description(
+            f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
+        )


 def compute_bounds_increment(
@@ -490,8 +483,7 @@ def compute_batch_ranking(
     input_dataframe = pd.DataFrame(line_tmp_storage)
     input_dataframe.columns = column_descriptions

-    if args.verbosity_level > 0:
-        pbar.set_description('Control features')
+    pbar.set_description('Control features')

     if args.feature_set_focus:
         if args.feature_set_focus == '_all_from_reference_JSON':
@@ -507,51 +499,45 @@ def compute_batch_ranking(
         input_dataframe = input_dataframe[list(focus_set)]

     if args.transformers != 'none':
-        if args.verbosity_level > 0:
-            pbar.set_description('Adding transformations')
+
+        pbar.set_description('Adding transformations')
         input_dataframe = enrich_with_transformations(
             input_dataframe, numeric_column_types, logger, args,
         )

     if args.explode_multivalue_features != 'False':
-        if args.verbosity_level > 0:
-            pbar.set_description('Constructing new features from multivalue ones')
+        pbar.set_description('Constructing new features from multivalue ones')
         input_dataframe = compute_expanded_multivalue_features(
             input_dataframe, logger, args, pbar,
         )

     if args.subfeature_mapping != 'False':
-        if args.verbosity_level > 0:
-            pbar.set_description('Constructing new (sub)features')
+        pbar.set_description('Constructing new (sub)features')
         input_dataframe = compute_subfeatures(
             input_dataframe, logger, args, pbar,
         )

     if args.interaction_order > 1:
-        if args.verbosity_level > 0:
-            pbar.set_description('Constructing new features')
+        pbar.set_description('Constructing new features')
         input_dataframe = compute_combined_features(
             input_dataframe, logger, args, pbar,
         )

     # in case of 3mr we compute the score of combinations against the target
     if '3mr' in args.heuristic:
-        if args.verbosity_level > 0:
-            pbar.set_description(
-                'Constructing features for computing relations in 3mr',
-            )
+        pbar.set_description(
+            'Constructing features for computing relations in 3mr',
+        )
         input_dataframe = compute_combined_features(
             input_dataframe, logger, args, pbar, True,
         )

     if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
-        if args.verbosity_level > 0:
-            pbar.set_description('Computing baseline features')
+        pbar.set_description('Computing baseline features')
         input_dataframe = include_noisy_features(input_dataframe, logger, args)

     # Compute incremental statistic useful for data inspection/transformer generation
-    if args.verbosity_level > 0:
-        pbar.set_description('Computing coverage')
+    pbar.set_description('Computing coverage')
     coverage_storage = compute_coverage(input_dataframe, args)
     feature_memory_consumption = compute_feature_memory_consumption(
         input_dataframe, args,
     )
@@ -564,10 +550,10 @@ def compute_batch_ranking(
     bounds_storage = compute_bounds_increment(
         input_dataframe, numeric_column_types,
     )
-    if args.verbosity_level > 0:
-        pbar.set_description(
-            f'Computing ranks for {input_dataframe.shape[1]} features',
-        )
+
+    pbar.set_description(
+        f'Computing ranks for {input_dataframe.shape[1]} features',
+    )

     return (
         mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
@@ -643,12 +629,9 @@ def estimate_importances_minibatches(
     step_timing_checkpoints = []

     local_coverage_object = defaultdict(list)
-    if args.verbosity_level > 0:
-        local_pbar = tqdm.tqdm(
-            total=get_num_of_instances(input_file) - 1, position=0,
-        )
-    else:
-        local_pbar = None
+    local_pbar = tqdm.tqdm(
+        total=get_num_of_instances(input_file) - 1, position=0, disable=args.disable_tqdm == 'True',
+    )

     file_name, file_extension = os.path.splitext(input_file)

@@ -660,12 +643,10 @@ def estimate_importances_minibatches(

         file_stream.readline()

-        if args.verbosity_level > 0:
-            local_pbar.set_description('Starting ranking computation')
+        local_pbar.set_description('Starting ranking computation')
         for line in file_stream:
             line_counter += 1
-            if args.verbosity_level > 0:
-                local_pbar.update(1)
+            local_pbar.update(1)

             if line_counter % args.subsampling != 0:
                 continue
@@ -706,14 +687,12 @@ def estimate_importances_minibatches(
             importances_df += importances_batch.triplet_scores

     if args.heuristic != 'Constant':
-        if args.verbosity_level > 0:
-            local_pbar.set_description('Creating checkpoint')
+        local_pbar.set_description('Creating checkpoint')
         checkpoint_importances_df(importances_df)

     file_stream.close()

-    if args.verbosity_level > 0:
-        local_pbar.set_description('Parsing the remainder')
+    local_pbar.set_description('Parsing the remainder')
     if invalid_lines > 0:
         logger.info(
             f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
         )
@@ -749,9 +728,8 @@ def estimate_importances_minibatches(
             bounds_storage_batch.append(bounds_storage)
     checkpoint_importances_df(importances_df)

-    if args.verbosity_level > 0:
-        local_pbar.set_description('Wrapping up')
-        local_pbar.close()
+    local_pbar.set_description('Wrapping up')
+    local_pbar.close()

     return (
         step_timing_checkpoints,
diff --git a/outrank/task_ranking.py b/outrank/task_ranking.py
index 15f1c50..1ab28c8 100644
--- a/outrank/task_ranking.py
+++ b/outrank/task_ranking.py
@@ -37,7 +37,7 @@ def outrank_task_conduct_ranking(args: Any) -> None:
     if args.task in ['identify_rare_values', 'feature_summary_transformers']:
         args.heuristic = 'Constant'

-    if args.verbosity_level > 0:
+    if args.disable_tqdm == 'False':
         display_tool_name()
         display_random_tip()
diff --git a/tests/ranking_module_test.py b/tests/ranking_module_test.py
index b0aa9cb..c7cf1d4 100644
--- a/tests/ranking_module_test.py
+++ b/tests/ranking_module_test.py
@@ -33,7 +33,7 @@ class args:
     target_ranking_only: str = 'True'
     interaction_order: int = 3
     combination_number_upper_bound: int = 1024
-    verbosity_level: int = 1
+    disable_tqdm: str = 'False'


 class CompareStrategiesTest(unittest.TestCase):
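The third patch trades all of those call-site guards for tqdm's own `disable=` constructor flag: a bar created with `disable=True` turns `update()`, `set_description()`, and `close()` into effective no-ops, so a single decision at construction time silences the whole pipeline. A condensed sketch of the refactored shape (the string-valued `--disable_tqdm` flag matches the patch; the loop is illustrative):

```python
import argparse

import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--disable_tqdm', default='False', choices=['False', 'True'])
args = parser.parse_args()

# One decision at construction time replaces a guard at every call site;
# a disabled bar silently ignores all subsequent method calls.
local_pbar = tqdm.tqdm(total=100, position=0, disable=args.disable_tqdm == 'True')

for step in range(100):
    local_pbar.set_description(f'Processing step {step}')
    local_pbar.update(1)

local_pbar.close()
```

Because the flag is the string 'True'/'False' rather than a real boolean, every consumer compares against the literal string, hence `disable=args.disable_tqdm == 'True'` here and the matching string-typed default in the test stub.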
From 1a1d7b6ba725d9f9e30b727d9cbc35f370948133 Mon Sep 17 00:00:00 2001
From: bskrlj
Date: Fri, 5 Jan 2024 14:08:00 +0100
Subject: [PATCH 4/4] refactorings v2

---
 outrank/core_ranking.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
index 2cab7bd..93672f9 100644
--- a/outrank/core_ranking.py
+++ b/outrank/core_ranking.py
@@ -415,7 +415,7 @@ def compute_value_counts(input_dataframe: pd.DataFrame, args: Any):
             del GLOBAL_RARE_VALUE_STORAGE[to_remove_val]


-def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any, args: Any) -> None:
+def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any) -> None:
     """Compute cardinalities of features, incrementally"""

     global GLOBAL_CARDINALITY_STORAGE
@@ -542,7 +542,7 @@ def compute_batch_ranking(
     feature_memory_consumption = compute_feature_memory_consumption(
         input_dataframe, args,
     )
-    compute_cardinalities(input_dataframe, pbar, args)
+    compute_cardinalities(input_dataframe, pbar)

     if args.task == 'identify_rare_values':
         compute_value_counts(input_dataframe, args)
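Untouched in substance by the series but visible throughout it is the incremental cardinality machinery: `compute_cardinalities` (whose now-unused `args` parameter patch 4 removes again) folds each minibatch into one HyperLogLog-style sketch per column via `GLOBAL_CARDINALITY_STORAGE`, so unique-value counts stay approximate while memory stays bounded. A self-contained sketch of that update pattern, with a plain Python set standing in for a real HyperLogLog counter (exact where HLL is approximate; the names mirror the diff, the data is invented):

```python
from collections import defaultdict

import pandas as pd

# Per-column sketch storage; a set gives exact counts at O(n) memory,
# where a real HyperLogLog sketch gives approximate counts at O(1) memory.
GLOBAL_CARDINALITY_STORAGE: dict = defaultdict(set)


def internal_hash(value) -> int:
    # Placeholder for OutRank's hashing of raw values.
    return hash(str(value))


def compute_cardinalities(input_dataframe: pd.DataFrame) -> None:
    """Fold one batch of data into the per-column cardinality sketches."""
    for column in input_dataframe.columns:
        for unique_value in input_dataframe[column].unique():
            GLOBAL_CARDINALITY_STORAGE[column].add(internal_hash(unique_value))


# Two minibatches update the same global storage, as in the ranking loop.
compute_cardinalities(pd.DataFrame({'f0': ['a', 'b'], 'f1': ['x', 'x']}))
compute_cardinalities(pd.DataFrame({'f0': ['b', 'c'], 'f1': ['y', 'x']}))
print({k: len(v) for k, v in GLOBAL_CARDINALITY_STORAGE.items()})  # {'f0': 3, 'f1': 2}
```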