From 2fa814a43a3cb58674ea6fea55b3951b9a6512fc Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Thu, 20 Oct 2022 21:18:40 -0500 Subject: [PATCH 1/4] added new combination function --- tobac/utils.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tobac/utils.py b/tobac/utils.py index 7217caef..3a86d43b 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -894,3 +894,43 @@ def spectral_filtering( return (lambda_mn, transfer_function), filtered_field else: return filtered_field + + +def combine_tobac_feats(list_of_feats): + """Function to combine a list of tobac feature detection dataframes + into one combined dataframe that can be used for tracking + or segmentation. + + Parameters + ---------- + list_of_feats: array-like of Pandas DataFrames + A list of dataframes (generated, for example, by + running feature detection on multiple nodes) + + Returns + ------- + pd.DataFrame + One combined DataFrame + + """ + import pandas as pd + import numpy as np + + # first, let's just combine these. 
+ combined_df = pd.concat(list_of_feats) + # Then, sort by time first, then by feature number + combined_df = combined_df.sort_values(["time", "feature"]) + all_times = sorted(combined_df["time"].unique()) + # Loop through current times + start_feat_num = combined_df["feature"].min() + for frame_num, curr_time in enumerate(all_times): + # renumber the frame number + combined_df.loc[combined_df["time"] == curr_time, "frame"] = frame_num + # renumber the features + curr_row_count = len(combined_df.loc[combined_df["time"] == curr_time]) + feat_num_arr = np.arange(start_feat_num, start_feat_num + curr_row_count) + combined_df.loc[combined_df["time"] == curr_time, "feature"] = feat_num_arr + start_feat_num = np.max(feat_num_arr) + 1 + + combined_df = combined_df.reset_index() + return combined_df From 41badaccfa1d22cd1c9019dfa8579f6d8bdb1181 Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Thu, 20 Oct 2022 21:40:18 -0500 Subject: [PATCH 2/4] added tests, fixed a bug --- tobac/tests/test_utils.py | 25 +++++++++++++++++++++++++ tobac/utils.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tobac/tests/test_utils.py b/tobac/tests/test_utils.py index 630985c8..ebeaf624 100644 --- a/tobac/tests/test_utils.py +++ b/tobac/tests/test_utils.py @@ -1,5 +1,6 @@ import numpy as np import tobac.utils as tb_utils +import tobac.testing as tb_test from scipy import fft @@ -55,3 +56,27 @@ def test_spectral_filtering(): ) >= 1 ) + + +def test_combine_tobac_feats(): + """tests tobac.utils.combine_tobac_feats + Test by generating two single feature dataframes, + combining them with this function, and then + testing to see if a single dataframe + matches. 
+ """ + import datetime + import pandas as pd + + single_feat_1 = tb_test.generate_single_feature( + 0, 0, start_date=datetime.datetime(2022, 1, 1, 0, 0), frame_start=0 + ) + single_feat_2 = tb_test.generate_single_feature( + 1, 1, start_date=datetime.datetime(2022, 1, 1, 0, 5), frame_start=0 + ) + + combined_feat = tb_utils.combine_tobac_feats([single_feat_1, single_feat_2]) + + tot_feat = tb_test.generate_single_feature(0, 0, num_frames=2, frame_start=0) + + assert combined_feat.equals(tot_feat) diff --git a/tobac/utils.py b/tobac/utils.py index 3a86d43b..148c4fcb 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -932,5 +932,5 @@ def combine_tobac_feats(list_of_feats): combined_df.loc[combined_df["time"] == curr_time, "feature"] = feat_num_arr start_feat_num = np.max(feat_num_arr) + 1 - combined_df = combined_df.reset_index() + combined_df = combined_df.reset_index(drop=True) return combined_df From 068b014672a0cc734ff045e154985d2a69862dcc Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Thu, 20 Oct 2022 21:53:02 -0500 Subject: [PATCH 3/4] updates to documentation for new function --- doc/big_datasets.rst | 11 +++++++++++ doc/index.rst | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 doc/big_datasets.rst diff --git a/doc/big_datasets.rst b/doc/big_datasets.rst new file mode 100644 index 00000000..2e3e51d9 --- /dev/null +++ b/doc/big_datasets.rst @@ -0,0 +1,11 @@ +Handling Large Datasets +------------------------------------- + +Often, one desires to use *tobac* to identify and track features in large datasets ("big data"). This documentation strives to suggest various methods for doing so efficiently. Current versions of *tobac* do not allow for out-of-memory computation, meaning that these strategies may need to be employed for both computational and memory reasons. + +.. 
_Split Feature Detection: +======================= +Split Feature Detection +======================= +Current versions of threshold feature detection (see :doc:`feature_detection_overview`) are time-independent, meaning that one can parallelize feature detection across all times (although not across space). *tobac* provides the :py:func:`tobac.utils.combine_tobac_feats` function to combine a list of dataframes produced by a parallelization method (such as :code:`jug` or :code:`multiprocessing.pool`) into a single combined dataframe suitable for tracking. + diff --git a/doc/index.rst b/doc/index.rst index 96b589cf..e7ef71e4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -23,7 +23,8 @@ The project is currently being extended by several contributors to include addit installation data_input analysis - plotting + plotting + big_datasets examples publications From dedefdbad8ab8cf2957fea68b28929ad7d4b806f Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Fri, 21 Oct 2022 10:13:11 -0500 Subject: [PATCH 4/4] Allow users to keep old feature column. Update docstrings. --- tobac/tests/test_utils.py | 22 +++++++++++++++++----- tobac/utils.py | 16 +++++++++++++--- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/tobac/tests/test_utils.py b/tobac/tests/test_utils.py index ebeaf624..66b3a62a 100644 --- a/tobac/tests/test_utils.py +++ b/tobac/tests/test_utils.py @@ -1,6 +1,11 @@ -import numpy as np +import datetime + import tobac.utils as tb_utils import tobac.testing as tb_test + +import pandas as pd +import pandas.testing as pd_test +import numpy as np from scipy import fft @@ -65,8 +70,6 @@ def test_combine_tobac_feats(): testing to see if a single dataframe matches. 
""" - import datetime - import pandas as pd single_feat_1 = tb_test.generate_single_feature( 0, 0, start_date=datetime.datetime(2022, 1, 1, 0, 0), frame_start=0 @@ -77,6 +80,15 @@ def test_combine_tobac_feats(): combined_feat = tb_utils.combine_tobac_feats([single_feat_1, single_feat_2]) - tot_feat = tb_test.generate_single_feature(0, 0, num_frames=2, frame_start=0) + tot_feat = tb_test.generate_single_feature( + 0, 0, spd_h1=1, spd_h2=1, num_frames=2, frame_start=0 + ) + + pd_test.assert_frame_equal(combined_feat, tot_feat) - assert combined_feat.equals(tot_feat) + # Now try preserving the old feature numbers. + combined_feat = tb_utils.combine_tobac_feats( + [single_feat_1, single_feat_2], preserve_old_feat_nums="old_feat_column" + ) + assert np.all(list(combined_feat["old_feat_column"].values) == [1, 1]) + assert np.all(list(combined_feat["feature"].values) == [1, 2]) diff --git a/tobac/utils.py b/tobac/utils.py index 148c4fcb..d9334239 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -896,7 +896,7 @@ def spectral_filtering( return filtered_field -def combine_tobac_feats(list_of_feats): +def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None): """Function to combine a list of tobac feature detection dataframes into one combined dataframe that can be used for tracking or segmentation. @@ -905,12 +905,18 @@ def combine_tobac_feats(list_of_feats): ---------- list_of_feats: array-like of Pandas DataFrames A list of dataframes (generated, for example, by - running feature detection on multiple nodes) + running feature detection on multiple nodes). + + preserve_old_feat_nums: str or None + The column name to preserve old feature numbers in. If None, these + old numbers will be deleted. Users may want to enable this feature + if they have run segmentation with the separate dataframes and + therefore old feature numbers. Returns ------- pd.DataFrame - One combined DataFrame + One combined DataFrame. 
""" import pandas as pd @@ -923,6 +929,10 @@ def combine_tobac_feats(list_of_feats): all_times = sorted(combined_df["time"].unique()) # Loop through current times start_feat_num = combined_df["feature"].min() + # Save the old feature numbers if requested. + if preserve_old_feat_nums is not None: + combined_df[preserve_old_feat_nums] = combined_df["feature"] + for frame_num, curr_time in enumerate(all_times): # renumber the frame number combined_df.loc[combined_df["time"] == curr_time, "frame"] = frame_num