diff --git a/doc/big_datasets.rst b/doc/big_datasets.rst new file mode 100644 index 00000000..2e3e51d9 --- /dev/null +++ b/doc/big_datasets.rst @@ -0,0 +1,11 @@ +Handling Large Datasets +------------------------------------- + +Often, one desires to use *tobac* to identify and track features in large datasets ("big data"). This documentation strives to suggest various methods for doing so efficiently. Current versions of *tobac* do not allow for out-of-memory computation, meaning that these strategies may need to be employed for both computational and memory reasons. + +.. _Split Feature Detection: +======================= +Split Feature Detection +======================= +Current versions of threshold feature detection (see :doc:`feature_detection_overview`) are time independent, meaning that one can parallelize feature detection across all times (although not across space). *tobac* provides the :py:meth:`tobac.utils.combine_tobac_feats` function to combine a list of dataframes produced by a parallelization method (such as :code:`jug` or :code:`multiprocessing.pool`) into a single combined dataframe suitable to perform tracking with. 
def test_combine_tobac_feats():
    """Tests tobac.utils.combine_tobac_feats.

    Generates two single-feature dataframes at consecutive times, combines
    them with combine_tobac_feats, and checks that the result matches a
    single reference dataframe containing both features with renumbered
    ``feature`` and ``frame`` columns.
    """

    # Two single-feature frames five minutes apart; both are numbered
    # feature 1 and frame 0 by their independent detections.
    single_feat_1 = tb_test.generate_single_feature(
        0, 0, start_date=datetime.datetime(2022, 1, 1, 0, 0), frame_start=0
    )
    single_feat_2 = tb_test.generate_single_feature(
        1, 1, start_date=datetime.datetime(2022, 1, 1, 0, 5), frame_start=0
    )

    combined_feat = tb_utils.combine_tobac_feats([single_feat_1, single_feat_2])

    # Reference: the same two features generated in one call, so they carry
    # the expected combined feature/frame numbering already.
    tot_feat = tb_test.generate_single_feature(
        0, 0, spd_h1=1, spd_h2=1, num_frames=2, frame_start=0
    )

    pd_test.assert_frame_equal(combined_feat, tot_feat)

    # Now try preserving the old feature numbers.
    combined_feat = tb_utils.combine_tobac_feats(
        [single_feat_1, single_feat_2], preserve_old_feat_nums="old_feat_column"
    )
    # Both inputs numbered their single feature 1; the combined frame
    # renumbers them 1 and 2 while keeping the originals in the new column.
    # (Plain list equality returns a single bool — no np.all wrapper needed.)
    assert list(combined_feat["old_feat_column"].values) == [1, 1]
    assert list(combined_feat["feature"].values) == [1, 2]
def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None):
    """Combine a list of tobac feature detection dataframes into one
    combined dataframe that can be used for tracking or segmentation.

    Rows are sorted chronologically, ``frame`` is renumbered so frame 0 is
    the earliest time present, and ``feature`` is renumbered consecutively
    starting from the smallest feature number in the input.

    Parameters
    ----------
    list_of_feats: array-like of Pandas DataFrames
        A list of dataframes (generated, for example, by
        running feature detection on multiple nodes). Each must contain
        ``time``, ``frame`` and ``feature`` columns.

    preserve_old_feat_nums: str or None
        The column name to preserve old feature numbers in. If None, these
        old numbers will be deleted. Users may want to enable this feature
        if they have run segmentation with the separate dataframes and
        therefore old feature numbers.

    Returns
    -------
    pd.DataFrame
        One combined DataFrame.

    """
    import pandas as pd
    import numpy as np

    # First, combine the dataframes, then order chronologically; within a
    # time, sort by the incoming feature number so renumbering is
    # deterministic (same ordering the original per-time loop produced).
    combined_df = pd.concat(list_of_feats)
    combined_df = combined_df.sort_values(["time", "feature"])

    # Save the old feature numbers if requested.
    if preserve_old_feat_nums is not None:
        combined_df[preserve_old_feat_nums] = combined_df["feature"]

    # Each row's frame number is the rank of its time among the sorted
    # unique times; np.unique's inverse index gives exactly that in one
    # vectorized pass.
    _, frame_nums = np.unique(combined_df["time"].values, return_inverse=True)
    combined_df["frame"] = frame_nums

    # Because the dataframe is already sorted by time (and by feature
    # within each time), renumbering features consecutively per time is
    # equivalent to a single consecutive range over all rows. This
    # replaces a per-time boolean-mask loop (O(times * rows)) with one
    # O(rows) assignment while producing identical numbering.
    start_feat_num = combined_df["feature"].min()
    combined_df["feature"] = np.arange(
        start_feat_num, start_feat_num + len(combined_df)
    )

    return combined_df.reset_index(drop=True)