From 2fa814a43a3cb58674ea6fea55b3951b9a6512fc Mon Sep 17 00:00:00 2001
From: Sean Freeman <sean.freeman@colostate.edu>
Date: Thu, 20 Oct 2022 21:18:40 -0500
Subject: [PATCH 1/4] added new combination function

---
 tobac/utils.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tobac/utils.py b/tobac/utils.py
index 7217caef..3a86d43b 100644
--- a/tobac/utils.py
+++ b/tobac/utils.py
@@ -894,3 +894,43 @@ def spectral_filtering(
         return (lambda_mn, transfer_function), filtered_field
     else:
         return filtered_field
+
+
+def combine_tobac_feats(list_of_feats):
+    """Function to combine a list of tobac feature detection dataframes
+    into one combined dataframe that can be used for tracking
+    or segmentation.
+
+    Parameters
+    ----------
+    list_of_feats: array-like of Pandas DataFrames
+        A list of dataframes (generated, for example, by
+        running feature detection on multiple nodes)
+
+    Returns
+    -------
+    pd.DataFrame
+        One combined DataFrame
+
+    """
+    import pandas as pd
+    import numpy as np
+
+    # first, let's just combine these.
+    combined_df = pd.concat(list_of_feats)
+    # Then, sort by time first, then by feature number
+    combined_df = combined_df.sort_values(["time", "feature"])
+    all_times = sorted(combined_df["time"].unique())
+    # Loop through current times
+    start_feat_num = combined_df["feature"].min()
+    for frame_num, curr_time in enumerate(all_times):
+        # renumber the frame number
+        combined_df.loc[combined_df["time"] == curr_time, "frame"] = frame_num
+        # renumber the features
+        curr_row_count = len(combined_df.loc[combined_df["time"] == curr_time])
+        feat_num_arr = np.arange(start_feat_num, start_feat_num + curr_row_count)
+        combined_df.loc[combined_df["time"] == curr_time, "feature"] = feat_num_arr
+        start_feat_num = np.max(feat_num_arr) + 1
+
+    combined_df = combined_df.reset_index()
+    return combined_df

From 41badaccfa1d22cd1c9019dfa8579f6d8bdb1181 Mon Sep 17 00:00:00 2001
From: Sean Freeman <sean.freeman@colostate.edu>
Date: Thu, 20 Oct 2022 21:40:18 -0500
Subject: [PATCH 2/4] added tests, fixed a bug

---
 tobac/tests/test_utils.py | 25 +++++++++++++++++++++++++
 tobac/utils.py            |  2 +-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/tobac/tests/test_utils.py b/tobac/tests/test_utils.py
index 630985c8..ebeaf624 100644
--- a/tobac/tests/test_utils.py
+++ b/tobac/tests/test_utils.py
@@ -1,5 +1,6 @@
 import numpy as np
 import tobac.utils as tb_utils
+import tobac.testing as tb_test
 from scipy import fft
 
 
@@ -55,3 +56,27 @@ def test_spectral_filtering():
         )
         >= 1
     )
+
+
+def test_combine_tobac_feats():
+    """tests tobac.utils.combine_tobac_feats
+    Test by generating two single feature dataframes,
+    combining them with this function, and then
+    testing to see if a single dataframe
+    matches.
+    """
+    import datetime
+    import pandas as pd
+
+    single_feat_1 = tb_test.generate_single_feature(
+        0, 0, start_date=datetime.datetime(2022, 1, 1, 0, 0), frame_start=0
+    )
+    single_feat_2 = tb_test.generate_single_feature(
+        1, 1, start_date=datetime.datetime(2022, 1, 1, 0, 5), frame_start=0
+    )
+
+    combined_feat = tb_utils.combine_tobac_feats([single_feat_1, single_feat_2])
+
+    tot_feat = tb_test.generate_single_feature(0, 0, num_frames=2, frame_start=0)
+
+    assert combined_feat.equals(tot_feat)
diff --git a/tobac/utils.py b/tobac/utils.py
index 3a86d43b..148c4fcb 100644
--- a/tobac/utils.py
+++ b/tobac/utils.py
@@ -932,5 +932,5 @@ def combine_tobac_feats(list_of_feats):
         combined_df.loc[combined_df["time"] == curr_time, "feature"] = feat_num_arr
         start_feat_num = np.max(feat_num_arr) + 1
 
-    combined_df = combined_df.reset_index()
+    combined_df = combined_df.reset_index(drop=True)
     return combined_df

From 068b014672a0cc734ff045e154985d2a69862dcc Mon Sep 17 00:00:00 2001
From: Sean Freeman <sean.freeman@colostate.edu>
Date: Thu, 20 Oct 2022 21:53:02 -0500
Subject: [PATCH 3/4] updates to documentation for new function

---
 doc/big_datasets.rst | 11 +++++++++++
 doc/index.rst        |  3 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 doc/big_datasets.rst

diff --git a/doc/big_datasets.rst b/doc/big_datasets.rst
new file mode 100644
index 00000000..2e3e51d9
--- /dev/null
+++ b/doc/big_datasets.rst
@@ -0,0 +1,11 @@
+Handling Large Datasets
+-------------------------------------
+
+Often, one desires to use *tobac* to identify and track features in large datasets ("big data"). This documentation suggests various methods for doing so efficiently. Current versions of *tobac* do not support out-of-core (larger-than-memory) computation, so these strategies may need to be employed for both computational and memory reasons.
+
+.. _Split Feature Detection:
+
+Split Feature Detection
+=======================
+Current versions of threshold feature detection (see :doc:`feature_detection_overview`) are time independent, meaning that one can parallelize feature detection across all times (although not across space). *tobac* provides the :py:func:`tobac.utils.combine_tobac_feats` function to combine a list of dataframes produced by a parallelization method (such as :code:`jug` or :code:`multiprocessing.pool`) into a single combined dataframe suitable for tracking.
+
diff --git a/doc/index.rst b/doc/index.rst
index 96b589cf..e7ef71e4 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -23,7 +23,8 @@ The project is currently being extended by several contributors to include addit
    installation
    data_input
    analysis 
-   plotting 
+   plotting
+   big_datasets 
    examples
    publications
 

From dedefdbad8ab8cf2957fea68b28929ad7d4b806f Mon Sep 17 00:00:00 2001
From: Sean Freeman <freemansw1@gmail.com>
Date: Fri, 21 Oct 2022 10:13:11 -0500
Subject: [PATCH 4/4] Allow users to keep old feature column. Update
 docstrings.

---
 tobac/tests/test_utils.py | 22 +++++++++++++++++-----
 tobac/utils.py            | 16 +++++++++++++---
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/tobac/tests/test_utils.py b/tobac/tests/test_utils.py
index ebeaf624..66b3a62a 100644
--- a/tobac/tests/test_utils.py
+++ b/tobac/tests/test_utils.py
@@ -1,6 +1,11 @@
-import numpy as np
+import datetime
+
 import tobac.utils as tb_utils
 import tobac.testing as tb_test
+
+import pandas as pd
+import pandas.testing as pd_test
+import numpy as np
 from scipy import fft
 
 
@@ -65,8 +70,6 @@ def test_combine_tobac_feats():
     testing to see if a single dataframe
     matches.
     """
-    import datetime
-    import pandas as pd
 
     single_feat_1 = tb_test.generate_single_feature(
         0, 0, start_date=datetime.datetime(2022, 1, 1, 0, 0), frame_start=0
@@ -77,6 +80,15 @@ def test_combine_tobac_feats():
 
     combined_feat = tb_utils.combine_tobac_feats([single_feat_1, single_feat_2])
 
-    tot_feat = tb_test.generate_single_feature(0, 0, num_frames=2, frame_start=0)
+    tot_feat = tb_test.generate_single_feature(
+        0, 0, spd_h1=1, spd_h2=1, num_frames=2, frame_start=0
+    )
+
+    pd_test.assert_frame_equal(combined_feat, tot_feat)
 
-    assert combined_feat.equals(tot_feat)
+    # Now try preserving the old feature numbers.
+    combined_feat = tb_utils.combine_tobac_feats(
+        [single_feat_1, single_feat_2], preserve_old_feat_nums="old_feat_column"
+    )
+    assert np.all(list(combined_feat["old_feat_column"].values) == [1, 1])
+    assert np.all(list(combined_feat["feature"].values) == [1, 2])
diff --git a/tobac/utils.py b/tobac/utils.py
index 148c4fcb..d9334239 100644
--- a/tobac/utils.py
+++ b/tobac/utils.py
@@ -896,7 +896,7 @@ def spectral_filtering(
         return filtered_field
 
 
-def combine_tobac_feats(list_of_feats):
+def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None):
     """Function to combine a list of tobac feature detection dataframes
     into one combined dataframe that can be used for tracking
     or segmentation.
@@ -905,12 +905,18 @@ def combine_tobac_feats(list_of_feats):
     ----------
     list_of_feats: array-like of Pandas DataFrames
         A list of dataframes (generated, for example, by
-        running feature detection on multiple nodes)
+        running feature detection on multiple nodes).
+
+    preserve_old_feat_nums: str or None
+        The column name to preserve old feature numbers in. If None, these
+        old numbers will be deleted. Users may want to enable this feature
+        if they have already run segmentation on the separate dataframes,
+        and that segmentation therefore used the old feature numbers.
 
     Returns
     -------
     pd.DataFrame
-        One combined DataFrame
+        One combined DataFrame.
 
     """
     import pandas as pd
@@ -923,6 +929,10 @@ def combine_tobac_feats(list_of_feats):
     all_times = sorted(combined_df["time"].unique())
     # Loop through current times
     start_feat_num = combined_df["feature"].min()
+    # Save the old feature numbers if requested.
+    if preserve_old_feat_nums is not None:
+        combined_df[preserve_old_feat_nums] = combined_df["feature"]
+
     for frame_num, curr_time in enumerate(all_times):
         # renumber the frame number
         combined_df.loc[combined_df["time"] == curr_time, "frame"] = frame_num