audeering · maxschmitt · Jun 4, 2024 · Apr 29, 2024 · Apr 29, 2024 · Apr 29, 2024
diff --git a/audinterface/core/segment.py b/audinterface/core/segment.py
@@ -505,6 +505,103 @@ def process_index(
 
         return audformat.segmented_index(files, starts, ends)
 
+    def process_table(
+        self,
+        table: typing.Union[pd.Series, pd.DataFrame],
+        *,
+        root: str = None,
+        cache_root: str = None,
+        process_func_args: typing.Dict[str, typing.Any] = None,
+    ) -> typing.Union[pd.Series, pd.DataFrame]:
+        r"""Segment files or segments from a table.
+
+        Every row of ``table`` is replaced
+        by one row per segment found for it,
+        each carrying the labels of the original row.
+        Rows for which no segment is found are dropped.
+        The times of the found segments are shifted
+        by the start time of the row they belong to.
+
+        If ``cache_root`` is not ``None``,
+        a hash value is created from the index
+        using :func:`audformat.utils.hash` and
+        the result is stored as
+        ``<cache_root>/<hash>.pkl``.
+        When called again with the same index,
+        results will be read from the cached file.
+
+        Args:
+            table: ``pd.Series`` or ``pd.DataFrame``
+                with an index conform to audformat_
+            root: root folder to expand relative file paths
+            cache_root: cache folder (see description)
+            process_func_args: (keyword) arguments passed on
+                to the processing function.
+                They will temporarily overwrite
+                the ones stored in
+                :attr:`audinterface.Segment.process.process_func_args`
+
+        Returns:
+            Segmented table with an index conform to audformat_.
+            It has the same type as ``table``
+            and keeps its name, columns and dtypes.
+            A ``table`` with an empty index is returned unchanged
+
+        Raises:
+            ValueError: if table has a wrong type
+            RuntimeError: if sampling rates do not match
+            RuntimeError: if channel selection is invalid
+
+        .. _audformat: https://audeering.github.io/audformat/data-format.html
+
+        """
+        if not isinstance(table, (pd.Series, pd.DataFrame)):
+            raise ValueError("table has to be pd.Series or pd.DataFrame")
+
+        index = audformat.utils.to_segmented_index(table.index)
+        utils.assert_index(index)
+
+        # Nothing to segment, hand back the input as is
+        if index.empty:
+            return table
+
+        y = self.process.process_index(
+            index,
+            preserve_index=False,
+            root=root,
+            cache_root=cache_root,
+            process_func_args=process_func_args,
+        )
+
+        # The j-th entry of ``y`` is expected to hold
+        # the segments found for the j-th row of ``table``
+        files = []
+        starts = []
+        ends = []
+        counts = []
+        for (file, start, _), segments in y.items():
+            files.extend([file] * len(segments))
+            # shift segment times by the start of the original row
+            starts.extend(segments.get_level_values("start") + start)
+            ends.extend(segments.get_level_values("end") + start)
+            counts.append(len(segments))
+
+        # Repeat every row once per segment by position.
+        # Selecting with ``iloc`` keeps name, columns and dtypes,
+        # whereas stacking the row values in a numpy array
+        # upcasts mixed numeric columns
+        # (e.g. int64 next to float64 loses precision above 2**53).
+        # Rows without segments get a count of 0 and drop out
+        rows = np.repeat(
+            np.arange(len(counts)),
+            np.asarray(counts, dtype=np.intp),
+        )
+
+        result = table.iloc[rows]
+        result.index = audformat.segmented_index(files, starts, ends)
+
+        return result
+
     def process_signal(
         self,
         signal: np.ndarray,

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -391,6 +391,22 @@ would be a voice activity detection algorithm.
     idx = interface.process_file(files[0])
     idx
 
+Sometimes a table (i.e., a ``pd.Series``
+or ``pd.DataFrame``) needs to be segmented while the labels
+of the original segments are kept. For this,
+:class:`audinterface.Segment` provides the dedicated method
+``process_table()``. This method is useful if a
+segmentation (e.g., voice activity detection) is
+performed on an already labelled dataset in order
+to do data augmentation or teacher-student training,
+improving model performance for shorter chunks.
+
+.. jupyter-execute::
+
+    table = pd.DataFrame({"label": [i*2 for i in range(len(index))]}, index=index)
+    table_segmented = interface.process_table(table)
+    table_segmented
+
 
 Special processing function arguments
 -------------------------------------

diff --git a/tests/test_segment.py b/tests/test_segment.py
@@ -132,7 +132,7 @@ def test_folder_default_process_func(tmpdir, num_workers, multiprocessing):
 
 
 @pytest.mark.parametrize("num_workers", [1, 2, None])
-def test_index(tmpdir, num_workers):
+def test_index_and_table(tmpdir, num_workers):
     def process_func(signal, sampling_rate):
         duration = pd.to_timedelta(signal.shape[-1] / sampling_rate, unit="s")
         return audinterface.utils.signal_index(
@@ -156,12 +156,19 @@ def process_func(signal, sampling_rate):
     path = os.path.join(root, file)
     af.write(path, signal, sampling_rate)
 
-    # empty index
+    # empty index and table
     index = audformat.segmented_index()
     result = segment.process_index(index)
     assert result.empty
     result = segment.process_signal_from_index(signal, sampling_rate, index)
     assert result.empty
+    table = audformat.Table(index)
+    result = segment.process_table(table.get())
+    assert result.index.empty
+
+    # non-table object for process_table()
+    with pytest.raises(ValueError):
+        segment.process_table(index)
 
     # segmented index without file level
     index = audinterface.utils.signal_index(
@@ -191,6 +198,24 @@ def process_func(signal, sampling_rate):
     result = segment.process_signal_from_index(signal, sampling_rate, index)
     pd.testing.assert_index_equal(result, expected)
 
+    # segmented index with absolute paths: series and dataframe
+    table = audformat.Table(index)
+    table["values"] = audformat.Column()
+    table.set({"values": [0, 1, 2]})
+    expected_series = pd.Series(
+        table.get()["values"].values, index=expected, name="values"
+    )
+    result = segment.process_table(table.get()["values"])
+    pd.testing.assert_series_equal(result, expected_series)
+    table_df = table.copy()
+    table_df["string"] = audformat.Column()
+    table_df.set({"string": ["a", "b", "c"]})
+    expected_dataframe = pd.DataFrame(
+        table_df.get().values, index=expected, columns=["values", "string"]
+    )
+    result = segment.process_table(table_df.get())
+    pd.testing.assert_frame_equal(result, expected_dataframe)
+
     # filewise index with absolute paths
     index = pd.Index([path], name="file")
     expected = audformat.segmented_index(path, "0.1s", "2.9s")
@@ -199,6 +224,24 @@ def process_func(signal, sampling_rate):
     result = segment.process_signal_from_index(signal, sampling_rate, index)
     pd.testing.assert_index_equal(result, expected)
 
+    # filewise index with absolute paths: series and dataframe
+    table = audformat.Table(index)
+    table["values"] = audformat.Column()
+    table.set({"values": [5]})
+    expected_series = pd.Series(
+        table.get()["values"].values, index=expected, name="values"
+    )
+    result = segment.process_table(table.get()["values"])
+    pd.testing.assert_series_equal(result, expected_series)
+    table_df = table.copy()
+    table_df["string"] = audformat.Column()
+    table_df.set({"string": ["d"]})
+    expected_dataframe = pd.DataFrame(
+        table_df.get().values, index=expected, columns=["values", "string"]
+    )
+    result = segment.process_table(table_df.get())
+    pd.testing.assert_frame_equal(result, expected_dataframe)
+
     # segmented index with relative paths
     index = audformat.segmented_index(
         [file] * 3,
@@ -215,6 +258,24 @@ def process_func(signal, sampling_rate):
     result = segment.process_signal_from_index(signal, sampling_rate, index)
     pd.testing.assert_index_equal(result, expected)
 
+    # segmented index with relative paths: series and dataframe
+    table = audformat.Table(index)
+    table["values"] = audformat.Column()
+    table.set({"values": [0, 1, 2]})
+    expected_series = pd.Series(
+        table.get()["values"].values, index=expected, name="values"
+    )
+    result = segment.process_table(table.get()["values"], root=root)
+    pd.testing.assert_series_equal(result, expected_series)
+    table_df = table.copy()
+    table_df["string"] = audformat.Column()
+    table_df.set({"string": ["a", "b", "c"]})
+    expected_dataframe = pd.DataFrame(
+        table_df.get().values, index=expected, columns=["values", "string"]
+    )
+    result = segment.process_table(table_df.get(), root=root)
+    pd.testing.assert_frame_equal(result, expected_dataframe)
+
     # filewise index with relative paths
     index = pd.Index([file], name="file")
     expected = audformat.segmented_index(file, "0.1s", "2.9s")
@@ -223,7 +284,25 @@ def process_func(signal, sampling_rate):
     result = segment.process_signal_from_index(signal, sampling_rate, index)
     pd.testing.assert_index_equal(result, expected)
 
-    # empty index returned by process func
+    # filewise index with relative paths: series and dataframe
+    table = audformat.Table(index)
+    table["values"] = audformat.Column()
+    table.set({"values": [5]})
+    expected_series = pd.Series(
+        table.get()["values"].values, index=expected, name="values"
+    )
+    result = segment.process_table(table.get()["values"], root=root)
+    pd.testing.assert_series_equal(result, expected_series)
+    table_df = table.copy()
+    table_df["string"] = audformat.Column()
+    table_df.set({"string": ["d"]})
+    expected_dataframe = pd.DataFrame(
+        table_df.get().values, index=expected, columns=["values", "string"]
+    )
+    result = segment.process_table(table_df.get(), root=root)
+    pd.testing.assert_frame_equal(result, expected_dataframe)
+
+    # empty index / series / dataframe returned by process func
 
     def process_func(x, sr):
         return audinterface.utils.signal_index()
@@ -241,6 +320,107 @@ def process_func(x, sr):
     result = segment.process_index(index)
     pd.testing.assert_index_equal(result, expected)
 
+    table = pd.Series([0], index)
+    expected_series = pd.Series([], expected, dtype=np.int64)
+    result = segment.process_table(table)
+    pd.testing.assert_series_equal(result, expected_series)
+
+    table_df = pd.DataFrame([0], index, columns=["col"])
+    expected_df = pd.DataFrame([], expected, columns=["col"], dtype=np.int64)
+    result = segment.process_table(table_df)
+    pd.testing.assert_frame_equal(result, expected_df)
+
+    # correct assignment of labels if output has more segments
+    def process_func_increase(signal, sampling_rate, chunk_len=0.4):
+        duration = signal.shape[-1] / sampling_rate
+        chunks = []
+        for i in range(int(duration // chunk_len) + 1):
+            chunks.append((i * chunk_len, np.min([(i + 1) * chunk_len, duration])))
+        index = pd.MultiIndex.from_tuples(
+            [
+                (
+                    pd.Timedelta(start, unit="s"),
+                    pd.Timedelta(end, unit="s"),
+                )
+                for start, end in chunks
+            ],
+            names=["start", "end"],
+        )
+        return index
+
+    segment = audinterface.Segment(
+        process_func=process_func_increase,
+        sampling_rate=None,
+        resample=False,
+        num_workers=num_workers,
+        verbose=False,
+    )
+    index = audformat.segmented_index(
+        [path] * 3,
+        pd.timedelta_range("0s", "2s", 3),
+        pd.timedelta_range("1s", "3s", 3),
+    )
+    expected_index = audformat.segmented_index(
+        [path] * 9,
+        [
+            pd.to_timedelta("0.0s"),
+            pd.to_timedelta("0.4s"),
+            pd.to_timedelta("0.8s"),
+            pd.to_timedelta("1.0s"),
+            pd.to_timedelta("1.4s"),
+            pd.to_timedelta("1.8s"),
+            pd.to_timedelta("2.0s"),
+            pd.to_timedelta("2.4s"),
+            pd.to_timedelta("2.8s"),
+        ],
+        [
+            pd.to_timedelta("0.4s"),
+            pd.to_timedelta("0.8s"),
+            pd.to_timedelta("1.0s"),
+            pd.to_timedelta("1.4s"),
+            pd.to_timedelta("1.8s"),
+            pd.to_timedelta("2.0s"),
+            pd.to_timedelta("2.4s"),
+            pd.to_timedelta("2.8s"),
+            pd.to_timedelta("3.0s"),
+        ],
+    )
+    expected_values = [
+        [0, "a"],
+        [0, "a"],
+        [0, "a"],
+        [1, "b"],
+        [1, "b"],
+        [1, "b"],
+        [2, "c"],
+        [2, "c"],
+        [2, "c"],
+    ]
+
+    expected_df = pd.DataFrame(
+        expected_values, index=expected_index, columns=["values", "string"]
+    )
+    expected_series = expected_df["values"]
+
+    table_series = pd.Series(
+        np.array([0, 1, 2], dtype=np.int64), index=index, name="values"
+    )
+    result_series = segment.process_table(table_series)
+    pd.testing.assert_series_equal(result_series, expected_series)
+
+    table_df = pd.DataFrame(
+        {"values": np.array([0, 1, 2], dtype=np.int64), "string": ["a", "b", "c"]},
+        index=index,
+    )
+    result_df = segment.process_table(table_df)
+    pd.testing.assert_frame_equal(result_df, expected_df)
+
+    # single-column dataframe
+    table_df1 = pd.DataFrame(table_series)
+    expected_df1 = pd.DataFrame(expected_series)
+    result_df1 = segment.process_table(table_df1)
+    pd.testing.assert_frame_equal(result_df1, expected_df1)
+
 
 @pytest.mark.parametrize(
     "signal, sampling_rate, segment_func, result",