Add simulate_rir_ism method for room impulse response simulation (pytorch#2880)

Summary: replicate of pytorch#2644 Pull Request resolved: pytorch#2880 Reviewed By: mthrok Differential Revision: D41633911 Pulled By: nateanl fbshipit-source-id: 73cf145d75c389e996aafe96571ab86dc21f86e5
rgt-yncrea · Feb 14, 2023 · 8c5c9a9 · 8c5c9a9
1 parent 3f02b89
commit 8c5c9a9
Show file tree

Hide file tree

Showing 20 changed files with 705 additions and 8 deletions.
diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh
@@ -72,7 +72,7 @@ fi
 (
     set -x
     conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20'
-    pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag
+    pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics
 )
 # Install fairseq
 git clone https://github.com/pytorch/fairseq

diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh
@@ -90,7 +90,8 @@ esac
         unidecode \
         'protobuf<4.21.0' \
         demucs \
-        tinytag
+        tinytag \
+        pyroomacoustics
 )
 # Install fairseq
 git clone https://github.com/pytorch/fairseq

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -54,6 +54,7 @@ endif()
 # Options
 option(BUILD_SOX "Build libsox statically" ON)
 option(BUILD_KALDI "Build kaldi statically" ON)
+option(BUILD_RIR "Enable RIR simulation" ON)
 option(BUILD_RNNT "Enable RNN transducer" ON)
 option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON)
 option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)

diff --git a/docs/source/prototype.functional.rst b/docs/source/prototype.functional.rst
@@ -22,3 +22,12 @@ DSP
    oscillator_bank
    sinc_impulse_response
    frequency_impulse_response
+
+Room Impulse Response Simulation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autosummary::
+   :toctree: generated
+   :nosignatures:
+
+   simulate_rir_ism
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
@@ -504,3 +504,27 @@ @inproceedings{valk2021voxlingua107
   year={2021},
   organization={IEEE}
 }
+@inproceedings{scheibler2018pyroomacoustics,
+  title={Pyroomacoustics: A python package for audio room simulation and array processing algorithms},
+  author={Scheibler, Robin and Bezzam, Eric and Dokmani{\'c}, Ivan},
+  booktitle={2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
+  pages={351--355},
+  year={2018},
+  organization={IEEE}
+}
+@article{allen1979image,
+  title={Image method for efficiently simulating small-room acoustics},
+  author={Allen, Jont B and Berkley, David A},
+  journal={The Journal of the Acoustical Society of America},
+  volume={65},
+  number={4},
+  pages={943--950},
+  year={1979},
+  publisher={Acoustical Society of America}
+}
+@misc{wiki:Absorption_(acoustics),
+   author = "{Wikipedia contributors}",
+   title = "Absorption (acoustics) --- {W}ikipedia{,} The Free Encyclopedia",
+   url = "https://en.wikipedia.org/wiki/Absorption_(acoustics)",
+   note = "[Online]"
+ }
diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py
@@ -13,6 +13,7 @@
     skipIfNoMacOS,
     skipIfNoModule,
     skipIfNoQengine,
+    skipIfNoRIR,
     skipIfNoSox,
     skipIfPy310,
     skipIfRocm,
@@ -47,6 +48,7 @@
     "skipIfNoMacOS",
     "skipIfNoModule",
     "skipIfNoKaldi",
+    "skipIfNoRIR",
     "skipIfNoSox",
     "skipIfNoSoxBackend",
     "skipIfRocm",

diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py
@@ -225,6 +225,11 @@ def skipIfNoModule(module, display_name=None):
     reason="Kaldi features are not available.",
     key="NO_KALDI",
 )
+# Test decorator for cases that need the RIR (room impulse response) features of the
+# torchaudio extension; built by `_skipIf` from the negated `_IS_RIR_AVAILABLE` flag.
+skipIfNoRIR = _skipIf(
+    not torchaudio._extension._IS_RIR_AVAILABLE,
+    reason="RIR features are not available.",
+    key="NO_RIR",
+)
 skipIfNoCtcDecoder = _skipIf(
     not is_ctc_decoder_available(),
     reason="CTC decoder not available.",

diff --git a/test/torchaudio_unittest/prototype/functional/functional_cpu_test.py b/test/torchaudio_unittest/prototype/functional/functional_cpu_test.py
@@ -1,7 +1,7 @@
 import torch
 from torchaudio_unittest.common_utils import PytorchTestCase
 
-from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl
+from .functional_test_impl import Functional64OnlyTestImpl, FunctionalCPUOnlyTestImpl, FunctionalTestImpl
 
 
 class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
@@ -17,3 +17,13 @@ class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase):
 class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase):
     dtype = torch.float64
     device = torch.device("cpu")
+
+
+class FunctionalCPUOnlyFloat32Test(FunctionalCPUOnlyTestImpl, PytorchTestCase):
+    # Runs the tests defined in FunctionalCPUOnlyTestImpl with float32 tensors on the CPU device.
+    dtype = torch.float32
+    device = torch.device("cpu")
+
+
+class FunctionalCPUOnlyFloat64Test(FunctionalCPUOnlyTestImpl, PytorchTestCase):
+    # Runs the tests defined in FunctionalCPUOnlyTestImpl with float64 tensors on the CPU device.
+    dtype = torch.float64
+    device = torch.device("cpu")
diff --git a/test/torchaudio_unittest/prototype/functional/functional_test_impl.py b/test/torchaudio_unittest/prototype/functional/functional_test_impl.py
@@ -1,7 +1,12 @@
+from torchaudio._internal import module_utils as _mod_utils
+
+if _mod_utils.is_module_available("pyroomacoustics"):
+    import pyroomacoustics as pra
+
 import torch
 import torchaudio.prototype.functional as F
 from parameterized import param, parameterized
-from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
+from torchaudio_unittest.common_utils import nested_params, skipIfNoModule, skipIfNoRIR, TestBaseMixin
 
 from .dsp_utils import freq_ir as freq_ir_np, oscillator_bank as oscillator_bank_np, sinc_ir as sinc_ir_np
 
@@ -424,3 +429,83 @@ def _debug_plot():
         except AssertionError:
             _debug_plot()
             raise
+
+
+@skipIfNoModule("pyroomacoustics")
+@skipIfNoRIR
+class FunctionalCPUOnlyTestImpl(TestBaseMixin):
+    """Compare ``simulate_rir_ism`` against impulse responses computed by pyroomacoustics."""
+
+    def _pra_rir(self, room_dim, source, mic_array, max_order, materials, channel):
+        """Simulate the room with pyroomacoustics and return the zero-padded RIRs.
+
+        The returned Tensor has dimensions `(channel, max_len)`, where ``max_len`` is the
+        longest per-microphone response produced by pyroomacoustics.
+        """
+        room = pra.ShoeBox(
+            room_dim.detach().numpy(),
+            fs=16000,
+            materials=materials,
+            max_order=max_order,
+            ray_tracing=False,
+            air_absorption=False,
+        )
+        # The microphone locations are handed over as a numpy array of dimension `(3, channel)`.
+        room.add_microphone_array(mic_array.transpose(0, 1).double().detach().numpy())
+        room.add_source(source.tolist())
+        room.compute_rir()
+        responses = [room.rir[ch][0] for ch in range(channel)]
+        longest = max(rir.shape[0] for rir in responses)
+        padded = torch.zeros(channel, longest, dtype=self.dtype, device=self.device)
+        for ch, rir in enumerate(responses):
+            padded[ch, 0 : rir.shape[0]] = torch.from_numpy(rir)
+        return padded
+
+    @parameterized.expand([(1,), (4,)])
+    def test_simulate_rir_ism_single_band(self, channel):
+        """simulate_rir_ism matches pyroomacoustics when every wall shares one absorption coefficient."""
+        tensor_kwargs = {"dtype": self.dtype, "device": self.device}
+        room_dim = torch.rand(3, **tensor_kwargs) + 5
+        mic_array = torch.rand(channel, 3, **tensor_kwargs) + 1
+        source = torch.rand(3, **tensor_kwargs) + 4
+        max_order = 3
+        # A float absorption means the same coefficient is used for every wall.
+        absorption = 0.5
+        # torchaudio result first, then the pyroomacoustics reference.
+        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, absorption)
+        expected = self._pra_rir(room_dim, source, mic_array, max_order, pra.Material(absorption), channel)
+
+        self.assertEqual(expected, actual, atol=1e-3, rtol=1e-3)
+
+    @parameterized.expand([(1,), (4,)])
+    def test_simulate_rir_ism_multi_band(self, channel):
+        """simulate_rir_ism matches pyroomacoustics when each wall has its own per-band coefficients."""
+        tensor_kwargs = {"dtype": self.dtype, "device": self.device}
+        room_dim = torch.rand(3, **tensor_kwargs) + 5
+        mic_array = torch.rand(channel, 3, **tensor_kwargs) + 1
+        source = torch.rand(3, **tensor_kwargs) + 4
+        max_order = 3
+        # absorption is a Tensor with dimensions `(7, 6)`: 6 walls, each with 7 absorption
+        # coefficients corresponding to the 7 octave bands listed below.
+        absorption = torch.rand(7, 6, **tensor_kwargs)
+        octave_bands = (125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0)
+        materials = {
+            wall: pra.Material(
+                {
+                    "coeffs": absorption[:, idx].reshape(-1).detach().numpy(),
+                    "center_freqs": list(octave_bands),
+                }
+            )
+            for idx, wall in enumerate(["west", "east", "south", "north", "floor", "ceiling"])
+        }
+        # pyroomacoustics reference first, then the torchaudio result.
+        expected = self._pra_rir(room_dim, source, mic_array, max_order, materials, channel)
+        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, absorption)
+        self.assertEqual(expected, actual, atol=1e-3, rtol=1e-3)
diff --git a/test/torchaudio_unittest/prototype/functional/torchscript_consistency_cpu_test.py b/test/torchaudio_unittest/prototype/functional/torchscript_consistency_cpu_test.py
@@ -1,7 +1,7 @@
 import torch
 from torchaudio_unittest.common_utils import PytorchTestCase
 
-from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl
+from .torchscript_consistency_test_impl import TorchScriptConsistencyCPUOnlyTestImpl, TorchScriptConsistencyTestImpl
 
 
 class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
@@ -12,3 +12,13 @@ class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, Pytor
 class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
     dtype = torch.float64
     device = torch.device("cpu")
+
+
+class TorchScriptConsistencyCPUOnlyFloat32Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase):
+    # Runs the tests defined in TorchScriptConsistencyCPUOnlyTestImpl with float32 tensors on the CPU device.
+    dtype = torch.float32
+    device = torch.device("cpu")
+
+
+class TorchScriptConsistencyCPUOnlyFloat64Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase):
+    # Runs the tests defined in TorchScriptConsistencyCPUOnlyTestImpl with float64 tensors on the CPU device.
+    dtype = torch.float64
+    device = torch.device("cpu")
diff --git a/test/torchaudio_unittest/prototype/functional/torchscript_consistency_test_impl.py b/test/torchaudio_unittest/prototype/functional/torchscript_consistency_test_impl.py
@@ -2,7 +2,8 @@
 
 import torch
 import torchaudio.prototype.functional as F
-from torchaudio_unittest.common_utils import TestBaseMixin, torch_script
+from parameterized import parameterized
+from torchaudio_unittest.common_utils import skipIfNoRIR, TestBaseMixin, torch_script
 
 
 class TorchScriptConsistencyTestImpl(TestBaseMixin):
@@ -62,3 +63,52 @@ def test_sinc_ir(self):
     def test_freq_ir(self):
         mags = torch.tensor([0, 0.5, 1.0], device=self.device, dtype=self.dtype)
         self._assert_consistency(F.frequency_impulse_response, (mags,))
+
+
+class TorchScriptConsistencyCPUOnlyTestImpl(TestBaseMixin):
+    """TorchScript consistency tests for ``simulate_rir_ism``."""
+
+    def _assert_consistency(self, func, inputs, shape_only=False):
+        """Assert that ``func`` and its TorchScript version produce the same output.
+
+        Args:
+            func: Callable to script and to compare against.
+            inputs: Positional arguments for ``func``. Tensors are moved to
+                ``self.device`` / ``self.dtype``; other values are passed unchanged.
+            shape_only: If ``True``, compare only the shapes of the two outputs.
+        """
+        inputs_ = []
+        for i in inputs:
+            if torch.is_tensor(i):
+                i = i.to(device=self.device, dtype=self.dtype)
+            inputs_.append(i)
+        ts_func = torch_script(func)
+
+        # Seed identically before each call so both runs see the same random state.
+        torch.random.manual_seed(40)
+        output = func(*inputs_)
+
+        torch.random.manual_seed(40)
+        ts_output = ts_func(*inputs_)
+
+        if shape_only:
+            ts_output = ts_output.shape
+            output = output.shape
+        self.assertEqual(ts_output, output)
+
+    # NOTE: `skipIfNoRIR` must sit below `parameterized.expand`. `expand` registers the
+    # generated test methods itself, so a decorator placed above it never wraps them.
+    @parameterized.expand([(1,), (4,)])
+    @skipIfNoRIR
+    def test_simulate_rir_ism_single_band(self, channel):
+        """Eager and scripted results agree when absorption is a single float."""
+        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
+        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
+        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
+        max_order = 3
+        absorption = 0.5
+        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
+        self._assert_consistency(
+            F.simulate_rir_ism,
+            (room_dim, source, mic_array, max_order, absorption, None, 81, center_frequency, 343.0, 16000.0),
+        )
+
+    # NOTE: see above — the skip decorator goes below `parameterized.expand` so that it
+    # is applied to every generated test.
+    @parameterized.expand([(1,), (4,)])
+    @skipIfNoRIR
+    def test_simulate_rir_ism_multi_band(self, channel):
+        """Eager and scripted results agree when absorption is a `(7, 6)` Tensor."""
+        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
+        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
+        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
+        max_order = 3
+        absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device)
+        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
+        self._assert_consistency(
+            F.simulate_rir_ism,
+            (room_dim, source, mic_array, max_order, absorption, None, 81, center_frequency, 343.0, 16000.0),
+        )
diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py
@@ -35,6 +35,7 @@ def _get_build(var, default=False):
 
 _BUILD_SOX = False if platform.system() == "Windows" else _get_build("BUILD_SOX", True)
 _BUILD_KALDI = False if platform.system() == "Windows" else _get_build("BUILD_KALDI", True)
+_BUILD_RIR = _get_build("BUILD_RIR", True)
 _BUILD_RNNT = _get_build("BUILD_RNNT", True)
 _BUILD_CTC_DECODER = _get_build("BUILD_CTC_DECODER", True)
 _USE_FFMPEG = _get_build("USE_FFMPEG", False)
@@ -116,6 +117,7 @@ def build_extension(self, ext):
             f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
             f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
             f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",
+            f"-DBUILD_RIR:BOOL={'ON' if _BUILD_RIR else 'OFF'}",
             f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}",
             f"-DBUILD_CTC_DECODER:BOOL={'ON' if _BUILD_CTC_DECODER else 'OFF'}",
             "-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON",

diff --git a/torchaudio/_extension/__init__.py b/torchaudio/_extension/__init__.py
@@ -20,6 +20,7 @@
     "_check_cuda_version",
     "_IS_TORCHAUDIO_EXT_AVAILABLE",
     "_IS_KALDI_AVAILABLE",
+    "_IS_RIR_AVAILABLE",
     "_SOX_INITIALIZED",
     "_FFMPEG_INITIALIZED",
 ]
@@ -33,16 +34,18 @@
 # In case of an error, we do not catch the failure as it suggests there is something
 # wrong with the installation.
 _IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
-# Kaldi features are implemented in _torchaudio extension, but it can be individually
+# Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually
 # turned on/off at build time. Available means that _torchaudio is loaded properly, and
-# Kaldi features are found there.
+# Kaldi or RIR features are found there.
+_IS_RIR_AVAILABLE = False
 _IS_KALDI_AVAILABLE = False
 if _IS_TORCHAUDIO_EXT_AVAILABLE:
     _load_lib("libtorchaudio")
 
     import torchaudio.lib._torchaudio  # noqa
 
     _check_cuda_version()
+    _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
     _IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available()
 
 
@@ -88,3 +91,11 @@
 )
 
 fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg
+
+# Guard for APIs that need the RIR extension: `no_op` when `_IS_RIR_AVAILABLE` is set,
+# otherwise whatever `fail_with_message` builds from the message below.
+fail_if_no_rir = (
+    no_op
+    if _IS_RIR_AVAILABLE
+    else fail_with_message(
+        "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
+    )
+)
diff --git a/torchaudio/csrc/CMakeLists.txt b/torchaudio/csrc/CMakeLists.txt
@@ -41,6 +41,11 @@ if(BUILD_RNNT)
   endif()
 endif()
 
+if(BUILD_RIR)
+  list(APPEND sources rir.cpp)
+  list(APPEND compile_definitions INCLUDE_RIR)
+endif()
+
 if(USE_CUDA)
   list(
     APPEND

diff --git a/torchaudio/csrc/pybind/pybind.cpp b/torchaudio/csrc/pybind/pybind.cpp
@@ -6,6 +6,7 @@ namespace {
 
 PYBIND11_MODULE(_torchaudio, m) {
   m.def("is_kaldi_available", &is_kaldi_available, "");
+  m.def("is_rir_available", &is_rir_available, "");
   m.def("cuda_version", &cuda_version, "");
 }