From c08117828fcd446b303f69ac7ba094160ec6d4d6 Mon Sep 17 00:00:00 2001 From: Andrea Zoppi Date: Mon, 11 Mar 2024 23:13:30 +0100 Subject: [PATCH] Added initial source code snapshot --- apps/aymo_ymf262_play.c | 575 +++++++ apps/meson.build | 22 + aymo.pc.in | 15 + contrib/meson.build | 26 + doc/.gitkeep | 0 include/aymo.h | 33 + include/aymo_cc.h | 189 +++ include/aymo_convert.h | 54 + include/aymo_convert_arm_neon.h | 66 + include/aymo_convert_none.h | 64 + include/aymo_convert_x86_avx2.h | 66 + include/aymo_convert_x86_sse41.h | 66 + include/aymo_cpu.h | 55 + include/aymo_cpu_arm.h | 43 + include/aymo_cpu_arm_neon.h | 53 + include/aymo_cpu_arm_neon_inline.h | 382 +++++ include/aymo_cpu_x86.h | 47 + include/aymo_cpu_x86_avx2.h | 46 + include/aymo_cpu_x86_avx2_inline.h | 428 +++++ include/aymo_cpu_x86_sse41.h | 46 + include/aymo_cpu_x86_sse41_inline.h | 426 +++++ include/aymo_file.h | 42 + include/aymo_score.h | 145 ++ include/aymo_score_avd.h | 89 + include/aymo_score_dro.h | 166 ++ include/aymo_score_imf.h | 131 ++ include/aymo_score_raw.h | 99 ++ include/aymo_sys_linux.h | 25 + include/aymo_sys_windows.h | 25 + include/aymo_tda8425.h | 45 + include/aymo_tda8425_arm_neon.h | 107 ++ include/aymo_tda8425_common.h | 84 + include/aymo_tda8425_none.h | 61 + include/aymo_tda8425_x86_avx2.h | 100 ++ include/aymo_tda8425_x86_sse41.h | 106 ++ include/aymo_wave.h | 85 + include/aymo_ym7128.h | 46 + include/aymo_ym7128_arm_neon.h | 93 + include/aymo_ym7128_common.h | 118 ++ include/aymo_ym7128_none.h | 61 + include/aymo_ym7128_x86_sse41.h | 93 + include/aymo_ymf262.h | 56 + include/aymo_ymf262_arm_neon.h | 333 ++++ include/aymo_ymf262_common.h | 230 +++ include/aymo_ymf262_none.h | 79 + include/aymo_ymf262_x86_avx.h | 333 ++++ include/aymo_ymf262_x86_avx2.h | 332 ++++ include/aymo_ymf262_x86_sse41.h | 332 ++++ include/meson.build | 11 + meson.build | 688 ++++++++ meson_options.txt | 8 + msvc-arm.txt | 14 + msvc-arm_env.bat | 4 + src/aymo.c | 35 + src/aymo_convert.c | 206 +++ src/aymo_convert_arm_neon.c | 821 +++++++++ src/aymo_convert_none.c | 177 ++ src/aymo_convert_x86_avx2.c | 335 ++++ src/aymo_convert_x86_sse41.c | 796 +++++++++ src/aymo_cpu.c | 38 + src/aymo_cpu_arm.c | 61 + src/aymo_cpu_x86.c | 119 ++ src/aymo_empty.c | 1 + src/aymo_file.c | 133 ++ src/aymo_score.c | 153 ++ src/aymo_score_avd.c | 174 ++ src/aymo_score_dro.c | 376 +++++ src/aymo_score_imf.c | 266 +++ src/aymo_score_raw.c | 231 +++ src/aymo_sys_linux.c | 19 + src/aymo_sys_windows.c | 71 + src/aymo_tda8425.c | 172 ++ src/aymo_tda8425_arm_neon.c | 504 ++++++ src/aymo_tda8425_common.c | 150 ++ src/aymo_tda8425_none.c | 148 ++ src/aymo_tda8425_x86_avx2.c | 499 ++++++ src/aymo_tda8425_x86_sse41.c | 512 ++++++ src/aymo_wave.c | 79 + src/aymo_ym7128.c | 148 ++ src/aymo_ym7128_arm_neon.c | 270 +++ src/aymo_ym7128_common.c | 192 +++ src/aymo_ym7128_none.c | 130 ++ src/aymo_ym7128_x86_sse41.c | 270 +++ src/aymo_ymf262.c | 250 +++ src/aymo_ymf262_arm_neon.c | 1688 ++++++++++++++++++ src/aymo_ymf262_common.c | 263 +++ src/aymo_ymf262_none.c | 200 +++ src/aymo_ymf262_x86_avx.c | 1691 +++++++++++++++++++ src/aymo_ymf262_x86_avx2.c | 1683 ++++++++++++++++++ src/aymo_ymf262_x86_sse41.c | 1691 +++++++++++++++++++ tests/aymo_testing.c | 110 ++ tests/aymo_testing.h | 54 + tests/aymo_testing_epilogue_inline.h | 41 + tests/meson.build | 392 +++++ tests/test_convert_arm_neon.c | 376 +++++ tests/test_convert_none.c | 371 ++++ tests/test_convert_prologue_inline.h | 296 ++++ tests/test_convert_x86_avx2.c | 376 +++++ tests/test_convert_x86_sse41.c | 376 
+++++ tests/test_tda8425_arm_neon_sweep.c | 31 + tests/test_tda8425_none_sweep.c | 27 + tests/test_tda8425_sweep_inline.h | 330 ++++ tests/test_tda8425_x86_avx2_sweep.c | 31 + tests/test_tda8425_x86_sse41_sweep.c | 31 + tests/test_ym7128_arm_neon_sweep.c | 31 + tests/test_ym7128_none_sweep.c | 27 + tests/test_ym7128_sweep_inline.h | 316 ++++ tests/test_ym7128_x86_sse41_sweep.c | 31 + tests/test_ymf262_arm_neon_compare.c | 77 + tests/test_ymf262_compare_epilogue_inline.h | 165 ++ tests/test_ymf262_compare_prologue_inline.h | 77 + tests/test_ymf262_none_compare.c | 73 + tests/test_ymf262_x86_avx2_compare.c | 170 ++ tests/test_ymf262_x86_avx_compare.c | 170 ++ tests/test_ymf262_x86_sse41_compare.c | 170 ++ 115 files changed, 25544 insertions(+) create mode 100644 apps/aymo_ymf262_play.c create mode 100644 apps/meson.build create mode 100644 aymo.pc.in create mode 100644 contrib/meson.build create mode 100644 doc/.gitkeep create mode 100644 include/aymo.h create mode 100644 include/aymo_cc.h create mode 100644 include/aymo_convert.h create mode 100644 include/aymo_convert_arm_neon.h create mode 100644 include/aymo_convert_none.h create mode 100644 include/aymo_convert_x86_avx2.h create mode 100644 include/aymo_convert_x86_sse41.h create mode 100644 include/aymo_cpu.h create mode 100644 include/aymo_cpu_arm.h create mode 100644 include/aymo_cpu_arm_neon.h create mode 100644 include/aymo_cpu_arm_neon_inline.h create mode 100644 include/aymo_cpu_x86.h create mode 100644 include/aymo_cpu_x86_avx2.h create mode 100644 include/aymo_cpu_x86_avx2_inline.h create mode 100644 include/aymo_cpu_x86_sse41.h create mode 100644 include/aymo_cpu_x86_sse41_inline.h create mode 100644 include/aymo_file.h create mode 100644 include/aymo_score.h create mode 100644 include/aymo_score_avd.h create mode 100644 include/aymo_score_dro.h create mode 100644 include/aymo_score_imf.h create mode 100644 include/aymo_score_raw.h create mode 100644 include/aymo_sys_linux.h create mode 100644 include/aymo_sys_windows.h create mode 100644 include/aymo_tda8425.h create mode 100644 include/aymo_tda8425_arm_neon.h create mode 100644 include/aymo_tda8425_common.h create mode 100644 include/aymo_tda8425_none.h create mode 100644 include/aymo_tda8425_x86_avx2.h create mode 100644 include/aymo_tda8425_x86_sse41.h create mode 100644 include/aymo_wave.h create mode 100644 include/aymo_ym7128.h create mode 100644 include/aymo_ym7128_arm_neon.h create mode 100644 include/aymo_ym7128_common.h create mode 100644 include/aymo_ym7128_none.h create mode 100644 include/aymo_ym7128_x86_sse41.h create mode 100644 include/aymo_ymf262.h create mode 100644 include/aymo_ymf262_arm_neon.h create mode 100644 include/aymo_ymf262_common.h create mode 100644 include/aymo_ymf262_none.h create mode 100644 include/aymo_ymf262_x86_avx.h create mode 100644 include/aymo_ymf262_x86_avx2.h create mode 100644 include/aymo_ymf262_x86_sse41.h create mode 100644 include/meson.build create mode 100644 meson.build create mode 100644 meson_options.txt create mode 100644 msvc-arm.txt create mode 100644 msvc-arm_env.bat create mode 100644 src/aymo.c create mode 100644 src/aymo_convert.c create mode 100644 src/aymo_convert_arm_neon.c create mode 100644 src/aymo_convert_none.c create mode 100644 src/aymo_convert_x86_avx2.c create mode 100644 src/aymo_convert_x86_sse41.c create mode 100644 src/aymo_cpu.c create mode 100644 src/aymo_cpu_arm.c create mode 100644 src/aymo_cpu_x86.c create mode 100644 src/aymo_empty.c create mode 100644 src/aymo_file.c create mode 100644 
src/aymo_score.c create mode 100644 src/aymo_score_avd.c create mode 100644 src/aymo_score_dro.c create mode 100644 src/aymo_score_imf.c create mode 100644 src/aymo_score_raw.c create mode 100644 src/aymo_sys_linux.c create mode 100644 src/aymo_sys_windows.c create mode 100644 src/aymo_tda8425.c create mode 100644 src/aymo_tda8425_arm_neon.c create mode 100644 src/aymo_tda8425_common.c create mode 100644 src/aymo_tda8425_none.c create mode 100644 src/aymo_tda8425_x86_avx2.c create mode 100644 src/aymo_tda8425_x86_sse41.c create mode 100644 src/aymo_wave.c create mode 100644 src/aymo_ym7128.c create mode 100644 src/aymo_ym7128_arm_neon.c create mode 100644 src/aymo_ym7128_common.c create mode 100644 src/aymo_ym7128_none.c create mode 100644 src/aymo_ym7128_x86_sse41.c create mode 100644 src/aymo_ymf262.c create mode 100644 src/aymo_ymf262_arm_neon.c create mode 100644 src/aymo_ymf262_common.c create mode 100644 src/aymo_ymf262_none.c create mode 100644 src/aymo_ymf262_x86_avx.c create mode 100644 src/aymo_ymf262_x86_avx2.c create mode 100644 src/aymo_ymf262_x86_sse41.c create mode 100644 tests/aymo_testing.c create mode 100644 tests/aymo_testing.h create mode 100644 tests/aymo_testing_epilogue_inline.h create mode 100644 tests/meson.build create mode 100644 tests/test_convert_arm_neon.c create mode 100644 tests/test_convert_none.c create mode 100644 tests/test_convert_prologue_inline.h create mode 100644 tests/test_convert_x86_avx2.c create mode 100644 tests/test_convert_x86_sse41.c create mode 100644 tests/test_tda8425_arm_neon_sweep.c create mode 100644 tests/test_tda8425_none_sweep.c create mode 100644 tests/test_tda8425_sweep_inline.h create mode 100644 tests/test_tda8425_x86_avx2_sweep.c create mode 100644 tests/test_tda8425_x86_sse41_sweep.c create mode 100644 tests/test_ym7128_arm_neon_sweep.c create mode 100644 tests/test_ym7128_none_sweep.c create mode 100644 tests/test_ym7128_sweep_inline.h create mode 100644 tests/test_ym7128_x86_sse41_sweep.c create mode 100644 tests/test_ymf262_arm_neon_compare.c create mode 100644 tests/test_ymf262_compare_epilogue_inline.h create mode 100644 tests/test_ymf262_compare_prologue_inline.h create mode 100644 tests/test_ymf262_none_compare.c create mode 100644 tests/test_ymf262_x86_avx2_compare.c create mode 100644 tests/test_ymf262_x86_avx_compare.c create mode 100644 tests/test_ymf262_x86_sse41_compare.c diff --git a/apps/aymo_ymf262_play.c b/apps/aymo_ymf262_play.c new file mode 100644 index 0000000..1f231f3 --- /dev/null +++ b/apps/aymo_ymf262_play.c @@ -0,0 +1,575 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
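+
+Renders an OPL3 (YMF262) score through the AYMO emulator and writes the output
+as 16-bit PCM (stereo or 4-channel) to stdout or to a WAVE file.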
+ +--- + +To play via shell pipe, run: + + - ALSA Play: + aymo_ymf262_play SCORE | aplay -c 2 -r 47916 -f S16_LE + + - VLC: + aymo_ymf262_play SCORE | vlc --demux=rawaud --rawaud-channels 2 --rawaud-samplerate 47916 - +*/ + +#include "aymo.h" +#include "aymo_cpu.h" +#include "aymo_file.h" +#include "aymo_score.h" +#include "aymo_score_avd.h" +#include "aymo_score_dro.h" +#include "aymo_score_imf.h" +#include "aymo_wave.h" +#include "aymo_ymf262.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(__WINDOWS__) || defined(__CYGWIN__)) + #include + #include + #ifndef _MSC_VER + #define _fileno(f) ((f)->_file) + #endif +#endif + +AYMO_CXX_EXTERN_C_BEGIN + + +struct app_args { + int argc; + char** argv; + + // App parameters + unsigned loops; + bool benchmark; + + // Score parameters + const char* score_path_cstr; // NULL or "-" for stdin + const char* score_type_cstr; // NULL uses score file extension + enum aymo_score_type score_type; + unsigned score_after; + + // Output parameters + const char* out_path_cstr; // NULL or "-" for stdout + uint32_t out_frame_length; + bool out_quad; + + // YMF262 parameters + const struct aymo_ymf262_vt* ymf262_vt; + bool ymf262_extensions; +}; + + +static int app_return; + +static struct app_args app_args; +static clock_t clock_start; +static clock_t clock_end; + +static void* score_data; +static size_t score_size; +static union app_scores { + struct aymo_score_instance base; + struct aymo_score_avd_instance avd; + struct aymo_score_dro_instance dro; + struct aymo_score_imf_instance imf; +} score; + +static struct aymo_ymf262_chip* chip; + +static bool out_stdout; +static FILE* out_file; +static int16_t out_buffer_default[4]; +static int16_t* out_buffer_ptr; +static uint32_t out_frame_length; +static struct aymo_wave_heading wave_head; + +static void* aymo_aligned_alloc(size_t size, size_t align) +{ + assert(align); + assert(size < (SIZE_MAX - align - align)); + + void* allocptr = calloc((size + align + align), 1u); + if (allocptr) { + uintptr_t alignaddr = ((uintptr_t)(void*)allocptr + align); + uintptr_t offset = (alignaddr % align); + alignaddr += ((align - offset) % align); + void* alignptr = (void*)alignaddr; + uintptr_t refaddr = (alignaddr - sizeof(void*)); + void** refptr = (void**)(void*)refaddr; + *refptr = allocptr; + return alignptr; + } + return NULL; +} + + +static void aymo_aligned_free(void* alignptr) +{ + if (alignptr) { + uintptr_t alignaddr = (uintptr_t)alignptr; + uintptr_t refaddr = (alignaddr - sizeof(void*)); + void** refptr = (void**)(void*)refaddr; + void* allocptr = *refptr; + free(allocptr); + } +} + + +static int app_boot(void) +{ + app_return = 2; + + aymo_cpu_boot(); + aymo_ymf262_boot(); + + score_data = NULL; + score_size = 0u; + memset(&score, 0, sizeof(score)); + + chip = NULL; + + out_file = NULL; + out_buffer_ptr = out_buffer_default; + out_frame_length = 1u; + + return 0; +} + + +static int app_args_init(int argc, char** argv) +{ + memset(&app_args, 0, sizeof(app_args)); + + app_args.argc = argc; + app_args.argv = argv; + + app_args.loops = 1u; + + app_args.score_type = aymo_score_type_unknown; + + app_args.out_frame_length = 1u; + + app_args.ymf262_vt = aymo_ymf262_get_best_vt(); + + return 0; +} + + +static int app_usage(void) +{ + printf("TODO: USAGE\n"); + + return -1; // help +} + + +static int app_args_parse(void) +{ + int argi; + + for (argi = 1; argi < app_args.argc; ++argi) { + const char* name = app_args.argv[argi]; + + if (!strcmp(name, "--")) { + ++argi; 
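+            // "--" ends option parsing; any remaining arguments are taken as
+            // positional (score path, then output path).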
+ break; + } + + // Unary options + if (!strcmp(name, "--help") || !strcmp(name, "-h")) { + return app_usage(); + } + if (!strcmp(name, "--benchmark")) { + app_args.benchmark = true; + continue; + } + if (!strcmp(name, "--out-quad")) { + app_args.out_quad = true; + continue; + } + if (!strcmp(name, "--ymf62-extensions")) { + app_args.ymf262_extensions = true; + continue; + } + + // Binary options + if (argi >= (app_args.argc - 1)) { + break; + } + if (!strcmp(name, "--loops")) { + const char* text = app_args.argv[++argi]; + errno = 0; + app_args.loops = strtoul(text, NULL, 0); + if (errno) { + perror(name); + return 1; + } + continue; + } + if (!strcmp(name, "--score-after")) { + const char* text = app_args.argv[++argi]; + errno = 0; + app_args.score_after = strtoul(text, NULL, 0); + if (errno) { + perror(name); + return 1; + } + continue; + } + if (!strcmp(name, "--score-type")) { + const char* value = app_args.argv[++argi]; + app_args.score_type = aymo_score_ext_to_type(value); + if (app_args.score_type == aymo_score_type_unknown) { + fprintf(stderr, "ERROR: Unknown score type \"%s\"\n", value); + return 1; + } + continue; + } + if (!strcmp(name, "--cpu-ext")) { + const char* text = app_args.argv[++argi]; + app_args.ymf262_vt = aymo_ymf262_get_vt(text); + if (!app_args.ymf262_vt) { + fprintf(stderr, "ERROR: Unsupported CPU extensions tag: \"%s\"\n", text); + return 1; + } + continue; + } + if (!strcmp(name, "--buffer-size")) { + const char* text = app_args.argv[++argi]; + errno = 0; + app_args.out_frame_length = strtoul(text, NULL, 0); + if (errno) { + perror(name); + return 1; + } + continue; + } + break; + } + + if (argi == (app_args.argc - 2)) { + const char* text = app_args.argv[argi++]; + if (!strcmp(text, "-")) { + text = NULL; + } + app_args.score_path_cstr = text; + } + + if (argi == (app_args.argc - 1)) { + const char* text = app_args.argv[argi++]; + if (!strcmp(text, "-")) { + text = NULL; + } + + if (app_args.score_path_cstr) { + app_args.out_path_cstr = text; + } + else { + app_args.score_path_cstr = text; + } + } + + + if (app_args.score_type == aymo_score_type_unknown) { + const char* text = app_args.score_path_cstr; + if (text) { + const char* ext = strrchr(text, '.'); + if (ext) { + app_args.score_type = aymo_score_ext_to_type(ext + 1); + } + } + if (app_args.score_type == aymo_score_type_unknown) { + fprintf(stderr, "ERROR: Unsupported score type of \"%s\"\n", (text ? text : "")); + return 1; + } + } + + if (argi < app_args.argc) { + fprintf(stderr, "ERROR: Unknown options after #%d = \"%s\"\n", argi, app_args.argv[argi]); + return 1; + } + + return 0; +} + + +static int app_setup(void) +{ + if (aymo_file_load(app_args.score_path_cstr, &score_data, &score_size)) { + return 1; + } + score.base.vt = aymo_score_type_to_vt(app_args.score_type); + if (!score.base.vt) { + fprintf(stderr, "ERROR: Unsupported score type ID: %d\n", (int)app_args.score_type); + return 1; + } + aymo_score_ctor(&score.base); + if (aymo_score_load(&score.base, score_data, (uint32_t)score_size)) { + fprintf(stderr, "ERROR: Cannot load score \"%s\"\n", app_args.score_path_cstr); + return 1; + } + + size_t chip_size = app_args.ymf262_vt->get_sizeof(); + void* chip_alignptr = aymo_aligned_alloc(chip_size, 32u); + if (!chip_alignptr) { + perror("aymo_aligned_alloc(chip_size)"); + return 2; + } + chip = (struct aymo_ymf262_chip*)chip_alignptr; + chip->vt = app_args.ymf262_vt; + aymo_ymf262_ctor(chip); + + uint32_t out_channels = (app_args.out_quad ? 
4u : 2u); + out_frame_length = app_args.out_frame_length; + if (out_frame_length < 1u) { + out_frame_length = 1u; + } + if (out_frame_length > (UINT32_MAX / (sizeof(int16_t) * out_channels))) { + out_frame_length = (UINT32_MAX / (sizeof(int16_t) * out_channels)); + } + uint32_t out_buffer_size = (out_frame_length * sizeof(int16_t) * out_channels); + out_buffer_ptr = (int16_t*)malloc(out_buffer_size); + if (!out_buffer_ptr) { + perror("malloc(out_buffer_size)"); + return 2; + } + + if (app_args.benchmark) { + out_stdout = false; + out_file = NULL; + } + else if (!app_args.out_path_cstr || !strcmp(app_args.out_path_cstr, "") || !strcmp(app_args.out_path_cstr, "-")) { + out_stdout = true; + out_file = stdout; + + #if (defined(__WINDOWS__) || defined(__CYGWIN__)) + errno = 0; + _setmode(_fileno(stdout), O_BINARY); + if (errno) { + perror("_setmode(stdout)"); + return 2; + } + #endif + } + else { + out_stdout = false; + out_file = fopen(app_args.out_path_cstr, "wb"); + if (!out_file) { + perror(app_args.out_path_cstr); + return 1; + } + + aymo_wave_heading_setup( + &wave_head, + AYMO_WAVE_FMT_TYPE_PCM, + (uint16_t)out_channels, + 16u, + AYMO_YMF262_SAMPLE_RATE, + 0u + ); + if (fwrite(&wave_head, sizeof(wave_head), 1u, out_file) != 1u) { + perror("fwrite(wave_head)"); + return 2; + } + } + + return 0; +} + + +static void app_teardown(void) +{ + if (chip) { + aymo_ymf262_dtor(chip); + aymo_aligned_free(chip); + } + chip = NULL; + + if (score.base.vt) { + aymo_score_unload(&score.base); + aymo_score_dtor(&score.base); + } + aymo_file_unload(score_data); + score_data = NULL; + + if (!out_stdout && out_file) { + fclose(out_file); + } + out_file = NULL; + + if (out_buffer_ptr && (out_buffer_ptr != out_buffer_default)) { + free(out_buffer_ptr); + } + out_buffer_ptr = NULL; + out_frame_length = 0u; +} + + +static int app_run(void) +{ + size_t out_channels = (app_args.out_quad ? 
4u : 2u); + size_t out_sample_length = ((size_t)out_frame_length * out_channels); + uint32_t frame_total = 0u; + unsigned pending_loops = (app_args.loops - 1u); + unsigned score_after = app_args.score_after; + + aymo_ymf262_generate_i16x2_f aymo_ymf262_generate_i16; + if (app_args.out_quad) { + aymo_ymf262_generate_i16 = aymo_ymf262_generate_i16x4; + } + else { + aymo_ymf262_generate_i16 = aymo_ymf262_generate_i16x2; + } + + struct aymo_score_status* status = aymo_score_get_status(&score.base); + bool playing = !(status->flags & AYMO_SCORE_FLAG_EOF); + + clock_start = clock(); + + while (playing) { + int16_t* buffer_ptr = out_buffer_ptr; + uint32_t avail_length = out_frame_length; + uint32_t delay_length = status->delay; + + while (avail_length) { + if (delay_length > avail_length) { + delay_length = avail_length; + } + + aymo_ymf262_generate_i16(chip, delay_length, buffer_ptr); + buffer_ptr += (delay_length * out_channels); + frame_total++; + + aymo_score_tick(&score.base, delay_length); + avail_length -= delay_length; + + if (status->flags & AYMO_SCORE_FLAG_EVENT) { + aymo_ymf262_enqueue_write(chip, status->address, status->value); + } + + while (!(status->flags & (AYMO_SCORE_FLAG_DELAY | AYMO_SCORE_FLAG_EOF))) { + aymo_score_tick(&score.base, 0u); + + if (status->flags & AYMO_SCORE_FLAG_EVENT) { + aymo_ymf262_enqueue_write(chip, status->address, status->value); + } + } + + if (!(status->flags & AYMO_SCORE_FLAG_EOF)) { + delay_length = status->delay; + } + else if (app_args.loops) { + if (pending_loops) { + pending_loops--; + aymo_score_restart(&score.base); + } + else if (score_after) { + status->flags |= AYMO_SCORE_FLAG_DELAY; + status->delay = score_after; + delay_length = status->delay; + score_after = 0u; + } + else { + playing = false; + } + } + else { + aymo_score_restart(&score.base); + } + } + + if (out_file) { + if (fwrite(out_buffer_ptr, sizeof(int16_t), out_sample_length, out_file) != out_sample_length) { + perror("fwrite(out_buffer)"); + return 2; + } + } + } + + if (out_file && !out_stdout) { + if (fseek(out_file, 0, SEEK_SET)) { + perror("fseek(out_file)"); + return 2; + } + aymo_wave_heading_setup( + &wave_head, + AYMO_WAVE_FMT_TYPE_PCM, + (uint16_t)out_channels, + 16u, + AYMO_YMF262_SAMPLE_RATE, + frame_total + ); + if (fwrite(&wave_head, sizeof(wave_head), 1u, out_file) != 1u) { + perror("fwrite(wave_head)"); + return 2; + } + } + + clock_end = clock(); + + if (app_args.benchmark) { + clock_t clock_duration = (clock_end - clock_start); + double seconds = ((double)clock_duration * (1. 
/ (double)CLOCKS_PER_SEC)); + printf("Render time: %.3f seconds\n", seconds); + } + + return 0; +} + + +int main(int argc, char** argv) +{ + app_return = app_boot(); + if (app_return) goto catch_; + + app_return = app_args_init(argc, argv); + if (app_return) goto catch_; + + app_return = app_args_parse(); + if (app_return == -1) { // help + app_return = 0; + goto finally_; + } + if (app_return) goto catch_; + + app_return = app_setup(); + if (app_return) goto catch_; + + app_return = app_run(); + if (app_return) goto catch_; + + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/apps/meson.build b/apps/meson.build new file mode 100644 index 0000000..83e404e --- /dev/null +++ b/apps/meson.build @@ -0,0 +1,22 @@ +apps_includes = include_directories( + '.', +) + +apps_sources = files( +) + +if not opt_apps.disabled() + app_names = [ + 'aymo_ymf262_play', + ] + + foreach app_name : app_names + executable( + app_name, + apps_sources + files('@0@.c'.format(app_name)), + include_directories: [apps_includes, aymo_includes], + link_with: [aymo_static_lib, aymo_libc_lib], + install: false, + ) + endforeach +endif diff --git a/aymo.pc.in b/aymo.pc.in new file mode 100644 index 0000000..53955a3 --- /dev/null +++ b/aymo.pc.in @@ -0,0 +1,15 @@ +# AYMO pkg-config file + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: AYMO +Description: Accelerated YaMaha Operator +URL: @URL@ +Version: @VERSION@ +Requires: +Conflicts: +Libs: -L${libdir} -laymo +Cflags: -I${includedir}/aymo diff --git a/contrib/meson.build b/contrib/meson.build new file mode 100644 index 0000000..c11bdaa --- /dev/null +++ b/contrib/meson.build @@ -0,0 +1,26 @@ +#add_project_arguments('-DTDA8425_FLOAT=float') + +aymo_contrib_includes = include_directories( + 'Nuked-OPL3', + 'TDA8425_emu/src', + 'YM7128B_emu/src', +) + +aymo_contrib_sources = files( + 'Nuked-OPL3/opl3.c', + 'TDA8425_emu/src/TDA8425_emu.c', + 'YM7128B_emu/src/YM7128B_emu.c', +) + +aymo_contrib_lib = static_library( + 'aymo-contrib', + aymo_contrib_sources, + include_directories: aymo_contrib_includes, + dependencies: libm, + install: false, +) + +aymo_contrib_dep = declare_dependency( + include_directories: aymo_contrib_includes, + link_whole: aymo_contrib_lib, +) diff --git a/doc/.gitkeep b/doc/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/include/aymo.h b/include/aymo.h new file mode 100644 index 0000000..179c0bd --- /dev/null +++ b/include/aymo.h @@ -0,0 +1,33 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_h +#define _include_aymo_h + +#include "aymo_cc.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_boot(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_h diff --git a/include/aymo_cc.h b/include/aymo_cc.h new file mode 100644 index 0000000..bca79a0 --- /dev/null +++ b/include/aymo_cc.h @@ -0,0 +1,189 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cc_h +#define _include_aymo_cc_h + +// Use "aymo_cc.h" as the common file including "aymo_config.h" (if required) +#ifdef AYMO_HAVE_CONFIG_H // command line / build system + #include "aymo_config.h" +#endif + + +// Usual macros to generate strings +#ifndef AYMO_STRINGIFY + #define AYMO_STRINGIFY(token) #token + #define AYMO_STRINGIFY2(token) AYMO_STRINGIFY(token) +#endif + + +// Unused variable +#define AYMO_UNUSED_VAR(x) ((void)(x)) + + +// Common C++ name mangling wrappers. +#ifndef AYMO_CXX_EXTERN_C_BEGIN + #ifdef __cplusplus + #define AYMO_CXX_EXTERN_C_BEGIN extern "C" { + #define AYMO_CXX_EXTERN_C_END } // extern "C" + #else + #define AYMO_CXX_EXTERN_C_BEGIN // ignore + #define AYMO_CXX_EXTERN_C_END // ignore + #endif +#endif + + +// #pragma pack(push/pop) equivalents. +// Originally by MSVC, also supported by modern GCC/Clang. +#ifndef AYMO_PRAGMA_POP + #if (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)) + #define AYMO_PRAGMA_PACK_PUSH_N(n) _Pragma(AYMO_STRINGIFY(pack(push, n))) + #define AYMO_PRAGMA_PACK_PUSH_1 AYMO_PRAGMA_PACK_PUSH_N(1) + #define AYMO_PRAGMA_PACK_PUSH_DEFAULT _Pragma("pack(push)") _Pragma("pack()") + #define AYMO_PRAGMA_PACK_POP _Pragma("pack(pop)") + #else + // Use default packing (i.e. compiler forced to 1-byte packing) + #ifndef _include_aymo_cc_h_AYMO_PRAGMA_POP + #define _include_aymo_cc_h_AYMO_PRAGMA_POP + #warning "Unsupported packing directives. Please set 1-byte packing to your compiler." + #endif + #define AYMO_PRAGMA_PACK_PUSH_N(n) // keep default + #define AYMO_PRAGMA_PACK_PUSH_1 // keep default + #define AYMO_PRAGMA_PACK_PUSH_DEFAULT // keep default + #define AYMO_PRAGMA_PACK_POP // keep default + #endif +#endif + + +// Aligns to some bytes. +// To be put AFTER the typename. +#ifndef AYMO_ALIGN + #if defined(_MSC_VER) + #define AYMO_ALIGN(n) __declspec(align(n)) + #elif (defined(__GNUC__) || defined(__clang__)) + #define AYMO_ALIGN(n) __attribute__((aligned(n))) + #else + #define AYMO_ALIGN(n) // default + #endif +#endif + + +// Sets bit-fields order as little-endian. 
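+// (Relies on the GCC-specific "scalar_storage_order" pragma; on other compilers
+// these macros expand to nothing and the platform default order applies.)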
+#ifndef AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + #if defined(__GNUC__) && defined(AYMO_CC_ID_GCC) + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN _Pragma("scalar_storage_order little-endian") + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT _Pragma("scalar_storage_order default") + #else + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN // default + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT // default + #endif +#endif + + +// Mark the symbol as public for shared objects (aka DLL). +#ifndef AYMO_PUBLIC + #if (defined(AYMO_CC_HOST_WINDOWS) )//FIXME: || defined(AYMO_CC_HOST_CYGWIN)) + // Using MSVC attribute, also supported by modern GCC/Clang. + #ifdef AYMO_BUILD + #define AYMO_PUBLIC extern __declspec(dllexport) + #else + #define AYMO_PUBLIC extern __declspec(dllimport) + #endif + #define AYMO_PRIVATE // ignore + + #elif (defined(__GNUC__) || defined(__clang__)) + // Using GCC-specific attribute + #ifdef AYMO_BUILD + #define AYMO_PUBLIC extern __attribute__((visibility("default"))) + #else + #define AYMO_PUBLIC // ignore + #endif + #define AYMO_PRIVATE extern __attribute__((visibility("hidden"))) + + #else + #ifndef _include_aymo_cc_h_AYMO_PUBLIC + #define _include_aymo_cc_h_AYMO_PUBLIC + #warning "Cannot assume a proper way to declare shared object functions." + #endif + #define AYMO_PUBLIC extern // ignore + #define AYMO_PRIVATE extern // ignore + #endif +#endif + + +// Wrap the condition expression with this +#ifndef AYMO_LIKELY + #if (defined(__GNUC__) || defined(__clang__)) + #define AYMO_LIKELY(x) (__builtin_expect(!!(x), 1)) + #define AYMO_UNLIKELY(x) (__builtin_expect(!!(x), 0)) + #else + #define AYMO_LIKELY(x) (x) // ignore + #define AYMO_UNLIKELY(x) (x) // ignore + #endif +#endif + + +// Usual macro to get 1D array size +#ifndef AYMO_VECTOR_LENGTH + #define AYMO_VECTOR_LENGTH(name) (sizeof(name) / sizeof((name)[0])) +#endif + + +// Cheap alternative to memset() +// No care for performance; made just to avoid a library call +static inline void aymo_memset(void* data, int value, unsigned long size) +{ + char* ptr = (char*)data; + const char* end = (char*)data + size; + while (ptr != end) { + *(ptr++) = value; + } +} + + +// Cheap alternative to memcpy() +// No care for performance; made just to avoid a library call +static inline void aymo_memcpy(void* dst, void* src, unsigned long size) +{ + char* dstp = (char*)dst; + const char* srcp = (const char*)src; + const char* end = (const char*)src + size; + while (srcp != end) { + *(dstp++) = *(srcp++); + } +} + + +// Cheap alternative to strcmp() +// No care for performance; made just to avoid a library call +static inline int aymo_strcmp(const char* a, const char* b) +{ + if (a && b) { + do { + if (*a != *b) { + return (*a - *b); + } + } while (*(a++) && *(b++)); + return 0; + } + return -0x8000; +} + + +#endif // _include_aymo_cc_h diff --git a/include/aymo_convert.h b/include/aymo_convert.h new file mode 100644 index 0000000..82ae358 --- /dev/null +++ b/include/aymo_convert.h @@ -0,0 +1,54 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. 
+ +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_convert_h +#define _include_aymo_convert_h + +#include "aymo_cc.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_convert_boot(void); + +AYMO_PUBLIC void aymo_convert_i16_f32(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_i16(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_convert_i16_f32_1(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_i16_1(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_convert_i16_f32_k(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_convert_f32_i16_k(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_convert_u16_f32(size_t n, const uint16_t u16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_u16(size_t n, const float f32v[], uint16_t u16v[]); + +AYMO_PUBLIC void aymo_convert_u16_f32_1(size_t n, const uint16_t u16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_u16_1(size_t n, const float f32v[], uint16_t u16v[]); + +AYMO_PUBLIC void aymo_convert_u16_f32_k(size_t n, const uint16_t u16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_convert_f32_u16_k(size_t n, const float f32v[], uint16_t u16v[], float scale); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_convert_h diff --git a/include/aymo_convert_arm_neon.h b/include/aymo_convert_arm_neon.h new file mode 100644 index 0000000..cfe093b --- /dev/null +++ b/include/aymo_convert_arm_neon.h @@ -0,0 +1,66 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_arm_neon_h +#define _include_aymo_convert_arm_neon_h + +#include "aymo_cc.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_convert_arm_neon_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON +#endif // _include_aymo_convert_arm_neon_h diff --git a/include/aymo_convert_none.h b/include/aymo_convert_none.h new file mode 100644 index 0000000..f665f4c --- /dev/null +++ b/include/aymo_convert_none.h @@ -0,0 +1,64 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_none_h +#define _include_aymo_convert_none_h + +#include "aymo_cc.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_NONE_##_token_ +#define aymo_(_token_) aymo_convert_none_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_convert_none_h diff --git a/include/aymo_convert_x86_avx2.h b/include/aymo_convert_x86_avx2.h new file mode 100644 index 0000000..76651b1 --- /dev/null +++ b/include/aymo_convert_x86_avx2.h @@ -0,0 +1,66 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_x86_avx2_h +#define _include_aymo_convert_x86_avx2_h + +#include "aymo_cc.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_X86_AVX2_##_token_ +#define aymo_(_token_) aymo_convert_x86_avx2_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 +#endif // _include_aymo_convert_x86_avx2_h diff --git a/include/aymo_convert_x86_sse41.h b/include/aymo_convert_x86_sse41.h new file mode 100644 index 0000000..c83f7a7 --- /dev/null +++ b/include/aymo_convert_x86_sse41.h @@ -0,0 +1,66 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_x86_sse41_h +#define _include_aymo_convert_x86_sse41_h + +#include "aymo_cc.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_convert_x86_sse41_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 +#endif // _include_aymo_convert_x86_sse41_h diff --git a/include/aymo_cpu.h b/include/aymo_cpu.h new file mode 100644 index 0000000..4697836 --- /dev/null +++ b/include/aymo_cpu.h @@ -0,0 +1,55 @@ +// Main CPU header file. +// Always include this one, not the CPU-specific ones. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_h +#define _include_aymo_cpu_h + +#include "aymo_cc.h" + +#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + #include "aymo_cpu_x86.h" + + #if defined(AYMO_CPU_SUPPORT_X86_AVX2) + #include "aymo_cpu_x86_avx2.h" + #endif + + #if defined(AYMO_CPU_SUPPORT_X86_SSE41) + #include "aymo_cpu_x86_sse41.h" + #endif +#endif + +#if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + #include "aymo_cpu_arm.h" + + #if defined(AYMO_CPU_SUPPORT_ARM_NEON) + #include "aymo_cpu_arm_neon.h" + #endif +#endif + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_cpu_boot(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_h diff --git a/include/aymo_cpu_arm.h b/include/aymo_cpu_arm.h new file mode 100644 index 0000000..2ee7aa0 --- /dev/null +++ b/include/aymo_cpu_arm.h @@ -0,0 +1,43 @@ +// CPU-specific header file for ARM. +// DO NOT include this file; #include "aymo_cpu.h" instead. 
+/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_arm_h +#define _include_aymo_cpu_arm_h +#if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + +AYMO_CXX_EXTERN_C_BEGIN + + +#define AYMO_CPU_ARM_EXT_ARMV7 (1u << 0u) +#define AYMO_CPU_ARM_EXT_NEON (1u << 1u) +#define AYMO_CPU_ARM_EXT_AARCH32 (1u << 2u) +#define AYMO_CPU_ARM_EXT_AARCH64 (1u << 3u) +#define AYMO_CPU_ARM_EXT_NEON64 (1u << 4u) + + +AYMO_PUBLIC void aymo_cpu_arm_boot(void); +AYMO_PUBLIC unsigned aymo_cpu_arm_get_extensions(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) +#endif // _include_aymo_cpu_arm_h diff --git a/include/aymo_cpu_arm_neon.h b/include/aymo_cpu_arm_neon.h new file mode 100644 index 0000000..b391840 --- /dev/null +++ b/include/aymo_cpu_arm_neon.h @@ -0,0 +1,53 @@ +// CPU-specific header file for ARM NEON. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_arm_neon_h +#define _include_aymo_cpu_arm_neon_h + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +typedef int16x4_t vi16x4_t; +typedef uint16x4_t vu16x4_t; + +typedef int32x2_t vi32x2_t; +typedef uint32x2_t vu32x2_t; + +typedef int16x8_t vi16x8_t; +typedef uint16x8_t vu16x8_t; + +typedef int32x4_t vi32x4_t; +typedef uint32x4_t vu32x4_t; + +typedef float32x4_t vf32x4_t; +typedef float32x2_t vf32x2_t; + + +#ifndef AYMO_ALIGN_V128 + #define AYMO_ALIGN_V128 AYMO_ALIGN(16) +#endif + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_arm_neon_h diff --git a/include/aymo_cpu_arm_neon_inline.h b/include/aymo_cpu_arm_neon_inline.h new file mode 100644 index 0000000..bfcf1a1 --- /dev/null +++ b/include/aymo_cpu_arm_neon_inline.h @@ -0,0 +1,382 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_arm_neon_inline_h +#define _include_aymo_cpu_arm_neon_inline_h + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include +#ifdef _MSC_VER + #include +#endif + +AYMO_CXX_EXTERN_C_BEGIN + + +// Generic CPU shorthands + +#if defined(_MSC_VER) + #define AYMO_ARM_DSB(n) (__dsb(n)) + #define AYMO_ARM_DSB_ST() (AYMO_ARM_DSB((unsigned)_ARM_BARRIER_ST)) +#elif (defined(__GNUC__) || defined(__clang__)) + #define AYMO_ARM_DSB_ST() {asm volatile ("dsb st");} +#endif + +#define vsfence AYMO_ARM_DSB_ST + + +// SIMD type shorthands +typedef vi16x8_t vi16_t; +typedef vu16x8_t vu16_t; +typedef vi32x4_t vi32_t; + + +// v*() methods are for vi16_t = int16_t[8] + +#define vi2u vreinterpretq_u16_s16 +#define vu2i vreinterpretq_s16_u16 + +#define vcreate vcreate_s16 +#define vload vld1q_s16 +#define vstore vst1q_s16 + +#define vsetx vsetz +#define vset1 vdupq_n_s16 +#define vseta vseta_s16 +#define vsetr vsetr_s16 +#define vsetz() (vset1(0)) +#define vsetf() (vset1(-1)) +#define vsetm vsetm_s16 + +#define vnot vmvnq_s16 +#define vand vandq_s16 +#define vor vorrq_s16 +#define vxor veorq_s16 +#define vandnot(a,b) (vbicq_s16((b), (a))) // ~A & B +#define vblendv(a,b,m) (vbslq_s16(vi2u(m), (b), (a))) // B if M else A + +#define vcmpeq(a, b) (vu2i(vceqq_s16((a), (b)))) +#define vcmpgt(a, b) (vu2i(vcgtq_s16((a), (b)))) +#define vcmpz(x) (vcmpeq((x), vsetz())) +#define vcmpp(x) (vcmpgt((x), vsetz())) +#define vcmpn(x) (vcmpgt(vsetz(), (x))) + +#define vadd vaddq_s16 +#define vaddsi vqaddq_s16 +#define vaddsu vqaddq_u16 + +#define vsub vsubq_s16 +#define vsubsi vqsubq_s16 +#define vsubsu vqsubq_u16 +#define vneg vqnegq_s16 + +#define vslli vshlq_n_s16 +#define vsrli(x,n) (vu2i(vshrq_n_u16(vi2u(x), (n)))) +#define vsrai vshrq_n_s16 +#define vsllv vshlq_s16 +#define vsrlv(a,b) (vu2i(vshlq_u16(vi2u(a), vnegq_s16(b)))) +#define vsrav(a,b) (vshlq_s16((a), vnegq_s16(b))) + +#define vmulhrs vqrdmulhq_s16 + +#define vmullo vmulq_s16 + +#define vmini vminq_s16 +#define vminu vminq_u16 + +#define vmaxi vmaxq_s16 +#define vmaxu vmaxq_u16 + +#define vextract vgetq_lane_s16 +#define vextractn vextractn_s16 +#define vextractv(x,i) (((const int16_t*)(const void*)&(x))[(i)]) + +#define vinsert(x,n,i) (vsetq_lane_s16((n), (x), (i))) +#define vinsertn vinsertn_s16 +#define vinsertv(x,n,i) {((int16_t*)(void*)&(x))[(i)] = (n);} + +#define vgather vgather_s16 + +#define vhsum vhsum_s16 +#define vhsums vhsum + +#define vpow2m1lt4 vpow2m1lt4_s16 +#define vpow2lt4 vpow2lt4_s16 + +#define vgetlo vget_low_s16 +#define vgethi vget_high_s16 +#define vswap(x) (vcombine(vgethi(x), vgetlo(x))) + +#define vrev vrev64q_s16 +#define vrevv(x) (vvcastv(vrev64q_s32(vcastvv(x)))) +#define vext vextq_s16 + +#define vcombine vcombine_s16 + +#define vunpack vmovl_s16 +#define vunpacklo(x) (vunpack(vgetlo(x))) +#define vunpackhi(x) (vunpack(vgethi(x))) + +#define v2vv vunpack +#define vlo2vv vunpacklo +#define vhi2vv vunpackhi + 
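+// Reinterprets the 16-bit lanes as 32-bit lanes (bit-level cast, no widening).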
+#define vcastvv vreinterpretq_s32_s16 + + +// w*() methods are for widening/narrowing vi16_t = int16_t[8] <--> vi32_t = int32_t[4] + +#define wmullo vmull_s16 + +#define wcombine vcombine_s16 + + +// vv*() methods are for vi32_t = int32_t[4] + +#define vvi2u vreinterpretq_u32_s32 +#define vvu2i vreinterpretq_s32_u32 + +#define vvsetx vvsetz +#define vvset1 vdupq_n_s32 +#define vvsetz() (vvset1(0)) +#define vvsetf() (vvset1(-1)) + +#define vvand vandq_s32 +#define vvor vorrq_s32 +#define vvxor veorq_s32 +#define vvandnot(a,b) (vbicq_s32((b), (a))) // ~A & B + +#define vvadd vaddq_s32 +#define vwadd vaddw_s32 + +#define vvsrli(x,n) (vvu2i(vshrq_n_u32(vvi2u(x), (n)))) + +#define vvsllv vshlq_s32 + +#define vvmullo vmulq_s32 + +#define vvextract vgetq_lane_s32 +#define vvextractn vvextractn_s32 + +#define vvinsert(x,n,i) (vsetq_lane_s32((n), (x), (i))) +#define vvinsertn vvinsertn_s32 + +#define vvgetlo vget_low_s32 +#define vvgethi vget_high_s32 +#define vvswap(x) (vvcombine(vvgethi(x), vvgetlo(x))) + +#define vvrev vrev64q_s32 +#define vvext vextq_s32 + +#define vvcombine vcombine_s32 + +#define vvpack(a,b) (vcombine_s16(vmovn_s32(a), vmovn_s32(b))) +#define vvpacks(a,b) (vcombine_s16(vqmovn_s32(a), vqmovn_s32(b))) + +#define vvcastv vreinterpretq_s16_s32 + + +static inline +int16x8_t vseta_s16( + int16_t i7, + int16_t i6, + int16_t i5, + int16_t i4, + int16_t i3, + int16_t i2, + int16_t i1, + int16_t i0 +) +{ + int16_t r_n128_i16[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; + return vld1q_s16(r_n128_i16); +} + + +static inline +int16x8_t vsetr_s16( + int16_t i7, + int16_t i6, + int16_t i5, + int16_t i4, + int16_t i3, + int16_t i2, + int16_t i1, + int16_t i0 +) +{ + int16_t r_n128_i16[8] = { i7, i6, i5, i4, i3, i2, i1, i0 }; + return vld1q_s16(r_n128_i16); +} + + +static inline +int16x8_t vsetm_s16(uint8_t m) +{ + static const int16_t kk[8] = { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 }; + int16x8_t k = vld1q_s16(kk); + return vcmpeq(vand(vset1((int16_t)(uint16_t)m), k), k); +} + + +static inline +short vextractn_s16(int16x8_t x, const int i) +{ + int16_t x_n128_i16[8]; + vst1q_s16(x_n128_i16, x); + return x_n128_i16[i]; +} + + +static inline +int16x8_t vinsertn_s16(int16x8_t x, short n, const int i) +{ + int16_t x_n128_i16[8]; + vst1q_s16(x_n128_i16, x); + x_n128_i16[i] = n; + return vld1q_s16(x_n128_i16); +} + + +// Gathers 16x 16-bit words via 16x 8-bit (low) indexes +static inline +int16x8_t vgather_s16(const int16_t* v, int16x8_t i) +{ + // Plain C lookup, smallest cache footprint + uint8_t i_n128_u8[16]; + vst1q_s16((void*)i_n128_u8, i); + int16_t* r_n128_i16 = (int16_t*)(void*)i_n128_u8; + r_n128_i16[0] = v[i_n128_u8[0x0]]; + r_n128_i16[1] = v[i_n128_u8[0x2]]; + r_n128_i16[2] = v[i_n128_u8[0x4]]; + r_n128_i16[3] = v[i_n128_u8[0x6]]; + r_n128_i16[4] = v[i_n128_u8[0x8]]; + r_n128_i16[5] = v[i_n128_u8[0xA]]; + r_n128_i16[6] = v[i_n128_u8[0xC]]; + r_n128_i16[7] = v[i_n128_u8[0xE]]; + return vld1q_s16(r_n128_i16); +} + + +static inline +int32_t vhsum_s16(int16x8_t x) +{ + int32x4_t sum16 = vpaddlq_s16(x); + int64x2_t sum32 = vpaddlq_s32(sum16); + int32x2_t lo32 = vreinterpret_s32_s64(vget_low_s64(sum32)); + int32x2_t hi32 = vreinterpret_s32_s64(vget_high_s64(sum32)); + return (vget_lane_s32(lo32, 0) + vget_lane_s32(hi32, 0)); +} + + +// 0 <= x < 4 --> (1 << (x - 1)) --> 0, 1, 2, 4 +static inline +int16x8_t vpow2m1lt4_s16(int16x8_t x) +{ + return vsub(x, vcmpgt(x, vset1(2))); +} + + +// 0 <= x < 4 --> (1 << x) +static inline +int16x8_t vpow2lt4_s16(int16x8_t x) +{ + return vsllv(vset1(1), 
x); +} + + +static inline +int32_t vvextractn_s32(int32x4_t x, const int i) +{ + int32_t x_n128_i32[4]; + vst1q_s32(x_n128_i32, x); + return x_n128_i32[i]; +} + + +static inline +int32x4_t vvinsertn_s32(int32x4_t x, int32_t n, const int i) +{ + int32_t x_n128_i32[4]; + vst1q_s32(x_n128_i32, x); + x_n128_i32[i] = n; + return vld1q_s32(x_n128_i32); +} + + +static inline +int16_t clamp16(int x) +{ + if (x < INT16_MIN) { + return (int16_t)INT16_MIN; + } + if (x >= INT16_MAX) { + return (int16_t)INT16_MAX; + } + return (int16_t)x; +} + + +// Finds first set bit = Counts trailing zeros +// Emulates the BSD function +static inline +int uffsll(unsigned long long x) +{ +#if defined(_MSC_VER) + unsigned long i = 0; +#if defined(_WIN32) + if (_BitScanForward(&i, (uint32_t)x)) { + return (int)(i + 1); + } + if (_BitScanForward(&i, (uint32_t)(x >> 32))) { + return (int)(i + 33); + } +#else + if (_BitScanForward64(&i, (unsigned long long)x)) { + return (int)(i + 1); + } +#endif + return 0; + +#elif (defined(__GNUC__) || defined(__clang__)) + return __builtin_ffsll((long long)x); + +#else + if (x) { + int i = 0; + do { + ++i; + x <<= 1; + } while(x); + return (64 - i); + } + return 0; +#endif +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_cpu_arm_neon_inline_h diff --git a/include/aymo_cpu_x86.h b/include/aymo_cpu_x86.h new file mode 100644 index 0000000..a808cc2 --- /dev/null +++ b/include/aymo_cpu_x86.h @@ -0,0 +1,47 @@ +// CPU-specific header file for x86. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_x86_h +#define _include_aymo_cpu_x86_h +#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + +AYMO_CXX_EXTERN_C_BEGIN + + +#define AYMO_CPU_X86_EXT_SSE (1u << 0u) +#define AYMO_CPU_X86_EXT_SSE2 (1u << 1u) +#define AYMO_CPU_X86_EXT_SSE3 (1u << 2u) +#define AYMO_CPU_X86_EXT_SSSE3 (1u << 3u) +#define AYMO_CPU_X86_EXT_SSE41 (1u << 4u) +#define AYMO_CPU_X86_EXT_SSE42 (1u << 5u) +#define AYMO_CPU_X86_EXT_AVX (1u << 6u) +#define AYMO_CPU_X86_EXT_AVX2 (1u << 7u) +#define AYMO_CPU_X86_EXT_FMA3 (1u << 8u) + + +AYMO_PUBLIC void aymo_cpu_x86_boot(void); +AYMO_PUBLIC unsigned aymo_cpu_x86_get_extensions(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86)) +#endif // _include_aymo_cpu_x86_h diff --git a/include/aymo_cpu_x86_avx2.h b/include/aymo_cpu_x86_avx2.h new file mode 100644 index 0000000..b8deeb4 --- /dev/null +++ b/include/aymo_cpu_x86_avx2.h @@ -0,0 +1,46 @@ +// CPU-specific header file for x86 AVX2. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_x86_avx2_h +#define _include_aymo_cpu_x86_avx2_h + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +typedef __m256i vi16x16_t; +typedef __m256i vu16x16_t; + +typedef __m256i vi32x8_t; +typedef __m256i vu32x8_t; + +typedef __m256 vf32x8_t; + + +#ifndef AYMO_ALIGN_V256 + #define AYMO_ALIGN_V256 AYMO_ALIGN(32) +#endif + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_x86_avx2_h diff --git a/include/aymo_cpu_x86_avx2_inline.h b/include/aymo_cpu_x86_avx2_inline.h new file mode 100644 index 0000000..34062c5 --- /dev/null +++ b/include/aymo_cpu_x86_avx2_inline.h @@ -0,0 +1,428 @@ +// CPU-specific inline methods for x86 AVX2. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_cpu_x86_avx2_inline_h +#define _include_aymo_cpu_x86_avx2_inline_h + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef AYMO_CPU_X86_AVX2_GATHER16_STRATEGY + #define AYMO_CPU_X86_AVX2_GATHER16_STRATEGY 2 +#endif + + +// Generic CPU shorthands + +#define vsfence _mm_sfence + + +// SIMD type shorthands +typedef vi16x16_t vi16_t; +typedef vu16x16_t vu16_t; +typedef vi32x8_t vi32_t; + + +// v*() methods are for vi16_t = int16_t[16] + +#define vi2u(x) x +#define vu2i(x) x + +#define vsetx _mm256_undefined_si256 +#define vset1 _mm256_set1_epi16 +#define vseta _mm256_set_epi16 +#define vsetr _mm256_setr_epi16 +#define vsetz _mm256_setzero_si256 +#define vsetf() (vset1(-1)) +#define vsetm mm256_setm_epi16 + +#define vnot(x) (vxor((x), vsetf())) +#define vand _mm256_and_si256 +#define vor _mm256_or_si256 +#define vxor _mm256_xor_si256 +#define vandnot _mm256_andnot_si256 // ~A & B +#define vblendi _mm256_blend_epi16 +#define vblendv _mm256_blendv_epi8 + +#define vcmpeq _mm256_cmpeq_epi16 +#define vcmpgt _mm256_cmpgt_epi16 +#define vcmpz(x) (vcmpeq((x), vsetz())) +#define vcmpp(x) (vcmpgt((x), vsetz())) +#define vcmpn(x) (vcmpgt(vsetz(), (x))) + +#define vadd _mm256_add_epi16 +#define vaddsi _mm256_adds_epi16 +#define vaddsu _mm256_adds_epu16 + +#define vsub _mm256_sub_epi16 +#define vsubsi _mm256_subs_epi16 +#define vsubsu _mm256_subs_epu16 +#define vneg(x) (vsub(vsetz(), (x))) + +#define vslli _mm256_slli_epi16 +#define vsrli _mm256_srli_epi16 +#define vsrai _mm256_srai_epi16 +#define vsllv mm256_sllv_epi16 +#define vsrlv mm256_srlv_epi16 +#define vsrav mm256_srav_epi16 + +#define vmulihi _mm256_mulhi_epi16 +#define vmuluhi _mm256_mulhi_epu16 + +#define vmulilo _mm256_mullo_epi16 +#define vmululo _mm256_mullo_epi16 + +#define vmini _mm256_min_epi16 +#define vminu _mm256_min_epu16 + +#define vmaxi _mm256_max_epi16 +#define vmaxu _mm256_max_epu16 + +#define vextract _mm256_extract_epi16 +#define vextractn mm256_extractn_epi16 +#define vextractv(x,i) (((const int16_t*)(const void*)&(x))[(i)]) + +#define vinsert _mm256_insert_epi16 +#define vinsertn mm256_insertn_epi16 +#define vinsertv(x,n,i) {((int16_t*)(void*)&(x))[(i)] = (n);} + +#define vgather mm256_i16gather_epi16lo + +#define vhsum mm256_hsum_epi16 +#define vhsums mm256_hsums_epi16 + +#define vpow2m1lt4 mm256_pow2m1lt4_epi16 +#define vpow2lt4 mm256_pow2lt4_epi16 + +#define vunpacklo _mm256_unpacklo_epi16 +#define vunpackhi _mm256_unpackhi_epi16 + + +// vv*() methods are for vi32_t = int32_t[8] + +#define vvi2u(x) x +#define vvu2i(x) x + +#define vvsetx _mm256_undefined_si256 +#define vvset1 _mm256_set1_epi32 +#define vvseta _mm256_set_epi32 +#define vvsetr _mm256_setr_epi32 +#define vvsetz _mm256_setzero_si256 +#define vvsetf() (vvset1(-1)) + +#define vvand vand +#define vvor vor +#define vvxor vxor +#define vvandnot vandnot + +#define vvadd _mm256_add_epi32 + +#define vvsrli _mm256_srli_epi32 + +#define vvsllv _mm256_sllv_epi32 + +#define vvextract _mm256_extract_epi32 +#define vvextractn mm256_extractn_epi32 + +#define vvinsert _mm256_insert_epi32 +#define vvinsertn mm256_insertn_epi32 + +#define vvmullo _mm256_mullo_epi32 + +#define vvpackus _mm256_packus_epi32 + + +static inline +__m256i mm256_setm_epi16(uint16_t m) +{ + const __m256i k = vsetr( + 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, -0x8000 + ); + return vcmpeq(vand(vset1((int16_t)m), k), k); +} + + +// see: 
https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-avx2/51805592#51805592 +static inline +__m256i mm256_sllv_epi16(__m256i x, __m256i n) +{ + const __m256i m = _mm256_set1_epi32(0xFFFF0000); + __m256i lo = _mm256_sllv_epi32(x, _mm256_andnot_si256(m, n)); + __m256i hi = _mm256_sllv_epi32( + _mm256_and_si256(m, x), + _mm256_srli_epi32(n, 16) + ); + return _mm256_blend_epi16(lo, hi, 0xAA); +} + + +// see: https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-avx2/51805592#51805592 +static inline +__m256i mm256_srlv_epi16(__m256i x, __m256i n) +{ + const __m256i m = _mm256_set1_epi32(0x0000FFFF); + __m256i lo = _mm256_srlv_epi32( + _mm256_and_si256(m, x), + _mm256_and_si256(m, n) + ); + __m256i hi = _mm256_srlv_epi32(x, _mm256_srli_epi32(n, 16)); + return _mm256_blend_epi16(lo, hi, 0xAA); +} + + +// see: https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-avx2/51805592#51805592 +static inline +__m256i mm256_srav_epi16(__m256i x, __m256i n) +{ + const __m256i m = _mm256_set1_epi32(0x0000FFFF); + __m256i lo = _mm256_srav_epi32( + _mm256_and_si256(m, x), + _mm256_and_si256(m, n) + ); + __m256i hi = _mm256_srav_epi32(x, _mm256_srli_epi32(n, 16)); + return _mm256_blend_epi16(lo, hi, 0xAA); +} + + +static inline +short mm256_extractn_epi16(__m256i x, const int i) +{ + int16_t AYMO_ALIGN_V256 x_m256i_i16[16]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i16, x); + return x_m256i_i16[i]; +} + + +static inline +__m256i mm256_insertn_epi16(__m256i x, short n, const int i) +{ + int16_t AYMO_ALIGN_V256 x_m256i_i16[16]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i16, x); + x_m256i_i16[i] = n; + return _mm256_load_si256((__m256i*)(void*)x_m256i_i16); +} + + +// Gathers 16x 16-bit words via 16x 8-bit (low) indexes +static inline +__m256i mm256_i16gather_epi16lo(const int16_t* v, __m256i i) +{ +#if (AYMO_CPU_X86_AVX2_GATHER16_STRATEGY == 2) + // 2x 32-bit gatherings, 16-bit words, smallest cache footprint + const __m256i sl = _mm256_set_epi8( + -1, -1, -1, 12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0, + -1, -1, -1, 12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0 + ); + const __m256i sh = _mm256_set_epi8( + -1, -1, -1, 14, -1, -1, -1, 10, -1, -1, -1, 6, -1, -1, -1, 2, + -1, -1, -1, 14, -1, -1, -1, 10, -1, -1, -1, 6, -1, -1, -1, 2 + ); + __m256i jh = _mm256_shuffle_epi8(i, sh); + __m256i rh = _mm256_i32gather_epi32((const int32_t*)(const void*)v, jh, 2); + rh = _mm256_slli_epi32(rh, 16); + __m256i jl = _mm256_shuffle_epi8(i, sl); + __m256i rl = _mm256_i32gather_epi32((const int32_t*)(const void*)v, jl, 2); + return _mm256_blend_epi16(rl, rh, 0xAA); + +#elif (CONFIG_AYMO_X86_AVX2_GATHER16_STRATEGY == 1) + // 1x 32-bit gathering, joint 16-bit words, squared cache footprint + const __m256i s = _mm256_set_epi8( + -1, -1, 14, 12, -1, -1, 10, 8, -1, -1, 6, 4, -1, -1, 2, 0, + -1, -1, 14, 12, -1, -1, 10, 8, -1, -1, 6, 4, -1, -1, 2, 0 + ); + __m256i j = _mm256_shuffle_epi8(i, s); + return _mm256_i32gather_epi32((const int32_t*)(const void *)v, j, 4); + +#else // CONFIG_AYMO_X86_AVX2_GATHER16_STRATEGY + // Plain C lookup, smallest cache footprint + return vsetr( + v[vextract(i, 0x0)], + v[vextract(i, 0x1)], + v[vextract(i, 0x2)], + v[vextract(i, 0x3)], + v[vextract(i, 0x4)], + v[vextract(i, 0x5)], + v[vextract(i, 0x6)], + v[vextract(i, 0x7)], + v[vextract(i, 0x8)], + v[vextract(i, 0x9)], + v[vextract(i, 0xA)], + v[vextract(i, 0xB)], + v[vextract(i, 0xC)], + v[vextract(i, 0xD)], + 
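/* _mm256_extract_epi16 requires an immediate lane index, hence the fully unrolled fallback lookup */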
v[vextract(i, 0xE)], + v[vextract(i, 0xF)] + ); +#endif // CONFIG_AYMO_X86_AVX2_GATHER16_STRATEGY +} + + +// see: https://stackoverflow.com/questions/60108658/fastest-method-to-calculate-sum-of-all-packed-32-bit-integers-using-avx512-or-av/ +static inline +int mm_hsum_epi32(__m128i x) +{ + __m128i hi64 = _mm_unpackhi_epi64(x, x); + __m128i sum64 = _mm_add_epi32(hi64, x); + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i sum32 = _mm_add_epi32(sum64, hi32); + return _mm_cvtsi128_si32(sum32); +} + + +static inline +int mm256_hsum_epi32(__m256i x) +{ + __m128i lo128 = _mm256_castsi256_si128(x); + __m128i hi128 = _mm256_extracti128_si256(x, 1); + __m128i sum32 = _mm_add_epi32(lo128, hi128); + return mm_hsum_epi32(sum32); +} + + +// see: https://stackoverflow.com/questions/55057933/simd-accumulate-adjacent-pairs +static inline +int mm256_hsum_epi16(__m256i x) +{ + __m256i sum16 = _mm256_madd_epi16(x, vset1(1)); + return mm256_hsum_epi32(sum16); +} + + +static inline +int mm256_hsums_epi16(__m256i x) +{ + __m256i hs16 = _mm256_hadds_epi16(x, vsetz()); + __m256i sum16 = _mm256_unpacklo_epi16(hs16, vsetz()); + return mm256_hsum_epi32(sum16); +} + + +// 0 <= x < 4 --> (1 << (x - 1)) --> 0, 1, 2, 4 +static inline +__m256i mm256_pow2m1lt4_epi16(__m256i x) +{ + return vsub(x, vcmpgt(x, vset1(2))); +} + + +// 0 <= x < 4 --> (1 << x) +static inline +__m256i mm256_pow2lt4_epi16(__m256i x) +{ + __m256i a = vadd(x, vset1(1)); + __m256i b = vu2i(vsubsu(vi2u(x), vi2u(vset1(2)))); + __m256i c = vmululo(b, b); + return vadd(a, c); +} + + +static inline +long mm256_extractn_epi32(__m256i x, const int i) +{ + int32_t AYMO_ALIGN_V256 x_m256i_i32[8]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i32, x); + return x_m256i_i32[i]; +} + + +static inline +__m256i mm256_insertn_epi32(__m256i x, long n, const int i) +{ + int32_t AYMO_ALIGN_V256 x_m256i_i32[8]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i32, x); + x_m256i_i32[i] = n; + return _mm256_load_si256((__m256i*)(void*)x_m256i_i32); +} + + +static inline +float mm256_extractn_ps(__m256 x, const int i) +{ + float AYMO_ALIGN_V256 x_m256_f32[8]; + _mm256_store_ps(x_m256_f32, x); + return x_m256_f32[i]; +} + + +static inline +__m256 mm256_insertn_ps(__m256 x, float f, const int i) +{ + float AYMO_ALIGN_V256 x_m256_f32[8]; + _mm256_store_ps(x_m256_f32, x); + x_m256_f32[i] = f; + return _mm256_load_ps(x_m256_f32); +} + + +static inline +int16_t clamp16(int x) +{ + if (x < INT16_MIN) { + return (int16_t)INT16_MIN; + } + if (x >= INT16_MAX) { + return (int16_t)INT16_MAX; + } + return (int16_t)x; +} + + +// Finds first set bit = Counts trailing zeros +// Emulates the BSD function +static inline +int uffsll(unsigned long long x) +{ +#if defined(_MSC_VER) + unsigned long i = 0; + if (_BitScanForward64(&i, x)) { + return (int)(i + 1); + } + return 0; + +#elif (defined(__GNUC__) || defined(__clang__)) + return __builtin_ffsll((long long)x); + +#else + if (x) { + int i = 0; + do { + ++i; + x <<= 1; + } while(x); + return (64 - i); + } + return 0; +#endif +} + + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#endif // _include_aymo_cpu_x86_avx2_inline_h diff --git a/include/aymo_cpu_x86_sse41.h b/include/aymo_cpu_x86_sse41.h new file mode 100644 index 0000000..7e12d01 --- /dev/null +++ b/include/aymo_cpu_x86_sse41.h @@ -0,0 +1,46 @@ +// CPU-specific header file for x86 SSE4.1. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. 
+ +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_x86_sse41_h +#define _include_aymo_cpu_x86_sse41_h + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +typedef __m128i vi16x8_t; +typedef __m128i vu16x8_t; + +typedef __m128i vi32x4_t; +typedef __m128i vu32x4_t; + +typedef __m128 vf32x4_t; + + +#ifndef AYMO_ALIGN_V128 + #define AYMO_ALIGN_V128 AYMO_ALIGN(16) +#endif + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_x86_sse41_h diff --git a/include/aymo_cpu_x86_sse41_inline.h b/include/aymo_cpu_x86_sse41_inline.h new file mode 100644 index 0000000..13f2a13 --- /dev/null +++ b/include/aymo_cpu_x86_sse41_inline.h @@ -0,0 +1,426 @@ +// CPU-specific inline methods for x86 SSE4.1. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_cpu_x86_sse41_inline_h +#define _include_aymo_cpu_x86_sse41_inline_h + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef AYMO_CPU_X86_SSE41_GATHER16_STRATEGY + #define AYMO_CPU_X86_SSE41_GATHER16_STRATEGY 1 +#endif + + +// Generic CPU shorthands + +#define vsfence _mm_sfence + + +// SIMD type shorthands +typedef vi16x8_t vi16_t; +typedef vu16x8_t vu16_t; +typedef vi32x4_t vi32_t; + + +// v*() methods are for vi16_t = int16_t[8] + +#define vi2u(x) x +#define vu2i(x) x + +#define vload _mm_loadu_si128 +#define vstore _mm_storeu_si128 +#define vstorelo _mm_storel_epi64 + +#define vsetx _mm_undefined_si128 +#define vset1 _mm_set1_epi16 +#define vseta _mm_set_epi16 +#define vsetr _mm_setr_epi16 +#define vsetz _mm_setzero_si128 +#define vsetf() (vset1(-1)) +#define vsetm mm_setm_epi16 + +#define vnot(x) (vxor((x), vsetf())) +#define vand _mm_and_si128 +#define vor _mm_or_si128 +#define vxor _mm_xor_si128 +#define vandnot _mm_andnot_si128 // ~A & B +#define vblendi _mm_blend_epi16 +#define vblendv _mm_blendv_epi8 + +#define vcmpeq _mm_cmpeq_epi16 +#define vcmpgt _mm_cmpgt_epi16 +#define vcmpz(x) (vcmpeq((x), vsetz())) +#define vcmpp(x) (vcmpgt((x), vsetz())) +#define vcmpn(x) (vcmpgt(vsetz(), (x))) + +#define vadd _mm_add_epi16 +#define vaddsi _mm_adds_epi16 +#define vaddsu _mm_adds_epu16 + +#define vsub _mm_sub_epi16 +#define vsubsi _mm_subs_epi16 +#define vsubsu _mm_subs_epu16 +#define vneg(x) (vsub(vsetz(), (x))) + +#define vslli _mm_slli_epi16 +#define vsrli _mm_srli_epi16 +#define vsrai _mm_srai_epi16 +#define vsllv mm_sllv_epi16 +#define vsrlv mm_srlv_epi16 +#define vsrav mm_srav_epi16 + +#define vmulihi _mm_mulhi_epi16 +#define vmuluhi _mm_mulhi_epu16 +#define vmulhrs _mm_mulhrs_epi16 + +#define vmulilo _mm_mullo_epi16 +#define vmululo _mm_mullo_epi16 + +#define vmadd _mm_madd_epi16 + +#define vmini _mm_min_epi16 +#define vminu _mm_min_epu16 + +#define vmaxi _mm_max_epi16 +#define vmaxu _mm_max_epu16 + +#define vextract _mm_extract_epi16 +#define vextractn mm_extractn_epi16 +#define vextractv(x,i) (((const int16_t*)(const void*)&(x))[(i)]) + +#define vinsert _mm_insert_epi16 +#define vinsertn mm_insertn_epi16 +#define vinsertv(x,n,i) {((int16_t*)(void*)&(x))[(i)] = (n);} + +#define vgather mm_i16gather_epi16lo + +#define vhsum mm_hsum_epi16 +#define vhsums mm_hsums_epi16 + +#define vpow2m1lt4 mm_pow2m1lt4_epi16 +#define vpow2lt4 mm_pow2lt4_epi16 + +#define vshufflelo _mm_shufflelo_epi16 +#define vshufflehi _mm_shufflehi_epi16 +#define valignr _mm_alignr_epi8 + +#define vunpacklo _mm_unpacklo_epi16 +#define vunpackhi _mm_unpackhi_epi16 + +#define v2vv _mm_cvtepi16_epi32 +#define vlo2vv(x) (v2vv(x)) +#define vhi2vv(x) (v2vv(vvshuffle((x), KSHUFFLE(3, 2, 3, 2)))) + + +// vv*() methods are for vi32_t = int32_t[4] + +#define vvi2u(x) x +#define vvu2i(x) x + +#define vvsetx _mm_undefined_si128 +#define vvset1 _mm_set1_epi32 +#define vvseta _mm_set_epi32 +#define vvsetr _mm_setr_epi32 +#define vvsetz _mm_setzero_si128 +#define vvsetf() (vvset1(-1)) + +#define vvand vand +#define vvor vor +#define vvxor vxor +#define vvandnot vandnot + +#define vvadd _mm_add_epi32 + +#define vvsrli _mm_srli_epi32 + +#define vvsllv mm_sllv_epi32 + +#define vvextract _mm_extract_epi32 +#define vvextractn mm_extractn_epi32 + +#define vvinsert _mm_insert_epi32 +#define vvinsertn mm_insertn_epi32 + +#define vvmullo _mm_mullo_epi32 + +#define vvshuffle _mm_shuffle_epi32 +#define KSHUFFLE _MM_SHUFFLE +#define vvswap(x) 
vvshuffle((x), KSHUFFLE(1, 0, 3, 2)) + +#define vvpacks _mm_packs_epi32 +#define vvpackus _mm_packus_epi32 + + +static inline +__m128i mm_setm_epi16(uint8_t m) +{ + const __m128i k = vsetr(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + return vcmpeq(vand(vset1((int16_t)(uint16_t)m), k), k); +} + + +static inline +__m128i mm_sllv_epi16(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + uint16_t AYMO_ALIGN_V128 n_m128i_u16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u16, n); + x_m128i_i16[0] <<= n_m128i_u16[0]; + x_m128i_i16[1] <<= n_m128i_u16[1]; + x_m128i_i16[2] <<= n_m128i_u16[2]; + x_m128i_i16[3] <<= n_m128i_u16[3]; + x_m128i_i16[4] <<= n_m128i_u16[4]; + x_m128i_i16[5] <<= n_m128i_u16[5]; + x_m128i_i16[6] <<= n_m128i_u16[6]; + x_m128i_i16[7] <<= n_m128i_u16[7]; + return _mm_load_si128((__m128i*)(void*)x_m128i_i16); +} + + +// see: https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-sse41/51805592#51805592 +static inline +__m128i mm_srlv_epi16(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + uint16_t AYMO_ALIGN_V128 x_m128i_u16[8]; + uint16_t AYMO_ALIGN_V128 n_m128i_u16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_u16, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u16, n); + x_m128i_u16[0] >>= n_m128i_u16[0]; + x_m128i_u16[1] >>= n_m128i_u16[1]; + x_m128i_u16[2] >>= n_m128i_u16[2]; + x_m128i_u16[3] >>= n_m128i_u16[3]; + x_m128i_u16[4] >>= n_m128i_u16[4]; + x_m128i_u16[5] >>= n_m128i_u16[5]; + x_m128i_u16[6] >>= n_m128i_u16[6]; + x_m128i_u16[7] >>= n_m128i_u16[7]; + return _mm_load_si128((__m128i*)(void*)x_m128i_u16); +} + + +static inline +__m128i mm_srav_epi16(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + uint16_t AYMO_ALIGN_V128 n_m128i_u16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u16, n); + x_m128i_i16[0] >>= n_m128i_u16[0]; + x_m128i_i16[1] >>= n_m128i_u16[1]; + x_m128i_i16[2] >>= n_m128i_u16[2]; + x_m128i_i16[3] >>= n_m128i_u16[3]; + x_m128i_i16[4] >>= n_m128i_u16[4]; + x_m128i_i16[5] >>= n_m128i_u16[5]; + x_m128i_i16[6] >>= n_m128i_u16[6]; + x_m128i_i16[7] >>= n_m128i_u16[7]; + return _mm_load_si128((__m128i*)(void*)x_m128i_i16); +} + + +static inline +short mm_extractn_epi16(__m128i x, const int i) +{ + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + return x_m128i_i16[i]; +} + + +static inline +__m128i mm_insertn_epi16(__m128i x, short n, const int i) +{ + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + x_m128i_i16[i] = n; + return _mm_load_si128((__m128i*)(void*)x_m128i_i16); +} + + +// Gathers 16x 16-bit words via 16x 8-bit (low) indexes +static inline +__m128i mm_i16gather_epi16lo(const int16_t* v, __m128i i) +{ + // Plain C lookup, smallest cache footprint + uint8_t AYMO_ALIGN_V128 i_m128i_u8[16]; + _mm_store_si128((__m128i*)(void*)i_m128i_u8, i); + int16_t* r_m128i_i16 = (int16_t*)(void*)i_m128i_u8; + r_m128i_i16[0] = v[i_m128i_u8[0x0]]; + r_m128i_i16[1] = v[i_m128i_u8[0x2]]; + r_m128i_i16[2] = v[i_m128i_u8[0x4]]; + r_m128i_i16[3] = v[i_m128i_u8[0x6]]; + r_m128i_i16[4] = v[i_m128i_u8[0x8]]; + r_m128i_i16[5] = v[i_m128i_u8[0xA]]; + r_m128i_i16[6] = v[i_m128i_u8[0xC]]; + 
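/* even byte offsets pick the low byte of each 16-bit index lane (x86 is little-endian) */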
r_m128i_i16[7] = v[i_m128i_u8[0xE]]; + return _mm_load_si128((__m128i*)(void*)r_m128i_i16); +} + + +// see: https://stackoverflow.com/questions/60108658/fastest-method-to-calculate-sum-of-all-packed-32-bit-integers-using-avx512-or-av/ +static inline +int mm_hsum_epi32(__m128i x) +{ + __m128i hi64 = _mm_unpackhi_epi64(x, x); + __m128i sum64 = _mm_add_epi32(hi64, x); + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i sum32 = _mm_add_epi32(sum64, hi32); + return _mm_cvtsi128_si32(sum32); +} + + +// see: https://stackoverflow.com/questions/55057933/simd-accumulate-adjacent-pairs +static inline +int mm_hsum_epi16(__m128i x) +{ + __m128i sum16 = _mm_madd_epi16(x, vset1(1)); + return mm_hsum_epi32(sum16); +} + + +static inline +int mm_hsums_epi16(__m128i x) +{ + __m128i hs16 = _mm_hadds_epi16(x, vsetz()); + __m128i sum16 = _mm_unpacklo_epi16(hs16, vsetz()); + return mm_hsum_epi32(sum16); +} + + +// 0 <= x < 4 --> (1 << (x - 1)) --> 0, 1, 2, 4 +static inline +__m128i mm_pow2m1lt4_epi16(__m128i x) +{ + return vsub(x, vcmpgt(x, vset1(2))); +} + + +// 0 <= x < 4 --> (1 << x) +static inline +__m128i mm_pow2lt4_epi16(__m128i x) +{ + __m128i a = vadd(x, vset1(1)); + __m128i b = vu2i(vsubsu(vi2u(x), vi2u(vset1(2)))); + __m128i c = vmulilo(b, b); + return vadd(a, c); +} + + +static inline +__m128i mm_sllv_epi32(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + int32_t AYMO_ALIGN_V128 x_m128i_i32[4]; + uint32_t AYMO_ALIGN_V128 n_m128i_u32[4]; + _mm_store_si128((__m128i*)(void*)x_m128i_i32, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u32, n); + x_m128i_i32[0] <<= n_m128i_u32[0]; + x_m128i_i32[1] <<= n_m128i_u32[1]; + x_m128i_i32[2] <<= n_m128i_u32[2]; + x_m128i_i32[3] <<= n_m128i_u32[3]; + return _mm_load_si128((__m128i*)(void*)x_m128i_i32); +} + + +static inline +long mm_extractn_epi32(__m128i x, const int i) +{ + int32_t AYMO_ALIGN_V128 x_m128i_i32[4]; + _mm_store_si128((__m128i*)(void*)x_m128i_i32, x); + return x_m128i_i32[i]; +} + + +static inline +__m128i mm_insertn_epi32(__m128i x, long n, const int i) +{ + int32_t AYMO_ALIGN_V128 x_m128i_i32[4]; + _mm_store_si128((__m128i*)(void*)x_m128i_i32, x); + x_m128i_i32[i] = n; + return _mm_load_si128((__m128i*)(void*)x_m128i_i32); +} + + +static inline +int16_t clamp16(int x) +{ + if (x < INT16_MIN) { + return (int16_t)INT16_MIN; + } + if (x >= INT16_MAX) { + return (int16_t)INT16_MAX; + } + return (int16_t)x; +} + + +// Finds first set bit = Counts trailing zeros +// Emulates the BSD function +static inline +int uffsll(unsigned long long x) +{ +#if defined(_MSC_VER) + unsigned long i = 0; +#if defined(_WIN32) + if (_BitScanForward(&i, (uint32_t)x)) { + return (int)(i + 1); + } + if (_BitScanForward(&i, (uint32_t)(x >> 32))) { + return (int)(i + 33); + } +#else + if (_BitScanForward64(&i, (unsigned long long)x)) { + return (int)(i + 1); + } +#endif + return 0; + +#elif (defined(__GNUC__) || defined(__clang__)) + return __builtin_ffsll((long long)x); + +#else + if (x) { + int i = 0; + do { + ++i; + x <<= 1; + } while(x); + return (64 - i); + } + return 0; +#endif +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_cpu_x86_sse41_inline_h diff --git a/include/aymo_file.h b/include/aymo_file.h new file mode 100644 index 0000000..985fa0c --- /dev/null +++ b/include/aymo_file.h @@ -0,0 +1,42 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_utils_h +#define _include_aymo_utils_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef AYMO_FILE_CHUNK_SIZE +#define AYMO_FILE_CHUNK_SIZE (1000000uL) // 1 MB +#endif + + +AYMO_PUBLIC int aymo_file_save(const char* pathp, const void* datap, size_t size); +AYMO_PUBLIC int aymo_file_load(const char* pathp, void** datapp, size_t* sizep); +AYMO_PUBLIC void aymo_file_unload(void* datap); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_utils_h diff --git a/include/aymo_score.h b/include/aymo_score.h new file mode 100644 index 0000000..f51567e --- /dev/null +++ b/include/aymo_score.h @@ -0,0 +1,145 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_score_h +#define _include_aymo_score_h + +#include "aymo_cc.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +enum aymo_score_type { + aymo_score_type_avd, + aymo_score_type_dro, + aymo_score_type_imf, + aymo_score_type_raw, + aymo_score_type_unknown +}; + + +#define AYMO_SCORE_OPL_RATE_DEFAULT 49716u + +#define AYMO_SCORE_FLAG_EVENT 1u +#define AYMO_SCORE_FLAG_DELAY 2u +#define AYMO_SCORE_FLAG_EOF 4u + +struct aymo_score_status { + uint32_t delay; // after + uint16_t address; + uint8_t value; + uint8_t flags; +}; + +struct aymo_score_instance; // forward + +typedef int (*aymo_score_ctor_f)( + struct aymo_score_instance* score +); + +typedef void (*aymo_score_dtor_f)( + struct aymo_score_instance* score +); + +typedef int (*aymo_score_load_f)( + struct aymo_score_instance* score, + const void* data, + uint32_t size +); + +typedef void (*aymo_score_unload_f)( + struct aymo_score_instance* score +); + +typedef struct aymo_score_status* (*aymo_score_get_status_f)( + struct aymo_score_instance* score +); + +typedef void (*aymo_score_restart_f)( + struct aymo_score_instance* score +); + +typedef uint32_t (*aymo_score_tick_f)( + struct aymo_score_instance* score, + uint32_t count +); + +struct aymo_score_vt { + const char* class_name; + aymo_score_ctor_f ctor; + aymo_score_dtor_f dtor; + aymo_score_load_f load; + aymo_score_unload_f unload; + aymo_score_get_status_f get_status; + aymo_score_restart_f restart; + aymo_score_tick_f tick; +}; + +struct aymo_score_instance { + const struct aymo_score_vt* vt; +}; + + +AYMO_PUBLIC int aymo_score_ctor( + struct aymo_score_instance* score +); + +AYMO_PUBLIC void aymo_score_dtor( + struct aymo_score_instance* score +); + +AYMO_PUBLIC int aymo_score_load( + struct aymo_score_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_unload( + struct aymo_score_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_get_status( + struct aymo_score_instance* score +); + +AYMO_PUBLIC void aymo_score_restart( + struct aymo_score_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_tick( + struct aymo_score_instance* score, + uint32_t count +); + + +AYMO_PUBLIC enum aymo_score_type aymo_score_ext_to_type( + const char *tag +); + +AYMO_PUBLIC const struct aymo_score_vt* aymo_score_type_to_vt( + enum aymo_score_type score_type +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_h diff --git a/include/aymo_score_avd.h b/include/aymo_score_avd.h new file mode 100644 index 0000000..dfe28b7 --- /dev/null +++ b/include/aymo_score_avd.h @@ -0,0 +1,89 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_score_avd_h +#define _include_aymo_score_avd_h + +#include "aymo_score.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +struct aymo_score_avd_event { + uint8_t address_hi; + uint8_t address_lo; + uint8_t value; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +struct aymo_score_avd_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_avd_event* events; + uint32_t length; + uint32_t index; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_avd_vt; + + +AYMO_PUBLIC int aymo_score_avd_ctor( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC void aymo_score_avd_dtor( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC int aymo_score_avd_load( + struct aymo_score_avd_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_avd_unload( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_avd_get_status( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC void aymo_score_avd_restart( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_avd_tick( + struct aymo_score_avd_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_avd_h diff --git a/include/aymo_score_dro.h b/include/aymo_score_dro.h new file mode 100644 index 0000000..8793508 --- /dev/null +++ b/include/aymo_score_dro.h @@ -0,0 +1,166 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_score_dro_h +#define _include_aymo_score_dro_h + +#include "aymo_score.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// See: https://moddingwiki.shikadi.net/wiki/DRO_Format + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +#define AYMO_DRO_SIGNATURE "DBRAWOPL" + +// Common DRO header, at the very beginning of the file +// NOTE: v0.1 == v1.0 +struct aymo_score_dro_header { + char signature[8u]; + uint16_t version_major; + uint16_t version_minor; +// struct aymo_score_dro_v?_header versioned_header; +}; + + +// DRO v1.0 hardware type +enum aymo_score_dro_v1_hardware_type { + aymo_score_dro_v1_hardware_type_opl2 = 0, + aymo_score_dro_v1_hardware_type_opl2x2, + aymo_score_dro_v1_hardware_type_opl3, +}; + +// DRO v1.0 sub-header +struct aymo_score_dro_v1_header { + uint32_t length_ms; + uint32_t length_bytes; + uint8_t hardware_type; + uint8_t hardware_extra[3]; +}; + +// DRO v1.0 special codes +enum aymo_score_dro_v1_code { + aymo_score_dro_v1_code_delay_byte = 0, + aymo_score_dro_v1_code_delay_word, + aymo_score_dro_v1_code_switch_low, + aymo_score_dro_v1_code_switch_high, + aymo_score_dro_v1_code_escape, + aymo_score_dro_v1_code_invalid = 0xFF +}; + + +// DRO v2.0 hardware type +enum aymo_score_dro_v2_hardware_type { + aymo_score_dro_v2_hardware_type_opl2 = 0, + aymo_score_dro_v2_hardware_type_opl2x2, + aymo_score_dro_v2_hardware_type_opl3, +}; + +// DRO v2.0 format +enum aymo_score_dro_v2_format { + aymo_score_dro_v2_format_interleaved = 0 +}; + +// DRO v2.0 sub-header +struct aymo_score_dro_v2_header { + uint32_t length_pairs; + uint32_t length_ms; + uint8_t hardware_type; + uint8_t format; + uint8_t compression; + uint8_t short_delay_code; + uint8_t long_delay_code; + uint8_t codemap_length; +// uint8_t codemap_table[codemap_length]; +}; + + +// score event pair +struct aymo_score_dro_pair { + uint8_t code; + uint8_t value; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +// Score player score +struct aymo_score_dro_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_dro_header *header; + const struct aymo_score_dro_v1_header *v1_header; + const struct aymo_score_dro_v2_header *v2_header; + const uint8_t* codemap; + const uint8_t* events; + uint32_t opl_rate; + uint32_t division; + uint32_t length; + uint32_t offset; + uint8_t address_hi; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_dro_vt; + + +AYMO_PUBLIC int aymo_score_dro_init_specific( + struct aymo_score_dro_instance* score, + uint32_t opl_rate +); + +AYMO_PUBLIC int aymo_score_dro_ctor( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC void aymo_score_dro_dtor( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC int aymo_score_dro_load( + struct aymo_score_dro_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_dro_unload( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_dro_get_status( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC void aymo_score_dro_restart( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_dro_tick( + struct aymo_score_dro_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_dro_h diff --git a/include/aymo_score_imf.h b/include/aymo_score_imf.h new file mode 100644 index 0000000..5699f00 --- /dev/null +++ b/include/aymo_score_imf.h @@ -0,0 +1,131 @@ +/* +AYMO - Accelerated YaMaha Operator 
+Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_score_imf_h +#define _include_aymo_score_imf_h + +#include "aymo_score.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// Common IMF score event rates +#define aymo_score_imf_rate_280Hz 280u +#define aymo_score_imf_rate_duke_nukem_ii aymo_score_imf_rate_280Hz + +#define aymo_score_imf_rate_560Hz 560u +#define aymo_score_imf_rate_bio_menace aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_commander_keen aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_cosmos_cosmic_adventures aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_monster_bash aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_major_stryker aymo_score_imf_rate_560Hz + +#define aymo_score_imf_rate_700Hz 700u +#define aymo_score_imf_rate_blake_stone aymo_score_imf_rate_700Hz +#define aymo_score_imf_rate_operation_body_count aymo_score_imf_rate_700Hz +#define aymo_score_imf_rate_wolfenstein_3d aymo_score_imf_rate_700Hz +#define aymo_score_imf_rate_corridor_7 aymo_score_imf_rate_700Hz + +#define aymo_score_imf_rate_default aymo_score_imf_rate_560Hz + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +struct aymo_score_imf_event { + uint8_t address_lo; + uint8_t value; + uint8_t delay_lo; + uint8_t delay_hi; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +struct aymo_score_imf_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_imf_event* events; + uint32_t imf_rate; + uint32_t opl_rate; + uint32_t division; + uint32_t length; + uint32_t index; + uint8_t type; + uint8_t address_hi; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_imf_vt; + + +AYMO_PUBLIC uint8_t aymo_score_imf_guess_type( + const void* data, + uint32_t size +); + +AYMO_PUBLIC int aymo_score_imf_ctor_specific( + struct aymo_score_imf_instance* score, + uint32_t imf_rate, + uint32_t opl_rate +); + +AYMO_PUBLIC int aymo_score_imf_ctor( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC void aymo_score_imf_dtor( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC int aymo_score_imf_load_specific( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size, + uint8_t type +); + +AYMO_PUBLIC int aymo_score_imf_load( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_imf_unload( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_imf_get_status( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC void aymo_score_imf_restart( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_imf_tick( + struct aymo_score_imf_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_imf_h diff --git a/include/aymo_score_raw.h b/include/aymo_score_raw.h new file mode 100644 index 
0000000..56b72e7 --- /dev/null +++ b/include/aymo_score_raw.h @@ -0,0 +1,99 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_score_raw_h +#define _include_aymo_score_raw_h + +#include "aymo_score.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +#define AYMO_SCORE_RAW_RAWADATA "RAWADATA" +#define AYMO_SCORE_RAW_REFCLK 1193180L + +struct aymo_score_raw_header { + uint8_t rawadata[8]; + uint16_t clock; +}; + +struct aymo_score_raw_event { + uint8_t data; + uint8_t ctrl; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +struct aymo_score_raw_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_raw_event* events; + uint32_t raw_rate; + uint32_t division; + uint32_t length; + uint32_t index; + uint16_t clock; + uint16_t clock_initial; + uint8_t address_hi; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_raw_vt; + + +AYMO_PUBLIC int aymo_score_raw_ctor( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC void aymo_score_raw_dtor( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC int aymo_score_raw_load( + struct aymo_score_raw_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_raw_unload( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_raw_get_status( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC void aymo_score_raw_restart( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_raw_tick( + struct aymo_score_raw_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_raw_h diff --git a/include/aymo_sys_linux.h b/include/aymo_sys_linux.h new file mode 100644 index 0000000..0673d7d --- /dev/null +++ b/include/aymo_sys_linux.h @@ -0,0 +1,25 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_sys_linux_h +#define _include_aymo_sys_linux_h + +// TODO: + +#endif // _include_aymo_sys_linux_h diff --git a/include/aymo_sys_windows.h b/include/aymo_sys_windows.h new file mode 100644 index 0000000..da75692 --- /dev/null +++ b/include/aymo_sys_windows.h @@ -0,0 +1,25 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_sys_windows_h +#define _include_aymo_sys_windows_h + +// TODO: + +#endif // _include_aymo_sys_windows_h diff --git a/include/aymo_tda8425.h b/include/aymo_tda8425.h new file mode 100644 index 0000000..e372208 --- /dev/null +++ b/include/aymo_tda8425.h @@ -0,0 +1,45 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_tda8425_h +#define _include_aymo_tda8425_h + +#include "aymo_tda8425_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC const struct aymo_tda8425_math* aymo_tda8425_math; + + +AYMO_PUBLIC void aymo_tda8425_boot(const struct aymo_tda8425_math* math); +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_tda8425_get_vt(const char* cpu_ext); +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_tda8425_get_best_vt(void); + +AYMO_PUBLIC uint32_t aymo_tda8425_get_sizeof(struct aymo_tda8425_chip* chip); +AYMO_PUBLIC void aymo_tda8425_ctor(struct aymo_tda8425_chip* chip, float sample_rate); +AYMO_PUBLIC void aymo_tda8425_dtor(struct aymo_tda8425_chip* chip); +AYMO_PUBLIC uint8_t aymo_tda8425_read(struct aymo_tda8425_chip* chip, uint16_t address); +AYMO_PUBLIC void aymo_tda8425_write(struct aymo_tda8425_chip* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_tda8425_process_f32(struct aymo_tda8425_chip* chip, uint32_t count, const float x[], float y[]); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_tda8425_h diff --git a/include/aymo_tda8425_arm_neon.h b/include/aymo_tda8425_arm_neon.h new file mode 100644 index 0000000..4cf36fb --- /dev/null +++ b/include/aymo_tda8425_arm_neon.h @@ -0,0 +1,107 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_tda8425_arm_neon_h +#define _include_aymo_tda8425_arm_neon_h + +#include "aymo_cpu.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_tda8425_arm_neon_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // 128-bit data + vf32x4_t hb1l; + vf32x4_t hb1r; + vf32x4_t hb2l; + vf32x4_t hb2r; + vf32x4_t kb2; + + vf32x4_t ha1l; + vf32x4_t ha1r; + vf32x4_t ha2l; + vf32x4_t ha2r; + vf32x4_t ka2; + + vf32x4_t hb0l; + vf32x4_t hb0r; + vf32x4_t kb1; + + vf32x4_t ha0l; + vf32x4_t ha0r; + vf32x4_t ka1; + + vf32x4_t kb0; + + // 64-bit data + vf32x2_t krl; + vf32x2_t klr; + + vf32x2_t kv; + + // 32-bit data + float sample_rate; // [Hz] + float pseudo_c1; // [F] + float pseudo_c2; // [F] + + // 8-bit data + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; + uint8_t pad32_[2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_tda8425_arm_neon_h diff --git a/include/aymo_tda8425_common.h b/include/aymo_tda8425_common.h new file mode 100644 index 0000000..4803f76 --- /dev/null +++ b/include/aymo_tda8425_common.h @@ -0,0 +1,84 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_common_h +#define _include_aymo_tda8425_common_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +// Object-oriented API + +struct aymo_tda8425_chip; // forward +typedef uint32_t (*aymo_tda8425_get_sizeof_f)(void); +typedef void (*aymo_tda8425_ctor_f)(struct aymo_tda8425_chip* chip, float sample_rate); +typedef void (*aymo_tda8425_dtor_f)(struct aymo_tda8425_chip* chip); +typedef uint8_t (*aymo_tda8425_read_f)(struct aymo_tda8425_chip* chip, uint16_t address); +typedef void (*aymo_tda8425_write_f)(struct aymo_tda8425_chip* chip, uint16_t address, uint8_t value); +typedef void (*aymo_tda8425_process_f32_f)(struct aymo_tda8425_chip* chip, uint32_t count, const float x[], float y[]); + +struct aymo_tda8425_vt { + const char* class_name; + aymo_tda8425_get_sizeof_f get_sizeof; + aymo_tda8425_ctor_f ctor; + aymo_tda8425_dtor_f dtor; + aymo_tda8425_read_f read; + aymo_tda8425_write_f write; + aymo_tda8425_process_f32_f process_f32; +}; + +struct aymo_tda8425_chip { + const struct aymo_tda8425_vt* vt; +}; + + +// Math API + +typedef double (*aymo_tda8425_math1_f)(double a); +typedef double (*aymo_tda8425_math2_f)(double a, double b); + +struct aymo_tda8425_math { + aymo_tda8425_math1_f cos; + aymo_tda8425_math1_f fabs; + aymo_tda8425_math1_f log10; + aymo_tda8425_math2_f pow; + aymo_tda8425_math1_f sqrt; + aymo_tda8425_math1_f tan; +}; + +// Defines the default math functions, after #include +#define AYMO_TDA8425_DEFINE_MATH_DEFAULT(name__) \ + const struct aymo_tda8425_math name__ = { (cos), (fabs), (log10), (pow), (sqrt), (tan) } + + +AYMO_PUBLIC const int8_t aymo_tda8425_reg_v_to_db[64]; +AYMO_PUBLIC const int8_t aymo_tda8425_reg_ba_to_db[16]; +AYMO_PUBLIC const int8_t aymo_tda8425_reg_tr_to_db[16]; + +AYMO_PUBLIC const float aymo_tda8425_pseudo_preset_c1[3]; +AYMO_PUBLIC const float aymo_tda8425_pseudo_preset_c2[3]; + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_tda8425_common_h diff --git a/include/aymo_tda8425_none.h b/include/aymo_tda8425_none.h new file mode 100644 index 0000000..ada3d05 --- /dev/null +++ b/include/aymo_tda8425_none.h @@ -0,0 +1,61 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_none_h +#define _include_aymo_tda8425_none_h + +#include "aymo_cpu.h" + +#include "TDA8425_emu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_NONE_##_token_ +#define aymo_(_token_) aymo_tda8425_none_##_token_ + +#define AYMO_TDA8425_NONE_DELAY 4 + + +struct aymo_(chip) { + TDA8425_Chip emu; + float yh[AYMO_TDA8425_NONE_DELAY][2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_tda8425_none_h diff --git a/include/aymo_tda8425_x86_avx2.h b/include/aymo_tda8425_x86_avx2.h new file mode 100644 index 0000000..618feda --- /dev/null +++ b/include/aymo_tda8425_x86_avx2.h @@ -0,0 +1,100 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_x86_avx2_h +#define _include_aymo_tda8425_x86_avx2_h + +#include "aymo_cpu.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_X86_AVX2_##_token_ +#define aymo_(_token_) aymo_tda8425_x86_avx2_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V256 +struct aymo_(chip) { + // Vector data + vf32x8_t hb1; + vf32x8_t hb2; + vf32x8_t kb2; + + vf32x8_t ha1; + vf32x8_t ha2; + vf32x8_t ka2; + + vf32x8_t hb0; + vf32x8_t kb1; + + vf32x8_t ha0; + vf32x8_t ka1; + + vf32x8_t krl; + vf32x8_t klr; + + vf32x8_t kb0; + + vf32x8_t kv; + + // 32-bit data + float sample_rate; // [Hz] + float pseudo_c1; // [F] + float pseudo_c2; // [F] + + // 8-bit data + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; + uint8_t pad32_[2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#endif // _include_aymo_tda8425_x86_avx2_h diff --git a/include/aymo_tda8425_x86_sse41.h b/include/aymo_tda8425_x86_sse41.h new file mode 100644 index 0000000..8626811 --- /dev/null +++ b/include/aymo_tda8425_x86_sse41.h @@ -0,0 +1,106 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_x86_sse41_h +#define _include_aymo_tda8425_x86_sse41_h + +#include "aymo_cpu.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_tda8425_x86_sse41_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // Vector data + vf32x4_t hb1l; + vf32x4_t hb1r; + vf32x4_t hb2l; + vf32x4_t hb2r; + vf32x4_t kb2; + + vf32x4_t ha1l; + vf32x4_t ha1r; + vf32x4_t ha2l; + vf32x4_t ha2r; + vf32x4_t ka2; + + vf32x4_t hb0l; + vf32x4_t hb0r; + vf32x4_t kb1; + + vf32x4_t ha0l; + vf32x4_t ha0r; + vf32x4_t ka1; + + vf32x4_t krl; + vf32x4_t klr; + + vf32x4_t kb0; + + vf32x4_t kv; + + // 32-bit data + float sample_rate; // [Hz] + float pseudo_c1; // [F] + float pseudo_c2; // [F] + + // 8-bit data + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; + uint8_t pad32_[2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_tda8425_x86_sse41_h diff --git a/include/aymo_wave.h b/include/aymo_wave.h new file mode 100644 index 0000000..bd87fa7 --- /dev/null +++ b/include/aymo_wave.h @@ -0,0 +1,85 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_wave_h +#define _include_aymo_wave_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +#define AYMO_WAVE_FMT_TYPE_PCM 1u +#define AYMO_WAVE_FMT_TYPE_FLOAT 3u + +/* Basic WAVE heading part. 
*/ +struct aymo_wave_heading { + char riff_fourcc[4]; + uint32_t riff_size; + + char wave_fourcc[4]; + + char wave_fmt_fourcc[4]; + uint32_t wave_fmt_size; + uint16_t wave_fmt_type; + uint16_t wave_fmt_channel_count; + uint32_t wave_fmt_sample_rate; + uint32_t wave_fmt_byte_rate; + uint16_t wave_fmt_block_align; + uint16_t wave_fmt_sample_bits; + + char wave_data_fourcc[4]; + uint32_t wave_data_size; + +// sample_t wave_data_samples[...]; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +/* Basic setup of a WAVE heading part. + * + * Made for the common audio formats used with AYMO: + * - around 50 kHz sample rate + * - little-endian + * - 8/16/32-bit signed integers + * - 1/2/4 channel_count + * - up to a few minutes + * + * NOTE: Function arguments are not checked in depth! + * Please make sure they are valid! + */ +AYMO_PUBLIC void aymo_wave_heading_setup( + struct aymo_wave_heading* heading, + uint16_t wave_fmt_type, + uint16_t channel_count, + uint16_t sample_bits, + uint32_t sample_rate, + uint32_t sample_count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_wave_h diff --git a/include/aymo_ym7128.h b/include/aymo_ym7128.h new file mode 100644 index 0000000..c885ab1 --- /dev/null +++ b/include/aymo_ym7128.h @@ -0,0 +1,46 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_ym7128_h +#define _include_aymo_ym7128_h + +#include "aymo_ym7128_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +#define AYMO_YM7128_SAMPLE_RATE_IN 23550 // [Hz] +#define AYMO_YM7128_SAMPLE_RATE_OUT 47100 // [Hz] + + +AYMO_PUBLIC void aymo_ym7128_boot(void); +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_ym7128_get_vt(const char* cpu_ext); +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_ym7128_get_best_vt(void); + +AYMO_PUBLIC uint32_t aymo_ym7128_get_sizeof(struct aymo_ym7128_chip* chip); +AYMO_PUBLIC void aymo_ym7128_ctor(struct aymo_ym7128_chip* chip); +AYMO_PUBLIC void aymo_ym7128_dtor(struct aymo_ym7128_chip* chip); +AYMO_PUBLIC uint8_t aymo_ym7128_read(struct aymo_ym7128_chip* chip, uint16_t address); +AYMO_PUBLIC void aymo_ym7128_write(struct aymo_ym7128_chip* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_ym7128_process_i16(struct aymo_ym7128_chip* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ym7128_h diff --git a/include/aymo_ym7128_arm_neon.h b/include/aymo_ym7128_arm_neon.h new file mode 100644 index 0000000..70bc002 --- /dev/null +++ b/include/aymo_ym7128_arm_neon.h @@ -0,0 +1,93 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
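A short sketch of how the heading above is typically emitted ahead of the sample data. The file name is a placeholder, and sample_count is assumed to be the total number of int16_t samples (frames times channels); the exact convention should be checked against aymo_wave.c.

#include "aymo_wave.h"

#include <stdint.h>
#include <stdio.h>

static int example_write_wave(const int16_t samples[], uint32_t sample_count)
{
    struct aymo_wave_heading heading;
    aymo_wave_heading_setup(&heading, AYMO_WAVE_FMT_TYPE_PCM,
                            2u /* channels */, 16u /* bits */,
                            44100u /* Hz */, sample_count);

    FILE* file = fopen("example.wav", "wb");  // placeholder path
    if (file == NULL) {
        return -1;
    }
    fwrite(&heading, sizeof(heading), 1u, file);
    fwrite(samples, sizeof(int16_t), sample_count, file);
    fclose(file);
    return 0;
}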
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_ym7128_arm_neon_h +#define _include_aymo_ym7128_arm_neon_h + +#include "aymo_cpu.h" +#include "aymo_ym7128_common.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YM7128_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_ym7128_arm_neon_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // 128-bit data + int16_t xxv[8]; + vi16x8_t kk1; + vi16x8_t kk2; + vi16x8_t kkm; + int16_t tiv[8]; + vi16x8_t kgl; + vi16x8_t kgr; + vi16x8_t kv; + + vi16x8_t zc; + vi16x8_t zb; + vi16x8_t kf; + vi16x8_t ke; + vi16x8_t za; + vi16x8_t kd; + vi16x8_t kc; + vi16x8_t kb; + vi16x8_t ka; + + // 16-bit data + int16_t uh[AYMO_YM7128_DELAY_LENGTH]; + + // 8-bit data + uint8_t regs[AYMO_YM7128_REG_COUNT]; + + uint8_t pad32_[3]; +}; + + +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_ym7128_arm_neon_h diff --git a/include/aymo_ym7128_common.h b/include/aymo_ym7128_common.h new file mode 100644 index 0000000..9a288b3 --- /dev/null +++ b/include/aymo_ym7128_common.h @@ -0,0 +1,118 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
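As a sizing cross-check on the uh[AYMO_YM7128_DELAY_LENGTH] buffer above: 2356 mono samples at the 23550 Hz input rate is roughly 100 ms of echo history (4712 bytes of int16_t storage per chip instance). A purely illustrative compile-time check, assuming the constants from aymo_ym7128.h and aymo_ym7128_common.h:

#include "aymo_ym7128.h"

/* 2356 samples / 23550 Hz ~= 0.100 s; the integer expression below evaluates to 100. */
typedef char aymo_example_delay_is_about_100ms
    [((AYMO_YM7128_DELAY_LENGTH * 1000) / AYMO_YM7128_SAMPLE_RATE_IN) == 100 ? 1 : -1];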
+*/
+#ifndef _include_aymo_ym7128_common_h
+#define _include_aymo_ym7128_common_h
+
+#include "aymo_cc.h"
+
+#include
+#include
+
+AYMO_CXX_EXTERN_C_BEGIN
+
+
+// Object-oriented API
+
+struct aymo_ym7128_chip;  // forward
+typedef uint32_t (*aymo_ym7128_get_sizeof_f)(void);
+typedef void (*aymo_ym7128_ctor_f)(struct aymo_ym7128_chip* chip);
+typedef void (*aymo_ym7128_dtor_f)(struct aymo_ym7128_chip* chip);
+typedef uint8_t (*aymo_ym7128_read_f)(struct aymo_ym7128_chip* chip, uint16_t address);
+typedef void (*aymo_ym7128_write_f)(struct aymo_ym7128_chip* chip, uint16_t address, uint8_t value);
+typedef void (*aymo_ym7128_process_i16_f)(struct aymo_ym7128_chip* chip, uint32_t count, const int16_t x[], int16_t y[]);
+
+struct aymo_ym7128_vt {
+    const char* class_name;
+    aymo_ym7128_get_sizeof_f get_sizeof;
+    aymo_ym7128_ctor_f ctor;
+    aymo_ym7128_dtor_f dtor;
+    aymo_ym7128_read_f read;
+    aymo_ym7128_write_f write;
+    aymo_ym7128_process_i16_f process_i16;
+};
+
+struct aymo_ym7128_chip {
+    const struct aymo_ym7128_vt* vt;
+};
+
+
+#define AYMO_YM7128_REG_COUNT      31
+#define AYMO_YM7128_GAIN_BITS      6
+#define AYMO_YM7128_GAIN_COUNT     64
+#define AYMO_YM7128_TAP_BITS       5
+#define AYMO_YM7128_TAP_COUNT      32
+#define AYMO_YM7128_COEFF_BITS     6
+#define AYMO_YM7128_KERNEL_LENGTH  19
+#define AYMO_YM7128_DELAY_LENGTH   2356
+#define AYMO_YM7128_GAIN_UNIT      0x7FFF
+#define AYMO_YM7128_GAIN_MASK      0xFFF0
+#define AYMO_YM7128_SIGNAL_BITS    14
+#define AYMO_YM7128_SIGNAL_MASK    0xFFFC
+
+
+enum aymo_ym7128_reg {
+    aymo_ym7128_reg_gl1 = 0,
+    aymo_ym7128_reg_gl2,
+    aymo_ym7128_reg_gl3,
+    aymo_ym7128_reg_gl4,
+    aymo_ym7128_reg_gl5,
+    aymo_ym7128_reg_gl6,
+    aymo_ym7128_reg_gl7,
+    aymo_ym7128_reg_gl8,
+
+    aymo_ym7128_reg_gr1,
+    aymo_ym7128_reg_gr2,
+    aymo_ym7128_reg_gr3,
+    aymo_ym7128_reg_gr4,
+    aymo_ym7128_reg_gr5,
+    aymo_ym7128_reg_gr6,
+    aymo_ym7128_reg_gr7,
+    aymo_ym7128_reg_gr8,
+
+    aymo_ym7128_reg_vm,
+    aymo_ym7128_reg_vc,
+
+    aymo_ym7128_reg_vl,
+    aymo_ym7128_reg_vr,
+
+    aymo_ym7128_reg_c0,
+    aymo_ym7128_reg_c1,
+
+    aymo_ym7128_reg_t0,
+    aymo_ym7128_reg_t1,
+    aymo_ym7128_reg_t2,
+    aymo_ym7128_reg_t3,
+    aymo_ym7128_reg_t4,
+    aymo_ym7128_reg_t5,
+    aymo_ym7128_reg_t6,
+    aymo_ym7128_reg_t7,
+    aymo_ym7128_reg_t8
+};
+
+
+AYMO_PUBLIC const int16_t aymo_ym7128_gain[AYMO_YM7128_GAIN_COUNT];
+AYMO_PUBLIC const int16_t aymo_ym7128_tap[AYMO_YM7128_TAP_COUNT];
+AYMO_PUBLIC const int16_t aymo_ym7128_kernel_linear[AYMO_YM7128_KERNEL_LENGTH];
+AYMO_PUBLIC const int16_t aymo_ym7128_kernel_minphase[AYMO_YM7128_KERNEL_LENGTH];
+
+
+AYMO_CXX_EXTERN_C_END
+
+#endif // _include_aymo_ym7128_common_h
diff --git a/include/aymo_ym7128_none.h b/include/aymo_ym7128_none.h
new file mode 100644
index 0000000..a9212ce
--- /dev/null
+++ b/include/aymo_ym7128_none.h
@@ -0,0 +1,61 @@
+/*
+AYMO - Accelerated YaMaha Operator
+Copyright (c) 2023-2024 Andrea Zoppi.
+
+This file is part of AYMO.
+
+AYMO is free software: you can redistribute it and/or modify it under the
+terms of the GNU Lesser General Public License as published by the Free
+Software Foundation, either version 2.1 of the License, or (at your option)
+any later version.
+
+AYMO is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with AYMO. If not, see .
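The vt structure above is what the dispatcher in aymo_ym7128.h returns from aymo_ym7128_get_vt() / aymo_ym7128_get_best_vt(). A plausible caller-side sketch follows; it assumes the constructor reached through the vt installs chip->vt itself, which is not visible in this header and should be treated as an assumption.

#include "aymo_ym7128.h"

#include <stdlib.h>

static struct aymo_ym7128_chip* example_ym7128_new(void)
{
    aymo_ym7128_boot();  // assumed one-time library / CPU-feature initialization
    const struct aymo_ym7128_vt* vt = aymo_ym7128_get_best_vt();
    if (vt == NULL) {
        return NULL;
    }
    struct aymo_ym7128_chip* chip = malloc(vt->get_sizeof());
    if (chip != NULL) {
        vt->ctor(chip);  // assumed to set chip->vt to the selected backend
    }
    return chip;
}

After that, the generic wrappers (aymo_ym7128_write(), aymo_ym7128_process_i16(), and so on) presumably forward through chip->vt.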
+*/ +#ifndef _include_aymo_ym7128_none_h +#define _include_aymo_ym7128_none_h + +#include "aymo_cpu.h" +#include "aymo_ym7128_common.h" + +#include "YM7128B_emu.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YM7128_NONE_##_token_ +#define aymo_(_token_) aymo_ym7128_none_##_token_ + + +struct aymo_(chip) { + YM7128B_ChipFixed emu; +}; + + +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ym7128_none_h diff --git a/include/aymo_ym7128_x86_sse41.h b/include/aymo_ym7128_x86_sse41.h new file mode 100644 index 0000000..236deba --- /dev/null +++ b/include/aymo_ym7128_x86_sse41.h @@ -0,0 +1,93 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
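The scalar backend above can also be driven directly, since its chip structure is a complete type. Register addresses come from enum aymo_ym7128_reg in aymo_ym7128_common.h; the value written below is an arbitrary placeholder, as the gain/tap encodings are not restated in this header.

#include "aymo_ym7128_none.h"
#include <stdint.h>

static void example_ym7128_none(const int16_t x[], int16_t y[], uint32_t count)
{
    struct aymo_ym7128_none_chip chip;
    aymo_ym7128_none_ctor(&chip);
    aymo_ym7128_none_write(&chip, (uint16_t)aymo_ym7128_reg_vm, 0x3Fu);  // placeholder value
    aymo_ym7128_none_process_i16(&chip, count, x, y);
    aymo_ym7128_none_dtor(&chip);
}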
+*/ +#ifndef _include_aymo_ym7128_x86_sse41_h +#define _include_aymo_ym7128_x86_sse41_h + +#include "aymo_cpu.h" +#include "aymo_ym7128_common.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YM7128_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_ym7128_x86_sse41_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // Vector data + int16_t xxv[8]; + vi16x8_t kk1; + vi16x8_t kk2; + vi16x8_t kkm; + int16_t tiv[8]; + vi16x8_t kgl; + vi16x8_t kgr; + vi16x8_t kv; + + vi16x8_t zc; + vi16x8_t zb; + vi16x8_t kf; + vi16x8_t ke; + vi16x8_t za; + vi16x8_t kd; + vi16x8_t kc; + vi16x8_t kb; + vi16x8_t ka; + + // 16-bit data + int16_t uh[AYMO_YM7128_DELAY_LENGTH]; + + // 8-bit data + uint8_t regs[AYMO_YM7128_REG_COUNT]; + + uint8_t pad32_[3]; +}; + + +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_ym7128_x86_sse41_h diff --git a/include/aymo_ymf262.h b/include/aymo_ymf262.h new file mode 100644 index 0000000..1d46004 --- /dev/null +++ b/include/aymo_ymf262.h @@ -0,0 +1,56 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/
+#ifndef _include_aymo_ymf262_h
+#define _include_aymo_ymf262_h
+
+#include "aymo_ymf262_common.h"
+
+AYMO_CXX_EXTERN_C_BEGIN
+
+
+#define AYMO_YMF262_SLOT_NUM      36
+#define AYMO_YMF262_CHANNEL_NUM   18
+#define AYMO_YMF262_CONN_NUM_MAX  6
+
+#define AYMO_YMF262_SAMPLE_RATE   49716  // [Hz]
+
+
+AYMO_PUBLIC void aymo_ymf262_boot(void);
+AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_ymf262_get_vt(const char* cpu_ext);
+AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_ymf262_get_best_vt(void);
+
+AYMO_PUBLIC uint32_t aymo_ymf262_get_sizeof(struct aymo_ymf262_chip* chip);
+AYMO_PUBLIC void aymo_ymf262_ctor(struct aymo_ymf262_chip* chip);
+AYMO_PUBLIC void aymo_ymf262_dtor(struct aymo_ymf262_chip* chip);
+AYMO_PUBLIC uint8_t aymo_ymf262_read(struct aymo_ymf262_chip* chip, uint16_t address);
+AYMO_PUBLIC void aymo_ymf262_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value);
+AYMO_PUBLIC int aymo_ymf262_enqueue_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value);
+AYMO_PUBLIC int aymo_ymf262_enqueue_delay(struct aymo_ymf262_chip* chip, uint32_t count);
+AYMO_PUBLIC int16_t aymo_ymf262_get_output(struct aymo_ymf262_chip* chip, uint8_t channel);
+AYMO_PUBLIC void aymo_ymf262_tick(struct aymo_ymf262_chip* chip, uint32_t count);
+AYMO_PUBLIC void aymo_ymf262_generate_i16x2(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]);
+AYMO_PUBLIC void aymo_ymf262_generate_i16x4(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]);
+AYMO_PUBLIC void aymo_ymf262_generate_f32x2(struct aymo_ymf262_chip* chip, uint32_t count, float y[]);
+AYMO_PUBLIC void aymo_ymf262_generate_f32x4(struct aymo_ymf262_chip* chip, uint32_t count, float y[]);
+
+
+AYMO_CXX_EXTERN_C_END
+
+#endif // _include_aymo_ymf262_h
diff --git a/include/aymo_ymf262_arm_neon.h b/include/aymo_ymf262_arm_neon.h
new file mode 100644
index 0000000..b7a6116
--- /dev/null
+++ b/include/aymo_ymf262_arm_neon.h
@@ -0,0 +1,333 @@
+/*
+AYMO - Accelerated YaMaha Operator
+Copyright (c) 2023-2024 Andrea Zoppi.
+
+This file is part of AYMO.
+
+AYMO is free software: you can redistribute it and/or modify it under the
+terms of the GNU Lesser General Public License as published by the Free
+Software Foundation, either version 2.1 of the License, or (at your option)
+any later version.
+
+AYMO is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with AYMO. If not, see .
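A usage sketch for the front-end API above. The buffer sizing and the register write reflect common OPL3 conventions rather than anything stated in this header: generate_i16x2 is read as emitting two interleaved int16_t samples per frame at AYMO_YMF262_SAMPLE_RATE (about 49.7 kHz), and address 0x105 / value 0x01 sets the NEW bit (OPL3 mode) per the reg_105h layout in aymo_ymf262_common.h. The assumption that vt->ctor() installs chip->vt for the generic wrappers is likewise not visible here.

#include "aymo_ymf262.h"

#include <stdlib.h>

static void example_ymf262_render(int16_t stereo[], uint32_t frames)
{
    aymo_ymf262_boot();
    const struct aymo_ymf262_vt* vt = aymo_ymf262_get_best_vt();
    struct aymo_ymf262_chip* chip = (vt != NULL) ? malloc(vt->get_sizeof()) : NULL;
    if (chip == NULL) {
        return;
    }
    vt->ctor(chip);  // assumed to install chip->vt for the generic wrappers

    aymo_ymf262_enqueue_write(chip, 0x0105u, 0x01u);   // NEW = 1: enable OPL3 mode
    aymo_ymf262_generate_i16x2(chip, frames, stereo);  // needs 2 * frames int16_t samples

    aymo_ymf262_dtor(chip);
    free(chip);
}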
+*/ +#ifndef _include_aymo_ymf262_arm_neon_h +#define _include_aymo_ymf262_arm_neon_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_ymf262_arm_neon_##_token_ + + +#define AYMO_YMF262_ARM_NEON_SLOT_NUM_MAX 64 +#define AYMO_YMF262_ARM_NEON_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_ARM_NEON_SLOT_GROUP_NUM 8 +#define AYMO_YMF262_ARM_NEON_SLOT_GROUP_LENGTH 8 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_shl; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_ARM_NEON_REG_QUEUE_LENGTH +#define AYMO_YMF262_ARM_NEON_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_ARM_NEON_REG_QUEUE_LATENCY +#define AYMO_YMF262_ARM_NEON_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_ARM_NEON_EG_GEN_ATTACK 0 +#define AYMO_YMF262_ARM_NEON_EG_GEN_DECAY 1 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_ARM_NEON_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_ATTACK 0 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_DECAY 4 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_SUSTAIN 8 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_RELEASE 12 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_ARM_NEON_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_ARM_NEON_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x8_t eg_rout; + vi16x8_t eg_tremolo_am; + vi16x8_t eg_ksl_sh_tl_x4; + vi32x4_t pg_phase_lo; + vi32x4_t pg_phase_hi; + vi16x8_t pg_phase_out; + vi16x8_t eg_gen; + vi16x8_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x8_t eg_gen_shl; // depends on reg_type for reg_sr + vi16x8_t eg_adsr; // struct aymo_(eg_adsr) + vi16x8_t eg_ks; + vi32x4_t pg_deltafreq_lo; + vi32x4_t pg_deltafreq_hi; + vi16x8_t wg_out; + vi16x8_t wg_prout; + vi16x8_t wg_fb_shs; // signed + vi16x8_t wg_prmod_gate; + vi16x8_t wg_fbmod_gate; + vi16x8_t wg_phase_shl; + vi16x8_t wg_phase_zero; + vi16x8_t wg_phase_flip; + vi16x8_t wg_phase_mask; + vi16x8_t wg_sine_gate; + vi16x8_t eg_out; + vi16x8_t wg_phase_neg; + vi16x8_t eg_sl; + vi16x8_t og_prout; + vi16x8_t og_prout_ac; + vi16x8_t og_prout_bd; + vi16x8_t og_out_ch_gate_a; + vi16x8_t og_out_ch_gate_c; + vi16x8_t og_out_ch_gate_b; + vi16x8_t og_out_ch_gate_d; + + // Updated infrequently + vi16x8_t pg_vib; + vi16x8_t pg_mult_x2; + + // Updated only by writing registers + vi16x8_t eg_am; + vi16x8_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x8_t eg_ksl; + vi16x8_t eg_rate; + vi16x8_t eg_inc; + vi16x8_t wg_fbmod; + 
vi16x8_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x8_t pg_fnum; + vi16x8_t pg_block; + + // Updated only by writing registers + vi16x8_t eg_ksv; + + vi16x8_t og_ch_gate_a; + vi16x8_t og_ch_gate_b; + vi16x8_t og_ch_gate_c; + vi16x8_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + + // 128-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x8_t eg_add; + vi16x8_t wg_mod; + vi16x8_t eg_incstep; + vi16x8_t og_acc_a; + vi16x8_t og_acc_c; + vi16x8_t og_acc_b; + vi16x8_t og_acc_d; + vi16x4_t og_out; // coupled 64-bit variables + vi16x4_t og_old; // coupled 64-bit variables + + vi16x8_t pg_vib_shs; // signed + vi16x8_t pg_vib_sign; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t process_all_slots; + uint8_t pad32_[1]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[8]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[8]; + +AYMO_PUBLIC const int16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, 
int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ +// return (((sgi / 4) * 2) | (sgi % 2)); + return (((sgi >> 1) & 2) | (sgi & 1)); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_ymf262_arm_neon_h diff --git a/include/aymo_ymf262_common.h b/include/aymo_ymf262_common.h new file mode 100644 index 0000000..3cd10b0 --- /dev/null +++ b/include/aymo_ymf262_common.h @@ -0,0 +1,230 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
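To make the slot-group to channel-group mapping of aymo_ymf262_arm_neon_sgi_to_cgi() (defined a little above) concrete, the bit expression produces the table below for the 8 slot groups; the loop is illustrative only.

#include <stdio.h>

/* sgi : 0 1 2 3 4 5 6 7
 * cgi : 0 1 0 1 2 3 2 3
 * i.e. slot groups {0,2} feed channel group 0, {1,3} -> 1, {4,6} -> 2, {5,7} -> 3,
 * matching the commented-out form ((sgi / 4) * 2) | (sgi % 2).
 */
static void example_print_sgi_to_cgi(void)
{
    for (int sgi = 0; sgi < 8; ++sgi) {
        int cgi = (((sgi >> 1) & 2) | (sgi & 1));
        printf("sgi %d -> cgi %d\n", sgi, cgi);
    }
}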
+*/ +#ifndef _include_aymo_ymf262_common_h +#define _include_aymo_ymf262_common_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +// Object-oriented API + +struct aymo_ymf262_chip; // forward +typedef uint32_t (*aymo_ymf262_get_sizeof_f)(void); +typedef void (*aymo_ymf262_ctor_f)(struct aymo_ymf262_chip* chip); +typedef void (*aymo_ymf262_dtor_f)(struct aymo_ymf262_chip* chip); +typedef uint8_t (*aymo_ymf262_read_f)(struct aymo_ymf262_chip* chip, uint16_t address); +typedef void (*aymo_ymf262_write_f)(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value); +typedef int (*aymo_ymf262_enqueue_write_f)(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value); +typedef int (*aymo_ymf262_enqueue_delay_f)(struct aymo_ymf262_chip* chip, uint32_t count); +typedef int16_t (*aymo_ymf262_get_output_f)(struct aymo_ymf262_chip* chip, uint8_t channel); +typedef void (*aymo_ymf262_tick_f)(struct aymo_ymf262_chip* chip, uint32_t count); +typedef void (*aymo_ymf262_generate_i16x2_f)(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]); +typedef void (*aymo_ymf262_generate_i16x4_f)(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]); +typedef void (*aymo_ymf262_generate_f32x2_f)(struct aymo_ymf262_chip* chip, uint32_t count, float y[]); +typedef void (*aymo_ymf262_generate_f32x4_f)(struct aymo_ymf262_chip* chip, uint32_t count, float y[]); + +struct aymo_ymf262_vt { + const char* class_name; + aymo_ymf262_get_sizeof_f get_sizeof; + aymo_ymf262_ctor_f ctor; + aymo_ymf262_dtor_f dtor; + aymo_ymf262_read_f read; + aymo_ymf262_write_f write; + aymo_ymf262_enqueue_write_f enqueue_write; + aymo_ymf262_enqueue_delay_f enqueue_delay; + aymo_ymf262_get_output_f get_output; + aymo_ymf262_tick_f tick; + aymo_ymf262_generate_i16x2_f generate_i16x2; + aymo_ymf262_generate_i16x4_f generate_i16x4; + aymo_ymf262_generate_f32x2_f generate_f32x2; + aymo_ymf262_generate_f32x4_f generate_f32x4; +}; + +struct aymo_ymf262_chip { + const struct aymo_ymf262_vt* vt; +}; + + +// Limits +#define AYMO_YMF262_SLOT_NUM_MAX 64 +#define AYMO_YMF262_CHANNEL_NUM_MAX 32 + + +// Registers; little-endian bitfields +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +AYMO_PRAGMA_PACK_PUSH_1 + + +struct aymo_ymf262_reg_01h { + uint8_t lsitest_lo : 8; +}; +struct aymo_ymf262_reg_101h { + uint8_t lsitest_hi : 6; + uint8_t _7_6 : 2; +}; +struct aymo_ymf262_reg_02h { + uint8_t timer1 : 8; +}; +struct aymo_ymf262_reg_03h { + uint8_t timer2 : 8; +}; +struct aymo_ymf262_reg_04h { + uint8_t st1 : 1; + uint8_t st2 : 1; + uint8_t _4_2 : 3; + uint8_t mt2 : 1; + uint8_t mt1 : 1; + uint8_t rst : 1; +}; +struct aymo_ymf262_reg_104h { + uint8_t conn : 6; + uint8_t _7_6 : 2; +}; +struct aymo_ymf262_reg_105h { + uint8_t newm : 1; + uint8_t stereo : 1; + uint8_t _7_2 : 6; +}; +struct aymo_ymf262_reg_08h { + uint8_t _5_0 : 6; + uint8_t nts : 1; + uint8_t csm : 1; +}; +struct aymo_ymf262_reg_20h { + uint8_t mult : 4; + uint8_t ksr : 1; + uint8_t egt : 1; + uint8_t vib : 1; + uint8_t am : 1; +}; +struct aymo_ymf262_reg_40h { + uint8_t tl : 6; + uint8_t ksl : 2; +}; +struct aymo_ymf262_reg_60h { + uint8_t dr : 4; + uint8_t ar : 4; +}; +struct aymo_ymf262_reg_80h { + uint8_t rr : 4; + uint8_t sl : 4; +}; +struct aymo_ymf262_reg_A0h { + uint8_t fnum_lo : 8; +}; +struct aymo_ymf262_reg_B0h { + uint8_t fnum_hi : 2; + uint8_t block : 3; + uint8_t kon : 1; + uint8_t _7_6 : 2; +}; +struct aymo_ymf262_reg_BDh { + uint8_t hh : 1; + uint8_t tc : 1; + uint8_t tom : 1; + uint8_t sd : 1; + uint8_t bd : 1; + uint8_t ryt : 1; + 
uint8_t dvb : 1; + uint8_t dam : 1; +}; +struct aymo_ymf262_reg_C0h { + uint8_t cnt : 1; + uint8_t fb : 3; + uint8_t cha : 1; + uint8_t chb : 1; + uint8_t chc : 1; + uint8_t chd : 1; +}; +struct aymo_ymf262_reg_E0h { + uint8_t ws : 3; + uint8_t _7_3 : 5; +}; + +struct aymo_ymf262_chip_regs { + struct aymo_ymf262_reg_01h reg_01h; + struct aymo_ymf262_reg_02h reg_02h; + struct aymo_ymf262_reg_03h reg_03h; + struct aymo_ymf262_reg_04h reg_04h; + struct aymo_ymf262_reg_08h reg_08h; + struct aymo_ymf262_reg_BDh reg_BDh; + struct aymo_ymf262_reg_101h reg_101h; + struct aymo_ymf262_reg_104h reg_104h; + struct aymo_ymf262_reg_105h reg_105h; + uint8_t _pad32[3]; +}; + +struct aymo_ymf262_slot_regs { + struct aymo_ymf262_reg_20h reg_20h; + struct aymo_ymf262_reg_40h reg_40h; + struct aymo_ymf262_reg_60h reg_60h; + struct aymo_ymf262_reg_80h reg_80h; + struct aymo_ymf262_reg_E0h reg_E0h; + uint8_t _pad32[3]; +}; + +struct aymo_ymf262_chan_regs { + struct aymo_ymf262_reg_A0h reg_A0h; + struct aymo_ymf262_reg_B0h reg_B0h; + struct aymo_ymf262_reg_C0h reg_C0h; + struct aymo_ymf262_reg_C0h reg_D0h; +}; + + +// Packed ADSR register values +struct aymo_ymf262_adsr { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; + + +AYMO_PRAGMA_PACK_POP + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int16_t aymo_ymf262_exp_x2_table[256 + 4]; +AYMO_PUBLIC const int16_t aymo_ymf262_logsin_table[256 + 4]; + +AYMO_PUBLIC const int8_t aymo_ymf262_word_to_slot[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_slot_to_word[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_word_to_ch2x[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch2x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 2][2/* slot */]; +AYMO_PUBLIC const int8_t aymo_ymf262_word_to_ch4x[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch4x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 4][4/* slot */]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch4x_to_pair[AYMO_YMF262_CHANNEL_NUM_MAX / 2][2/* slot */]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch2x_paired[AYMO_YMF262_CHANNEL_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_subaddr_to_slot[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_subaddr_to_ch2x[AYMO_YMF262_CHANNEL_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_pg_mult_x2_table[16]; +AYMO_PUBLIC const int8_t aymo_ymf262_eg_ksl_table[16]; +AYMO_PUBLIC const int8_t aymo_ymf262_eg_kslsh_table[4]; + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ymf262_common_h diff --git a/include/aymo_ymf262_none.h b/include/aymo_ymf262_none.h new file mode 100644 index 0000000..e2e7f10 --- /dev/null +++ b/include/aymo_ymf262_none.h @@ -0,0 +1,79 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
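Because the register structs above are declared under the little-endian scalar-storage-order and pack(1) pragmas, a raw register byte can be viewed through them directly. A small decoding sketch for B0h (F-Number high bits, block, key-on), using memcpy to sidestep aliasing questions:

#include "aymo_ymf262_common.h"

#include <string.h>

static void example_decode_b0h(uint8_t raw)
{
    struct aymo_ymf262_reg_B0h b0h;
    memcpy(&b0h, &raw, sizeof(b0h));  // the packed bitfields make this struct 1 byte wide

    unsigned fnum_hi = b0h.fnum_hi;   // bits 1..0: F-Number bits 9..8
    unsigned block   = b0h.block;     // bits 4..2: octave block
    unsigned kon     = b0h.kon;       // bit 5: key-on
    (void)fnum_hi; (void)block; (void)kon;
}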
+*/ +#ifndef _include_aymo_ymf262_none_h +#define _include_aymo_ymf262_none_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include "opl3.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_NONE_##_token_ +#define aymo_(_token_) aymo_ymf262_none_##_token_ + + +#define AYMO_YMF262_NONE_SLOT_NUM_MAX AYMO_YMF262_SLOT_NUM +#define AYMO_YMF262_NONE_CHANNEL_NUM_MAX AYMO_YMF262_CHANNEL_NUM +#define AYMO_YMF262_NONE_SLOT_GROUP_NUM AYMO_YMF262_SLOT_NUM +#define AYMO_YMF262_NONE_SLOT_GROUP_LENGTH 1 + + +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + int16_t outs[4]; + opl3_chip opl3; +}; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ymf262_none_h diff --git a/include/aymo_ymf262_x86_avx.h b/include/aymo_ymf262_x86_avx.h new file mode 100644 index 0000000..b808f5c --- /dev/null +++ b/include/aymo_ymf262_x86_avx.h @@ -0,0 +1,333 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_ymf262_x86_avx_h +#define _include_aymo_ymf262_x86_avx_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_X86_AVX + +AYMO_CXX_EXTERN_C_BEGIN + + +// YMF262 via x86 AVX is actually the SSE4.1 code compiled with VEX prefix +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_X86_AVX_##_token_ +#define aymo_(_token_) aymo_ymf262_x86_avx_##_token_ + + +#define AYMO_YMF262_X86_AVX_SLOT_NUM_MAX 64 +#define AYMO_YMF262_X86_AVX_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_X86_AVX_SLOT_GROUP_NUM 8 +#define AYMO_YMF262_X86_AVX_SLOT_GROUP_LENGTH 8 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_mullo; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_X86_AVX_REG_QUEUE_LENGTH +#define AYMO_YMF262_X86_AVX_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_X86_AVX_REG_QUEUE_LATENCY +#define AYMO_YMF262_X86_AVX_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_X86_AVX_EG_GEN_ATTACK 0 +#define AYMO_YMF262_X86_AVX_EG_GEN_DECAY 1 +#define AYMO_YMF262_X86_AVX_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_X86_AVX_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_ATTACK (1 << 0) +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_DECAY (1 << 4) +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_SUSTAIN (1 << 8) +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_RELEASE (1 << 12) +#define AYMO_YMF262_X86_AVX_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_X86_AVX_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_X86_AVX_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x8_t eg_rout; + vi16x8_t eg_tremolo_am; + vi16x8_t eg_ksl_sh_tl_x4; + vi32x4_t pg_phase_lo; + vi32x4_t pg_phase_hi; + vi16x8_t pg_phase_out; + vi16x8_t eg_gen; + vi16x8_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x8_t eg_gen_mullo; // depends on reg_type for reg_sr + vi16x8_t eg_adsr; // struct aymo_(eg_adsr) + vi16x8_t eg_ks; + vi32x4_t pg_deltafreq_lo; + vi32x4_t pg_deltafreq_hi; + vi16x8_t wg_out; + vi16x8_t wg_prout; + vi16x8_t wg_fb_mulhi; + vi16x8_t wg_prmod_gate; + vi16x8_t wg_fbmod_gate; + vi16x8_t wg_phase_mullo; + vi16x8_t wg_phase_zero; + vi16x8_t wg_phase_flip; + vi16x8_t wg_phase_mask; + vi16x8_t wg_sine_gate; + vi16x8_t eg_out; + vi16x8_t wg_phase_neg; + vi16x8_t eg_sl; + vi16x8_t og_prout; + vi16x8_t og_prout_ac; + vi16x8_t og_prout_bd; + vi16x8_t og_out_ch_gate_a; + vi16x8_t og_out_ch_gate_c; + vi16x8_t og_out_ch_gate_b; + vi16x8_t og_out_ch_gate_d; + + // Updated infrequently + vi16x8_t pg_vib; + vi16x8_t pg_mult_x2; + + // Updated only by writing registers + 
vi16x8_t eg_am; + vi16x8_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x8_t eg_ksl; + vi16x8_t eg_rate; + vi16x8_t eg_inc; + vi16x8_t wg_fbmod; + vi16x8_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x8_t pg_fnum; + vi16x8_t pg_block; + + // Updated only by writing registers + vi16x8_t eg_ksv; + + vi16x8_t og_ch_gate_a; + vi16x8_t og_ch_gate_b; + vi16x8_t og_ch_gate_c; + vi16x8_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + + // 128-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x8_t eg_add; + vi16x8_t wg_mod; + vu16x8_t eg_incstep; + vi16x8_t og_acc_a; + vi16x8_t og_acc_c; + vi16x8_t og_acc_b; + vi16x8_t og_acc_d; + vi16x8_t og_out; + + vi16x8_t pg_vib_mulhi; + vi16x8_t pg_vib_neg; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t process_all_slots; + uint8_t pad32_[1]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[8]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[8]; + +AYMO_PUBLIC const uint16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); 
+AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ +// return (((sgi / 4) * 2) | (sgi % 2)); + return (((sgi >> 1) & 2) | (sgi & 1)); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX + +#endif // _include_aymo_ymf262_x86_avx_h diff --git a/include/aymo_ymf262_x86_avx2.h b/include/aymo_ymf262_x86_avx2.h new file mode 100644 index 0000000..98afc70 --- /dev/null +++ b/include/aymo_ymf262_x86_avx2.h @@ -0,0 +1,332 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_ymf262_x86_avx2_h +#define _include_aymo_ymf262_x86_avx2_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_X86_AVX2_##_token_ +#define aymo_(_token_) aymo_ymf262_x86_avx2_##_token_ + + +#define AYMO_YMF262_X86_AVX2_SLOT_NUM_MAX 64 +#define AYMO_YMF262_X86_AVX2_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_X86_AVX2_SLOT_GROUP_NUM 4 +#define AYMO_YMF262_X86_AVX2_SLOT_GROUP_LENGTH 16 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_mullo; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_X86_AVX2_REG_QUEUE_LENGTH +#define AYMO_YMF262_X86_AVX2_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_X86_AVX2_REG_QUEUE_LATENCY +#define AYMO_YMF262_X86_AVX2_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_X86_AVX2_EG_GEN_ATTACK 0 +#define AYMO_YMF262_X86_AVX2_EG_GEN_DECAY 1 +#define AYMO_YMF262_X86_AVX2_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_X86_AVX2_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_ATTACK (1 << 0) +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_DECAY (1 << 4) +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_SUSTAIN (1 << 8) +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_RELEASE (1 << 12) +#define AYMO_YMF262_X86_AVX2_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_X86_AVX2_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_X86_AVX2_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V256 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x16_t eg_rout; + vi16x16_t eg_tremolo_am; + vi16x16_t eg_ksl_sh_tl_x4; + vi32x8_t pg_phase_lo; + vi32x8_t pg_phase_hi; + vi16x16_t pg_phase_out; + vi16x16_t eg_gen; + vi16x16_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x16_t eg_gen_mullo; // depends on reg_type for reg_sr + vi16x16_t eg_adsr; // struct aymo_(eg_adsr) + vi16x16_t eg_ks; + vi32x8_t pg_deltafreq_lo; + vi32x8_t pg_deltafreq_hi; + vi16x16_t wg_out; + vi16x16_t wg_prout; + vi16x16_t wg_fb_mulhi; + vi16x16_t wg_prmod_gate; + vi16x16_t wg_fbmod_gate; + vi16x16_t wg_phase_mullo; + vi16x16_t wg_phase_zero; + vi16x16_t wg_phase_flip; + vi16x16_t wg_phase_mask; + vi16x16_t wg_sine_gate; + vi16x16_t eg_out; + vi16x16_t wg_phase_neg; + vi16x16_t eg_sl; + vi16x16_t og_prout; + vi16x16_t og_prout_ac; + vi16x16_t og_prout_bd; + vi16x16_t og_out_ch_gate_a; + vi16x16_t og_out_ch_gate_c; + vi16x16_t og_out_ch_gate_b; + vi16x16_t og_out_ch_gate_d; + + // Updated infrequently + vi16x16_t pg_vib; + vi16x16_t pg_mult_x2; + + // Updated only by writing registers + vi16x16_t eg_am; + vi16x16_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x16_t 
eg_ksl; + vi16x16_t eg_rate; + vi16x16_t eg_inc; + vi16x16_t wg_fbmod; + vi16x16_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V256 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x16_t pg_fnum; + vi16x16_t pg_block; + + // Updated only by writing registers + vi16x16_t eg_ksv; + vi16x16_t og_ch_gate_a; + vi16x16_t og_ch_gate_b; + vi16x16_t og_ch_gate_c; + vi16x16_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V256 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + uint8_t align32_[sizeof(vi16x16_t) - sizeof(struct aymo_ymf262_chip)]; + + // 256-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x16_t eg_add; + vi16x16_t wg_mod; + vu16x16_t eg_incstep; + vi16x16_t og_acc_a; + vi16x16_t og_acc_c; + vi16x16_t og_acc_b; + vi16x16_t og_acc_d; + + vi16x16_t pg_vib_mulhi; + vi16x16_t pg_vib_neg; + + // 128-bit data + vi16x8_t og_out; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t pad32_[2]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[16]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[16]; + +AYMO_PUBLIC const uint16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint16_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint16_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); 
+AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ + return (sgi / 2); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#endif // _include_aymo_ymf262_x86_avx2_h diff --git a/include/aymo_ymf262_x86_sse41.h b/include/aymo_ymf262_x86_sse41.h new file mode 100644 index 0000000..b9814ee --- /dev/null +++ b/include/aymo_ymf262_x86_sse41.h @@ -0,0 +1,332 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_ymf262_x86_sse41_h +#define _include_aymo_ymf262_x86_sse41_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_ymf262_x86_sse41_##_token_ + + +#define AYMO_YMF262_X86_SSE41_SLOT_NUM_MAX 64 +#define AYMO_YMF262_X86_SSE41_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_X86_SSE41_SLOT_GROUP_NUM 8 +#define AYMO_YMF262_X86_SSE41_SLOT_GROUP_LENGTH 8 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_mullo; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_X86_SSE41_REG_QUEUE_LENGTH +#define AYMO_YMF262_X86_SSE41_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_X86_SSE41_REG_QUEUE_LATENCY +#define AYMO_YMF262_X86_SSE41_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_X86_SSE41_EG_GEN_ATTACK 0 +#define AYMO_YMF262_X86_SSE41_EG_GEN_DECAY 1 +#define AYMO_YMF262_X86_SSE41_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_X86_SSE41_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_ATTACK (1 << 0) +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_DECAY (1 << 4) +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_SUSTAIN (1 << 8) +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_RELEASE (1 << 12) +#define AYMO_YMF262_X86_SSE41_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_X86_SSE41_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_X86_SSE41_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x8_t eg_rout; + vi16x8_t eg_tremolo_am; + vi16x8_t eg_ksl_sh_tl_x4; + vi32x4_t pg_phase_lo; + vi32x4_t pg_phase_hi; + vi16x8_t pg_phase_out; + vi16x8_t eg_gen; + vi16x8_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x8_t eg_gen_mullo; // depends on reg_type for reg_sr + vi16x8_t eg_adsr; // struct aymo_(eg_adsr) + vi16x8_t eg_ks; + vi32x4_t pg_deltafreq_lo; + vi32x4_t pg_deltafreq_hi; + vi16x8_t wg_out; + vi16x8_t wg_prout; + vi16x8_t wg_fb_mulhi; + vi16x8_t wg_prmod_gate; + vi16x8_t wg_fbmod_gate; + vi16x8_t wg_phase_mullo; + vi16x8_t wg_phase_zero; + vi16x8_t wg_phase_flip; + vi16x8_t wg_phase_mask; + vi16x8_t wg_sine_gate; + vi16x8_t eg_out; + vi16x8_t wg_phase_neg; + vi16x8_t eg_sl; + vi16x8_t og_prout; + vi16x8_t og_prout_ac; + vi16x8_t og_prout_bd; + vi16x8_t og_out_ch_gate_a; + vi16x8_t og_out_ch_gate_c; + vi16x8_t og_out_ch_gate_b; + vi16x8_t og_out_ch_gate_d; + + // Updated infrequently + vi16x8_t pg_vib; + vi16x8_t pg_mult_x2; + + // Updated only by writing registers + vi16x8_t eg_am; + vi16x8_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x8_t eg_ksl; + 
vi16x8_t eg_rate; + vi16x8_t eg_inc; + vi16x8_t wg_fbmod; + vi16x8_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x8_t pg_fnum; + vi16x8_t pg_block; + + // Updated only by writing registers + vi16x8_t eg_ksv; + + vi16x8_t og_ch_gate_a; + vi16x8_t og_ch_gate_b; + vi16x8_t og_ch_gate_c; + vi16x8_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + + // 128-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x8_t eg_add; + vi16x8_t wg_mod; + vu16x8_t eg_incstep; + vi16x8_t og_acc_a; + vi16x8_t og_acc_c; + vi16x8_t og_acc_b; + vi16x8_t og_acc_d; + vi16x8_t og_out; + + vi16x8_t pg_vib_mulhi; + vi16x8_t pg_vib_neg; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t process_all_slots; + uint8_t pad32_[1]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[8]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[8]; + +AYMO_PUBLIC const uint16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); 
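Taken together, the entry points declared in this header form the public surface of the SSE4.1 core. A minimal usage sketch follows (illustrative only, not taken from the sources); the two-samples-per-frame sizing of the i16x2 output buffer and the use of register 0x105 to raise the OPL3 "NEW" bit are assumptions of this example:

    #include "aymo_ymf262_x86_sse41.h"

    static struct aymo_ymf262_x86_sse41_chip example_chip;
    static int16_t example_frames[2 * 128];   /* assumed: 2 interleaved samples per frame */

    void example_render(void)
    {
        aymo_ymf262_x86_sse41_ctor(&example_chip);
        aymo_ymf262_x86_sse41_write(&example_chip, 0x0105u, 0x01u);  /* assumed: OPL3 "NEW" bit */
        aymo_ymf262_x86_sse41_generate_i16x2(&example_chip, 128u, example_frames);
        aymo_ymf262_x86_sse41_dtor(&example_chip);
    }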
+AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ +// return (((sgi / 4) * 2) | (sgi % 2)); + return (((sgi >> 1) & 2) | (sgi & 1)); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_ymf262_x86_sse41_h diff --git a/include/meson.build b/include/meson.build new file mode 100644 index 0000000..a094049 --- /dev/null +++ b/include/meson.build @@ -0,0 +1,11 @@ + +aymo_headers = [ # TODO + 'aymo.h', + 'aymo_score.h', + 'aymo_score_avd.h', + 'aymo_score_dro.h', + 'aymo_score_imf.h', + 'aymo_ymf262.h', +] + +install_headers(aymo_headers, subdir: 'aymo') diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..b5d6244 --- /dev/null +++ b/meson.build @@ -0,0 +1,688 @@ +# This Meson build script is a heavily modified version of the +# "/meson.build" file of the OPUS codec project, adapted to +# suit AYMO. +# That script is used as a template for AYMO because it has a +# known and tested support for SIMD auto-detection within Meson. +# This way, any changes by the OPUS project can be applied to AYMO. +# +# OPUS project reference links: +# https://opus-codec.org/ +# https://github.com/xiph/opus/ +# +# Reference file snapshot: +# https://github.com/xiph/opus/blob/20c032d27c59d65b19b8ffbb2608e5282fe817eb/meson.build +# +# OPUS license disclaimer: +# --- BEGIN OPUS LICENSE --- +# +# Copyright 2001-2011 Xiph.Org, Skype Limited, Octasic, +# Jean-Marc Valin, Timothy B. Terriberry, +# CSIRO, Gregory Maxwell, Mark Borgerding, +# Erik de Castro Lopo +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# - Neither the name of Internet Society, IETF or IETF Trust, nor the +# names of specific contributors, may be used to endorse or promote +# products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# --- END OPUS LICENSE --- + +# ===================================================================== + +project('aymo', 'c', + version: '0.0.1', + meson_version: '>=0.60.0', + default_options: [ + 'warning_level=2', + 'c_std=gnu99', + 'buildtype=debugoptimized' + ], +) + +libversion = '0.0.1' + +cc = meson.get_compiler('c') +host_system = host_machine.system() +host_cpu_family = host_machine.cpu_family() +top_srcdir = meson.current_source_dir() # for opt_docs +top_builddir = meson.current_build_dir() # for opt_docs + +aymo_includes = include_directories('.', 'include') +aymo_public_includes = include_directories('include') + +# ===================================================================== + +add_project_arguments('-DAYMO_BUILD', language: 'c') +add_project_arguments('-DAYMO_HAVE_CONFIG_H', language: 'c') + +if host_system == 'windows' + if cc.get_argument_syntax() == 'msvc' + add_project_arguments('-D_CRT_SECURE_NO_WARNINGS', language: 'c') + endif +endif + +if cc.get_argument_syntax() == 'gcc' + #add_project_arguments('-D_FORTIFY_SOURCE=2', language: 'c') +endif + +# Check for extra compiler args +additional_c_args = [] +if cc.get_argument_syntax() == 'msvc' + additional_c_args += [ + ] +else # msvc + additional_c_args += [ + '-fvisibility=hidden', + '-Wcast-align', + '-Wnested-externs', + '-Wshadow', + '-Wstrict-prototypes', + ] + + # On Windows, -fstack-protector-strong adds a libssp-0.dll dependency and + # prevents static linking + if host_system != 'windows' + #additional_c_args += ['-fstack-protector-strong'] + endif +endif # msvc + +foreach arg : additional_c_args + if cc.has_argument(arg) + add_project_arguments(arg, language: 'c') + endif +endforeach + +# Windows MSVC warnings +if cc.get_id() == 'msvc' + # Ignore several spurious warnings. 
+ # If a warning is completely useless and spammy, use '/wdXXXX' to suppress it + # If a warning is harmless but hard to fix, use '/woXXXX' so it's shown once + # NOTE: Only add warnings here if you are sure they're spurious +# add_project_arguments('/wd4035', '/wd4715', '/wd4116', '/wd4046', '/wd4068', +# '/wd4820', '/wd4244', '/wd4255', '/wd4668', +# language : 'c') +endif + +# ===================================================================== + +aymo_version = meson.project_version() +aymo_url = 'https://github.com/TexZK/aymo/' + +aymo_conf = configuration_data() +aymo_conf.set('PACKAGE_BUGREPORT', '"texzk@email.it"') +aymo_conf.set('PACKAGE_NAME', '"aymo"') +aymo_conf.set('PACKAGE_STRING', '"aymo @0@"'.format(aymo_version)) +aymo_conf.set('PACKAGE_TARNAME', '"aymo"') +aymo_conf.set('PACKAGE_URL', '"@0@"'.format(aymo_url)) +aymo_conf.set('PACKAGE_VERSION', '"@0@"'.format(aymo_version)) + +aymo_conf.set('AYMO_CC_HOST_@0@'.format(host_system.underscorify().to_upper()), 1) +aymo_conf.set('AYMO_CC_ID_@0@'.format(cc.get_id().underscorify().to_upper()), 1) +aymo_conf.set('AYMO_CC_SYNTAX_@0@'.format(cc.get_argument_syntax().underscorify().to_upper()), 1) + +if cc.check_header('stdint.h') + aymo_conf.set('AYMO_CC_HAVE_STDINT_H', 1) +endif + +opt_apps = get_option('apps') +opt_asm = get_option('asm') +opt_docs = get_option('docs') +opt_rtcd = get_option('rtcd') +opt_tests = get_option('tests') + +if get_option('buildtype').startswith('debug') + add_project_arguments('-DDEBUG', language : 'c') + add_project_arguments('-D_DEBUG', language : 'c') + add_project_arguments('-DAYMO_DEBUG', language : 'c') +else + add_project_arguments('-DNDEBUG', language : 'c') +endif + +# ===================================================================== + +aymo_conf.set('AYMO_CPU_FAMILY_@0@'.format(host_cpu_family.underscorify().to_upper()), 1) + +# With GCC, Clang, ICC, etc, we differentiate between +# 'runtime support for this SIMD' and 'presume we have this SIMD', +# by checking whether the SIMD / intrinsics can be compiled by the compiler +# as-is ('presume') or with SIMD cflags ('support'). +# +# With MSVC, the compiler will always build SIMD/intrinsics targeting all +# specific instruction sets supported by that version of the compiler. +# No special arguments are ever needed. +# +# If runtime CPU detection is not disabled, we must always assume that +# we only have runtime 'support' for it. + +aymo_can_presume_simd = true +if cc.get_argument_syntax() == 'msvc' + if opt_rtcd.disabled() + warning('Building with an MSVC-like compiler and runtime CPU detection is disabled. Outputs may not run on all @0@ CPUs.'.format(host_cpu_family)) + else + aymo_can_presume_simd = false + endif +endif + +# TODO: NEON has 'hardfp' vs 'softfp' compiler configuration issues. +# When targeting 'AArch32 softfp', we sometimes need to explicitly pass +# '-mfloat-abi=softfp' to enable NEON (e.g. on Android). +# It should be set in the cross file. 
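On the C side, the "support" / "presume" distinction established here typically becomes a pair of preprocessor gates around each accelerated path. A minimal sketch, assuming only the AYMO_CPU_SUPPORT_* / AYMO_CPU_PRESUME_* macros generated by this script and the aymo_cpu_x86_get_extensions() helper used later in src/aymo_convert.c (the wrapper function name below is made up):

    #include "aymo_cpu.h"

    /* Whether the AVX2 code path may be taken at run time. */
    static int example_can_use_avx2(void)
    {
    #if !defined(AYMO_CPU_SUPPORT_X86_AVX2)
        return 0;   /* the compiler cannot build the AVX2 objects at all */
    #elif defined(AYMO_CPU_PRESUME_X86_AVX2)
        return 1;   /* AVX2 is part of the build baseline; no run-time test needed */
    #else
        return ((aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) != 0);
    #endif
    }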
+arm_neon_link_args = [] +if cc.get_argument_syntax() != 'msvc' + arm_neon_link_args += ['-mfpu=neon'] +endif + +aymo_have_none = true # always +aymo_have_x86_sse = false +aymo_have_x86_sse2 = false +aymo_have_x86_sse41 = false +aymo_have_x86_avx = false +aymo_have_x86_avx2 = false + +aymo_have_arm_neon = false + +rtcd_support_names = [] +intrin_support_names = [] + +if host_cpu_family in ['arm', 'aarch64'] + # Check for ARMv7/AArch64 neon intrinsics + intrin_check_code = ''' + #include + int main(void) { + static float32x4_t A0, A1, SUMM; + SUMM = vmlaq_f32(SUMM, A0, A1); + return (int)vgetq_lane_f32(SUMM, 0); + } + ''' + intrin_name = 'ARMv7/AArch64 NEON' + if cc.links(intrin_check_code, + name: 'compiler supports @0@ intrinsics'.format(intrin_name)) + aymo_arm_presume_neon = aymo_can_presume_simd + aymo_arm_support_neon = true + else + aymo_arm_presume_neon = false + if cc.links(intrin_check_code, + args: arm_neon_link_args, + name: 'compiler supports @0@ intrinsics with @1@' + .format(intrin_name, ' '.join(arm_neon_link_args))) + aymo_arm_support_neon = true + else + aymo_arm_support_neon = false + endif + endif + + if aymo_arm_support_neon + aymo_have_arm_neon = true + intrin_support_names += [intrin_name] + aymo_conf.set('AYMO_CPU_SUPPORT_ARM_NEON', 1) + if aymo_arm_presume_neon + aymo_conf.set('AYMO_CPU_PRESUME_ARM_NEON', 1) + else + rtcd_support_names += [intrin_name] + aymo_arm_neon_args = arm_neon_link_args + endif + else + message('Compiler does not support @0@ intrinsics'.format(intrin_name)) + endif + + # Check for aarch64 neon intrinsics + intrin_check_code = ''' + #include + int main(void) { + static int32_t x; + static int16_t y; + y = vqmovns_s32(x); + } + ''' + intrin_name = 'AArch64 NEON' + if cc.links(intrin_check_code, + name: 'compiler supports @0@ intrinsics'.format(intrin_name)) + aymo_arm_presume_aarch64 = aymo_can_presume_simd + aymo_arm_support_aarch64 = true + else + aymo_arm_presume_aarch64 = false + if cc.links(intrin_check_code, + args: arm_neon_link_args, + name: 'compiler supports @0@ intrinsics with @1@' + .format(intrin_name, ' '.join(arm_neon_link_args))) + aymo_arm_support_aarch64 = true + else + aymo_arm_support_aarch64 = false + endif + endif + + if aymo_arm_support_aarch64 + intrin_support_names += [intrin_name] + aymo_conf.set('AYMO_CPU_SUPPORT_ARM_AARCH64', 1) + if aymo_arm_presume_aarch64 + aymo_conf.set('AYMO_CPU_PRESUME_ARM_AARCH64', 1) + endif + else + message('Compiler does not support @0@ intrinsics'.format(intrin_name)) + endif + +elif host_cpu_family in ['x86', 'x86_64'] + # allow external override/specification of the flags + x86_intrinsics = [ + [ 'x86_sse', 'SSE', 'xmmintrin.h', '__m128', '_mm_setzero_ps()', [['-msse'], ['/arch:SSE']] ], + [ 'x86_sse2', 'SSE2', 'emmintrin.h', '__m128i', '_mm_setzero_si128()', [['-msse2'], ['/arch:SSE2']] ], + [ 'x86_sse41', 'SSE4.1', 'smmintrin.h', '__m128i', '_mm_setzero_si128(); x = _mm_cmpeq_epi64(x, x)', [['-msse4.1'], ['/arch:SSE2']] ], + [ 'x86_avx', 'AVX', 'immintrin.h', '__m256', '_mm256_setzero_ps()', [['-mavx'], ['/arch:AVX']] ], + [ 'x86_avx2', 'AVX2', 'immintrin.h', '__m256i', '_mm256_setzero_si256(); x = _mm256_cmpeq_epi16(x, x)', [['-mavx2'], ['/arch:AVX2']] ], + ] + foreach intrin : x86_intrinsics + intrin_check_code = ''' + #include <@0@> + int main(void) { + @1@ x; + x = @2@; + return 0; + } + '''.format(intrin[2], intrin[3], intrin[4]) + intrin_name = intrin[1] + # Intrinsics arguments are not available with MSVC-like compilers + intrin_args = ((cc.get_argument_syntax() == 'msvc') ? 
intrin[5][1] : intrin[5][0]) + if cc.links(intrin_check_code, + name: 'compiler supports @0@ intrinsics'.format(intrin_name)) + support_intrin = true + presume_intrin = aymo_can_presume_simd + elif intrin_args.length() > 0 + presume_intrin = false + support_intrin = false + if cc.links(intrin_check_code, + args: intrin_args, + name: 'compiler supports @0@ intrinsics with @1@' + .format(intrin_name, ' '.join(intrin_args))) + support_intrin = true + endif + endif # intrin_check_code + if support_intrin + intrin_support_names += [intrin_name] + intrin_lower_name = intrin[0] + set_variable('aymo_have_@0@'.format(intrin_lower_name), true) + intrin_upper_name = intrin_lower_name.to_upper() + aymo_conf.set('AYMO_CPU_SUPPORT_@0@'.format(intrin_upper_name), 1) + if presume_intrin + aymo_conf.set('AYMO_CPU_PRESUME_@0@'.format(intrin_upper_name), 1) + else + rtcd_support_names += [intrin_name] + set_variable('aymo_@0@_args'.format(intrin_lower_name), intrin_args) + endif + else + message('Compiler does not support @0@ intrinsics'.format(intrin_name)) + endif # support_intrin + endforeach # intrin + + if not opt_rtcd.disabled() + cpuid_h__cpuid_code = ''' + #include + int main(void) { + unsigned e1[4] = { 0u, 0u, 0u, 0u }; + __cpuid(1u, e1[0], e1[1], e1[2], e1[3]); + return 0; + } + ''' + cpuid_h__cpuid_count_code = ''' + #include + int main(void) { + unsigned e7[4] = { 0u, 0u, 0u, 0u }; + __cpuid_count(7u, 0u, e7[0], e7[1], e7[2], e7[3]); + return 0; + } + ''' + intrin_h__cpuid_code = ''' + #include + int main(void) { + int e1[4] = { 0, 0, 0, 0 }; + __cpuid(e1, 1); + return 0; + } + ''' + intrin_h__cpuidex_code = ''' + #include + int main(void) { + int e7[4] = { 0, 0, 0, 0 }; + __cpuidex(e7, 7, 0); + return 0; + } + ''' + have_cpuinfo = false + if cc.links(cpuid_h__cpuid_code, name: ' __cpuid()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID', 1) + have_cpuinfo = true + endif + if cc.links(cpuid_h__cpuid_count_code, name: ' __cpuid_count()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID_COUNT', 1) + have_cpuinfo = true + endif + if cc.links(intrin_h__cpuid_code, name: ' __cpuid()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUID', 1) + have_cpuinfo = true + endif + if cc.links(intrin_h__cpuidex_code, name: ' __cpuidex()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUIDEX', 1) + have_cpuinfo = true + endif + if have_cpuinfo + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO', 1) + else + if opt_rtcd.enabled() + error('rtcd option is enabled, but no Get CPU Info method detected') + endif + warning('Get CPU Info method not detected, no rtcd for intrinsics') + endif + endif # opt_rtcd + + aymo_conf.set('AYMO_CPU_X86_AVX2_GATHER16_STRATEGY', 2) # TODO: option() + +else # host_cpu_family + warning('No intrinsics support for @0@'.format(host_cpu_family)) +endif # host_cpu_family + +# Check whether we require intrinsics and we support intrinsics on this cpu, +# but none were detected. Can happen because of incorrect compiler flags, such +# as missing -mfloat-abi=softfp on ARM32 softfp cpuitectures. 
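The __cpuid() / __cpuid_count() link checks above only record which CPU-info interface the compiler provides; the feature probe itself lives in src/aymo_cpu_x86.c, which is not part of this excerpt. As a rough sketch of such a probe with GCC/Clang's <cpuid.h>, using the architectural bit positions (leaf 1 ECX bit 19 for SSE4.1, leaf 7 subleaf 0 EBX bit 5 for AVX2); the returned flag values are illustrative and not AYMO's actual AYMO_CPU_X86_EXT_* constants:

    #include <cpuid.h>

    static unsigned example_probe_x86_extensions(void)
    {
        unsigned a, b, c, d;
        unsigned exts = 0u;
        __cpuid(1u, a, b, c, d);            /* standard feature flags */
        if (c & (1u << 19)) {
            exts |= 1u;                     /* SSE4.1 */
        }
        __cpuid_count(7u, 0u, a, b, c, d);  /* extended feature flags */
        if (b & (1u << 5)) {
            exts |= 2u;                     /* AVX2; a robust probe would also check the
                                               maximum leaf and OS AVX state via XGETBV */
        }
        return exts;
    }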
+if intrin_support_names.length() == 0 + warning('"intrinsics" option was enabled, but none were detected') +endif + +if opt_rtcd.disabled() + rtcd_support_names = 'disabled' +else + if rtcd_support_names.length() > 0 + aymo_conf.set('AYMO_CPU_HAVE_RTCD', 1) + else + if intrin_support_names.length() == 0 + rtcd_support_names = 'none' + if opt_rtcd.enabled() + warning('"rtcd" option is enabled, but no support for intrinsics is available') + endif + else + rtcd_support_names = 'not needed' + endif + endif +endif # opt_rtcd + +# ===================================================================== + +sources = { + 'AYMO_HEADERS': files( + 'include/aymo.h', + 'include/aymo_cc.h', + 'include/aymo_cpu.h', + 'include/aymo_convert.h', + 'include/aymo_convert_arm_neon.h', + 'include/aymo_convert_none.h', + 'include/aymo_convert_x86_avx2.h', + 'include/aymo_convert_x86_sse41.h', + 'include/aymo_score.h', + 'include/aymo_score_avd.h', + 'include/aymo_score_dro.h', + 'include/aymo_score_imf.h', + 'include/aymo_score_raw.h', + 'include/aymo_wave.h', + 'include/aymo_ymf262_arm_neon.h', + 'include/aymo_ymf262_none.h', + 'include/aymo_ymf262_x86_avx.h', + 'include/aymo_ymf262_x86_avx2.h', + 'include/aymo_ymf262_x86_sse41.h', + ), + + 'AYMO_SOURCES': files( + 'src/aymo.c', + 'src/aymo_convert.c', + 'src/aymo_convert_none.c', + 'src/aymo_cpu.c', + 'src/aymo_score.c', + 'src/aymo_score_avd.c', + 'src/aymo_score_dro.c', + 'src/aymo_score_imf.c', + 'src/aymo_score_raw.c', + 'src/aymo_tda8425.c', + 'src/aymo_tda8425_common.c', + 'src/aymo_tda8425_none.c', + 'src/aymo_wave.c', + 'src/aymo_ym7128.c', + 'src/aymo_ym7128_common.c', + 'src/aymo_ym7128_none.c', + 'src/aymo_ymf262.c', + 'src/aymo_ymf262_common.c', + 'src/aymo_ymf262_none.c', + ), + + 'AYMO_SOURCES_X86': files ( + 'src/aymo_cpu_x86.c', + ), + + 'AYMO_SOURCES_X86_SSE41': files ( + 'src/aymo_convert_x86_sse41.c', + 'src/aymo_tda8425_x86_sse41.c', + 'src/aymo_ym7128_x86_sse41.c', + 'src/aymo_ymf262_x86_sse41.c', + ), + + 'AYMO_SOURCES_X86_AVX': files ( + 'src/aymo_ymf262_x86_avx.c', + ), + + 'AYMO_SOURCES_X86_AVX2': files ( + 'src/aymo_convert_x86_avx2.c', + 'src/aymo_tda8425_x86_avx2.c', + 'src/aymo_ymf262_x86_avx2.c', + ), + + 'AYMO_SOURCES_ARM': files ( + 'src/aymo_cpu_arm.c', + ), + + 'AYMO_SOURCES_ARM_NEON': files ( + 'src/aymo_convert_arm_neon.c', + 'src/aymo_tda8425_arm_neon.c', + 'src/aymo_ym7128_arm_neon.c', + 'src/aymo_ymf262_arm_neon.c', + ), + + 'AYMO_SOURCES_LIBC': files ( + 'src/aymo_file.c', + ), + + 'AYMO_SOURCES_AYMO': files ( + 'src/aymo_empty.c', + ), +} + +# ===================================================================== + +libm = cc.find_library('m', required: false) + +aymo_c_args = [] + +# Assembly code listings +if cc.get_argument_syntax() == 'msvc' + if not opt_asm.disabled() + aymo_c_args += ['/FAcs'] + endif +else + if opt_asm.enabled() + aymo_c_args += ['-S', '-fverbose-asm', '-masm=intel', '-Wa,-adhln'] + endif +endif + +subdir('contrib') + +aymo_sources = sources['AYMO_SOURCES'] +aymo_x86_sse41_sources = sources['AYMO_SOURCES_X86_SSE41'] +aymo_x86_avx_sources = sources['AYMO_SOURCES_X86_AVX'] +aymo_x86_avx2_sources = sources['AYMO_SOURCES_X86_AVX2'] +aymo_arm_neon_sources = sources['AYMO_SOURCES_ARM_NEON'] + +aymo_static_libs = [] + +foreach intr_name : ['x86_sse41', 'x86_avx', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + intr_sources = get_variable('aymo_@0@_sources'.format(intr_name)) + intr_args = get_variable('aymo_@0@_args'.format(intr_name), []) + + 
aymo_static_libs += static_library( + 'aymo-static_@0@'.format(intr_name), + intr_sources, + c_args: aymo_c_args + intr_args, + include_directories: [aymo_includes, aymo_contrib_includes], + link_with: aymo_contrib_lib, + install: false, + ) + endif +endforeach + + +if host_cpu_family in ['x86', 'x86_64'] + aymo_sources += sources['AYMO_SOURCES_X86'] +endif +if host_cpu_family in ['arm', 'aarch64'] + aymo_sources += sources['AYMO_SOURCES_ARM'] +endif + + +if host_system in ['windows', 'cygwin'] + aymo_sources += 'src/aymo_sys_windows.c' +elif host_system in ['linux'] + aymo_sources += 'src/aymo_sys_linux.c' +endif + + +aymo_static_lib = static_library( + 'aymo-static', + aymo_sources, + c_args: aymo_c_args, + include_directories: aymo_includes, + link_with: aymo_static_libs, + dependencies: aymo_contrib_dep, + install: false, +) + +aymo_static_dep = declare_dependency( + include_directories: aymo_includes, + link_with: aymo_static_lib, + dependencies: aymo_contrib_dep, +) + + +aymo_target_lib = library( + 'aymo', + sources['AYMO_SOURCES_AYMO'], + version: libversion, + # darwin_versions: macosversion, # TODO: + link_with: aymo_static_lib, + install: true, +) + +aymo_target_dep = declare_dependency( + include_directories: aymo_includes, + link_with: aymo_target_lib, +) + + +aymo_libc_lib = static_library( + 'aymo-libc', + sources['AYMO_SOURCES_LIBC'], + c_args: aymo_c_args, + include_directories: aymo_includes, + install: false, +) + +aymo_libc_dep = declare_dependency( + include_directories: aymo_includes, + link_with: aymo_libc_lib, +) + +# ===================================================================== + +# pkg-config files (not using pkg module so we can use the existing .pc.in file) +pkgconf = configuration_data() + +pkgconf.set('prefix', join_paths(get_option('prefix'))) +pkgconf.set('exec_prefix', '${prefix}') +pkgconf.set('libdir', '${prefix}/@0@'.format(get_option('libdir'))) +pkgconf.set('includedir', '${prefix}/@0@'.format(get_option('includedir'))) +pkgconf.set('VERSION', aymo_version) +pkgconf.set('URL', aymo_url) + +pkg_install_dir = '@0@/pkgconfig'.format(get_option('libdir')) + +configure_file( + input: 'aymo.pc.in', + output: 'aymo.pc', + configuration: pkgconf, + install_dir: pkg_install_dir +) + +# ===================================================================== + +configure_file( + output: 'aymo_config.h', + configuration: aymo_conf, +# macro_name: 'INCLUDE_AYMO_CONFIG_H', +) + +subdir('include') + +subdir('apps') + +if not opt_tests.disabled() + subdir('tests') +endif + +# ===================================================================== + +# TODO: Doxygen +#doxygen = find_program('doxygen', required: get_option('docs')) +#if doxygen.found() +# subdir('doc') +#endif + +# ===================================================================== + +summary( + { + 'Run-time CPU detection': rtcd_support_names, + 'Generate Assembly Files': opt_asm.enabled(), + }, + section: 'Compilation', + bool_yn: true, + list_sep: ', ', +) + +summary( + { +# 'API documentation': doxygen.found(), # TODO: Docygen + 'Apps': not opt_apps.disabled(), + 'Tests': not opt_tests.disabled(), + }, + section: 'Components', + bool_yn: true, + list_sep: ', ', +) diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000..8c87e79 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1,8 @@ +# Compilation +option('asm', type : 'feature', value : 'auto', description : 'Generate Assembly Files') +option('rtcd', type : 'feature', value : 'auto', description : 'Run-Time CPU 
Detection') + +# Components +option('apps', type : 'feature', value : 'auto', description : 'Build Applications') +option('docs', type: 'feature', value: 'auto', description: 'Build Documentation') +option('tests', type : 'feature', value : 'auto', description : 'Build Tests') diff --git a/msvc-arm.txt b/msvc-arm.txt new file mode 100644 index 0000000..cedb2df --- /dev/null +++ b/msvc-arm.txt @@ -0,0 +1,14 @@ +[binaries] +c = 'cl' +cpp = 'cl' +ar = 'lib' +windres = 'rc' + +[built-in options] +c_std = 'c99' + +[host_machine] +system = 'windows' +cpu_family = 'arm' +cpu = 'armv7' +endian = 'little' diff --git a/msvc-arm_env.bat b/msvc-arm_env.bat new file mode 100644 index 0000000..66e46b2 --- /dev/null +++ b/msvc-arm_env.bat @@ -0,0 +1,4 @@ +rem Run this script to set up the environment for the MSVC ARM compiler +"D:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsamd64_arm.bat" +cd "D:\Documenti\GitHub\aymo" +meson setup vs --backend vs --cross-file msvc-arm.txt diff --git a/src/aymo.c b/src/aymo.c new file mode 100644 index 0000000..d6c7c60 --- /dev/null +++ b/src/aymo.c @@ -0,0 +1,35 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ + +#include "aymo.h" +#include "aymo_convert.h" +#include "aymo_cpu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +void aymo_boot(void) +{ + aymo_cpu_boot(); + aymo_convert_boot(); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_convert.c b/src/aymo_convert.c new file mode 100644 index 0000000..0cecf45 --- /dev/null +++ b/src/aymo_convert.c @@ -0,0 +1,206 @@ +// Run-time dispatchers for the AYMO data conversion functions. +// The target-specific implementations are selected by aymo_convert_boot(). +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>.
+*/ + +#include "aymo_convert.h" +#include "aymo_convert_arm_neon.h" +#include "aymo_convert_none.h" +#include "aymo_convert_x86_avx2.h" +#include "aymo_convert_x86_sse41.h" +#include "aymo_cpu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// Dispatcher function types +typedef void (*aymo_convert_i16_f32_f)(size_t n, const int16_t i16v[], float f32v[]); +typedef void (*aymo_convert_f32_i16_f)(size_t n, const float f32v[], int16_t i16v[]); +typedef void (*aymo_convert_i16_f32_1_f)(size_t n, const int16_t i16v[], float f32v[]); +typedef void (*aymo_convert_f32_i16_1_f)(size_t n, const float f32v[], int16_t i16v[]); +typedef void (*aymo_convert_i16_f32_k_f)(size_t n, const int16_t i16v[], float f32v[], float scale); +typedef void (*aymo_convert_f32_i16_k_f)(size_t n, const float f32v[], int16_t i16v[], float scale); +typedef void (*aymo_convert_u16_f32_f)(size_t n, const uint16_t u16v[], float f32v[]); +typedef void (*aymo_convert_f32_u16_f)(size_t n, const float f32v[], uint16_t u16v[]); +typedef void (*aymo_convert_u16_f32_1_f)(size_t n, const uint16_t u16v[], float f32v[]); +typedef void (*aymo_convert_f32_u16_1_f)(size_t n, const float f32v[], uint16_t u16v[]); +typedef void (*aymo_convert_u16_f32_k_f)(size_t n, const uint16_t u16v[], float f32v[], float scale); +typedef void (*aymo_convert_f32_u16_k_f)(size_t n, const float f32v[], uint16_t u16v[], float scale); + +// Dispatcher function pointers +static aymo_convert_i16_f32_f aymo_convert_i16_f32_p; +static aymo_convert_f32_i16_f aymo_convert_f32_i16_p; +static aymo_convert_i16_f32_1_f aymo_convert_i16_f32_1_p; +static aymo_convert_f32_i16_1_f aymo_convert_f32_i16_1_p; +static aymo_convert_i16_f32_k_f aymo_convert_i16_f32_k_p; +static aymo_convert_f32_i16_k_f aymo_convert_f32_i16_k_p; +static aymo_convert_u16_f32_f aymo_convert_u16_f32_p; +static aymo_convert_f32_u16_f aymo_convert_f32_u16_p; +static aymo_convert_u16_f32_1_f aymo_convert_u16_f32_1_p; +static aymo_convert_f32_u16_1_f aymo_convert_f32_u16_1_p; +static aymo_convert_u16_f32_k_f aymo_convert_u16_f32_k_p; +static aymo_convert_f32_u16_k_f aymo_convert_f32_u16_k_p; + + +void aymo_convert_boot(void) +{ +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + aymo_convert_i16_f32_p = aymo_convert_x86_avx2_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_x86_avx2_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_x86_avx2_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_x86_avx2_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_x86_avx2_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_x86_avx2_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_x86_avx2_u16_f32; + aymo_convert_f32_u16_p = aymo_convert_x86_avx2_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_x86_avx2_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_x86_avx2_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_x86_avx2_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_x86_avx2_f32_u16_k; + return; + } +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_convert_i16_f32_p = aymo_convert_x86_sse41_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_x86_sse41_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_x86_sse41_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_x86_sse41_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_x86_sse41_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_x86_sse41_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_x86_sse41_u16_f32; + 
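    /* Note: these dispatcher pointers have static storage duration, so they start
       out NULL; aymo_convert_boot(), normally reached through aymo_boot(), must
       therefore run before any of the public aymo_convert_* wrappers below are
       called. A minimal call sequence (illustrative; buffer names are made up):
           aymo_boot();
           aymo_convert_i16_f32(frame_count, pcm_i16, pcm_f32);
    */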
aymo_convert_f32_u16_p = aymo_convert_x86_sse41_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_x86_sse41_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_x86_sse41_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_x86_sse41_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_x86_sse41_f32_u16_k; + return; + } +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#if 0//def AYMO_CPU_SUPPORT_ARM_NEON //FIXME: TODO: + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_convert_i16_f32_p = aymo_convert_arm_neon_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_arm_neon_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_arm_neon_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_arm_neon_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_arm_neon_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_arm_neon_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_arm_neon_u16_f32; + aymo_convert_f32_u16_p = aymo_convert_arm_neon_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_arm_neon_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_arm_neon_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_arm_neon_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_arm_neon_f32_u16_k; + return; + } +#endif // AYMO_CPU_SUPPORT_ARM_NEON + + // Default dispatcher functions + aymo_convert_i16_f32_p = aymo_convert_none_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_none_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_none_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_none_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_none_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_none_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_none_u16_f32; + aymo_convert_f32_u16_p = aymo_convert_none_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_none_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_none_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_none_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_none_f32_u16_k; +} + + +void aymo_convert_i16_f32(size_t n, const int16_t i16v[], float f32v[]) +{ + aymo_convert_i16_f32_p(n, i16v, f32v); +} + + +void aymo_convert_f32_i16(size_t n, const float f32v[], int16_t i16v[]) +{ + aymo_convert_f32_i16_p(n, f32v, i16v); +} + + +void aymo_convert_i16_f32_1(size_t n, const int16_t i16v[], float f32v[]) +{ + aymo_convert_i16_f32_1_p(n, i16v, f32v); +} + + +void aymo_convert_f32_i16_1(size_t n, const float f32v[], int16_t i16v[]) +{ + aymo_convert_f32_i16_1_p(n, f32v, i16v); +} + + +void aymo_convert_i16_f32_k(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + aymo_convert_i16_f32_k_p(n, i16v, f32v, scale); +} + + +void aymo_convert_f32_i16_k(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + aymo_convert_f32_i16_k_p(n, f32v, i16v, scale); +} + + +void aymo_convert_u16_f32(size_t n, const uint16_t u16v[], float f32v[]) +{ + aymo_convert_u16_f32_p(n, u16v, f32v); +} + + +void aymo_convert_f32_u16(size_t n, const float f32v[], uint16_t u16v[]) +{ + aymo_convert_f32_u16_p(n, f32v, u16v); +} + + +void aymo_convert_u16_f32_1(size_t n, const uint16_t u16v[], float f32v[]) +{ + aymo_convert_u16_f32_1_p(n, u16v, f32v); +} + + +void aymo_convert_f32_u16_1(size_t n, const float f32v[], uint16_t u16v[]) +{ + aymo_convert_f32_u16_1_p(n, f32v, u16v); +} + + +void aymo_convert_u16_f32_k(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + aymo_convert_u16_f32_k_p(n, u16v, f32v, scale); +} + + +void aymo_convert_f32_u16_k(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + 
aymo_convert_f32_u16_k_p(n, f32v, u16v, scale); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_convert_arm_neon.c b/src/aymo_convert_arm_neon.c new file mode 100644 index 0000000..487588a --- /dev/null +++ b/src/aymo_convert_arm_neon.c @@ -0,0 +1,821 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_arm_neon.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static inline float reinterpret_f32_i32(int32_t i32) +{ + union { float f; int32_t i; } u; + u.i = i32; + return u.f; +} + + +#undef mm_extract_ps +#define mm_extract_ps(a, imm8) \ + (reinterpret_f32_i32(_mm_extract_epi32(_mm_castps_si128(a), (imm8)))) + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + int16x8_t s16 = vld1q_s16(i16v); i16v += 8; + int32x4_t s32lo = vmovl_s16(vget_low_s16(s16)); + int32x4_t s32hi = vmovl_s16(vget_high_s16(s16)); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + float32x4_t f32hi = vcvtq_f32_s32(s32hi); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + int16x4_t s16 = vld1_s16(i16v); i16v += 4; + int32x4_t s32lo = vmovl_s16(s16); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + int32_t i32t[4] = { i16v[0], i16v[1], i16v[2], 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + int32_t i32t[4] = { i16v[0], i16v[1], 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + int32_t i32t[4] = { i16v[0], 0, 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int32x4_t s32hi = vcvtq_s32_f32(f32hi); + int16x4_t s16lo = vqmovn_s32(s32lo); + int16x4_t s16hi = vqmovn_s32(s32hi); + vst1q_s16(i16v, vcombine_s16(s16lo, s16hi)); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo 
= vqmovn_s32(s32lo); + vst1_s16(i16v, s16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + i16v[2] = vget_lane_s16(s16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + int16x8_t s16 = vld1q_s16(i16v); i16v += 8; + int32x4_t s32lo = vmovl_s16(vget_low_s16(s16)); + int32x4_t s32hi = vmovl_s16(vget_high_s16(s16)); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + float32x4_t f32hi = vcvtq_f32_s32(s32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + int16x4_t s16 = vld1_s16(i16v); i16v += 4; + int32x4_t s32lo = vmovl_s16(s16); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + int32_t i32t[4] = { i16v[0], i16v[1], i16v[2], 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + int32_t i32t[4] = { i16v[0], i16v[1], 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + int32_t i32t[4] = { i16v[0], 0, 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int32x4_t s32hi = vcvtq_s32_f32(f32hi); + int16x4_t s16lo = vqmovn_s32(s32lo); + int16x4_t s16hi = vqmovn_s32(s32hi); + vst1q_s16(i16v, vcombine_s16(s16lo, s16hi)); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + vst1_s16(i16v, s16lo); i16v += 4; + n %= 4; + if 
(n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + i16v[2] = vget_lane_s16(s16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + int16x8_t s16 = vld1q_s16(i16v); i16v += 8; + int32x4_t s32lo = vmovl_s16(vget_low_s16(s16)); + int32x4_t s32hi = vmovl_s16(vget_high_s16(s16)); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + float32x4_t f32hi = vcvtq_f32_s32(s32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + int16x4_t s16 = vld1_s16(i16v); i16v += 4; + int32x4_t s32lo = vmovl_s16(s16); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + int32_t i32t[4] = { i16v[0], i16v[1], i16v[2], 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + int32_t i32t[4] = { i16v[0], i16v[1], 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + int32_t i32t[4] = { i16v[0], 0, 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int32x4_t s32hi = vcvtq_s32_f32(f32hi); + int16x4_t s16lo = vqmovn_s32(s32lo); + int16x4_t s16hi = vqmovn_s32(s32hi); + vst1q_s16(i16v, vcombine_s16(s16lo, s16hi)); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + vst1_s16(i16v, s16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } 
+ switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + i16v[2] = vget_lane_s16(s16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + uint16x8_t u16 = vld1q_u16(u16v); u16v += 8; + uint32x4_t u32lo = vmovl_u16(vget_low_u16(u16)); + uint32x4_t u32hi = vmovl_u16(vget_high_u16(u16)); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + float32x4_t f32hi = vcvtq_f32_u32(u32hi); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + uint16x4_t u16 = vld1_u16(u16v); u16v += 4; + uint32x4_t u32lo = vmovl_u16(u16); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + uint32_t u32t[4] = { u16v[0], u16v[1], u16v[2], 0 }; + uint32x4_t u32lo = vld1q_u32(u32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + uint32_t u32t[4] = { u16v[0], u16v[1], 0, 0 }; + uint32x4_t u32lo = vld1q_u32(u32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + uint32_t u32t[4] = { u16v[0], 0, 0, 0 }; + uint32x4_t u32lo = vld1q_u32(u32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint32x4_t u32hi = vcvtq_u32_f32(f32hi); + uint16x4_t u16lo = vqmovn_u32(u32lo); + uint16x4_t u16hi = vqmovn_u32(u32hi); + vst1q_u16(u16v, vcombine_u16(u16lo, u16hi)); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + vst1_u16(u16v, u16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + u16v[2] = vget_lane_u16(u16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = 
vld1q_f32(f32t); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + uint16x8_t u16 = vld1q_u16(u16v); u16v += 8; + uint32x4_t u32lo = vmovl_u16(vget_low_u16(u16)); + uint32x4_t u32hi = vmovl_u16(vget_high_u16(u16)); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + float32x4_t f32hi = vcvtq_f32_u32(u32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + uint16x4_t u16 = vld1_u16(u16v); u16v += 4; + uint32x4_t u32lo = vmovl_u16(u16); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + uint32_t i32t[4] = { u16v[0], u16v[1], u16v[2], 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + uint32_t i32t[4] = { u16v[0], u16v[1], 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + uint32_t i32t[4] = { u16v[0], 0, 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint32x4_t u32hi = vcvtq_u32_f32(f32hi); + uint16x4_t u16lo = vqmovn_u32(u32lo); + uint16x4_t u16hi = vqmovn_u32(u32hi); + vst1q_u16(u16v, vcombine_u16(u16lo, u16hi)); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + vst1_u16(u16v, u16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + u16v[2] = vget_lane_u16(u16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = 
vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + uint16x8_t u16 = vld1q_u16(u16v); u16v += 8; + uint32x4_t u32lo = vmovl_u16(vget_low_u16(u16)); + uint32x4_t u32hi = vmovl_u16(vget_high_u16(u16)); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + float32x4_t f32hi = vcvtq_f32_u32(u32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + uint16x4_t u16 = vld1_u16(u16v); u16v += 4; + uint32x4_t u32lo = vmovl_u16(u16); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + uint32_t i32t[4] = { u16v[0], u16v[1], u16v[2], 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + uint32_t i32t[4] = { u16v[0], u16v[1], 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + uint32_t i32t[4] = { u16v[0], 0, 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint32x4_t u32hi = vcvtq_u32_f32(f32hi); + uint16x4_t u16lo = vqmovn_u32(u32lo); + uint16x4_t u16hi = vqmovn_u32(u32hi); + vst1q_u16(u16v, vcombine_u16(u16lo, u16hi)); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + vst1_u16(u16v, u16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + u16v[2] = vget_lane_u16(u16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = 
vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + break; + } + default: break; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_convert_none.c b/src/aymo_convert_none.c new file mode 100644 index 0000000..da0590f --- /dev/null +++ b/src/aymo_convert_none.c @@ -0,0 +1,177 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_none.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static inline float convert_i16_f32(int16_t i) +{ + return (float)i; +} + + +static inline int16_t convert_f32_i16(float f) +{ + if (f >= (float)INT16_MAX) { + return INT16_MAX; + } + if (f < (float)INT16_MIN) { + return INT16_MIN; + } + return (int16_t)f; +} + + +static inline float convert_u16_f32(uint16_t u) +{ + return (float)u; +} + + +static inline uint16_t convert_f32_u16(float f) +{ + if (f >= (float)UINT16_MAX) { + return UINT16_MAX; + } + if (f < 0.f) { + return 0u; + } + return (uint16_t)f; +} + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + const int16_t* i16e = (i16v + n); + while (i16v != i16e) { + *f32v++ = convert_i16_f32(*i16v++); + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *i16v++ = convert_f32_i16(*f32v++); + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. 
/ 32768.); + const int16_t* i16e = (i16v + n); + while (i16v != i16e) { + *f32v++ = (convert_i16_f32(*i16v++) * scale); + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + const float* f32e = (f32v + n); + while (f32v != f32e) { + *i16v++ = convert_f32_i16(*f32v++ * scale); + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + const int16_t* i16e = (i16v + n); + while (i16v != i16e) { + *f32v++ = (convert_i16_f32(*i16v++) * scale); + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *i16v++ = convert_f32_i16(*f32v++ * scale); + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const uint16_t* u16e = (u16v + n); + while (u16v != u16e) { + *f32v++ = convert_u16_f32(*u16v++); + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *u16v++ = convert_f32_u16(*f32v++); + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + const uint16_t* u16e = (u16v + n); + while (u16v != u16e) { + *f32v++ = (convert_u16_f32(*u16v++) * scale); + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + const float* f32e = (f32v + n); + while (f32v != f32e) { + *u16v++ = convert_f32_u16(*f32v++ * scale); + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + const uint16_t* u16e = (u16v + n); + while (u16v != u16e) { + *f32v++ = (convert_u16_f32(*u16v++) * scale); + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *u16v++ = convert_f32_u16(*f32v++ * scale); + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_convert_x86_avx2.c b/src/aymo_convert_x86_avx2.c new file mode 100644 index 0000000..a3de900 --- /dev/null +++ b/src/aymo_convert_x86_avx2.c @@ -0,0 +1,335 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
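As a usage sketch of the plain-C fallback defined above (the aymo_convert_none_* symbol names are assumed here, by analogy with the aymo_convert_x86_sse41_* names that the AVX2 tails call below; the aymo_() shorthand presumably expands to them):

#include <stddef.h>
#include <stdint.h>
#include "aymo_convert_none.h"

/* Normalize a 16-bit PCM block to floats, process it, then convert back with saturation. */
static void process_block(int16_t pcm[], float work[], size_t n)
{
    aymo_convert_none_i16_f32_1(n, pcm, work);   /* scales each sample by 1/32768 */
    /* ... filter or mix work[] here ... */
    aymo_convert_none_f32_i16_1(n, work, pcm);   /* scales by 32768, clamps to the int16_t range */
}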
+*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_convert_x86_sse41.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_avx2.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epi16 = _mm256_loadu_si256((const void*)i16v); i16v += 16; + __m256i epi32lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 0)); + __m256i epi32hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_i16_f32(n, i16v, f32v); + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epi16 = _mm256_packs_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)i16v, epi16); i16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_i16(n, f32v, i16v); + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epi16 = _mm256_loadu_si256((const void*)i16v); i16v += 16; + __m256i epi32lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 0)); + __m256i epi32hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_i16_f32_1(n, i16v, f32v); + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epi16 = _mm256_packs_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)i16v, epi16); i16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_i16_1(n, f32v, i16v); + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + 
do { + __m256i epi16 = _mm256_loadu_si256((const void*)i16v); i16v += 16; + __m256i epi32lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 0)); + __m256i epi32hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_i16_f32_k(n, i16v, f32v, scale); + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epi16 = _mm256_packs_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)i16v, epi16); i16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_i16_k(n, f32v, i16v, scale); + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epu16 = _mm256_loadu_si256((const void*)u16v); u16v += 16; + __m256i epi32lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 0)); + __m256i epi32hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_u16_f32(n, u16v, f32v); + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epu16 = _mm256_packus_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)u16v, epu16); u16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_u16(n, f32v, u16v); + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. 
/ 32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epu16 = _mm256_loadu_si256((const void*)u16v); u16v += 16; + __m256i epi32lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 0)); + __m256i epi32hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_u16_f32_1(n, u16v, f32v); + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epu16 = _mm256_packus_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)u16v, epu16); u16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_u16_1(n, f32v, u16v); + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epu16 = _mm256_loadu_si256((const void*)u16v); u16v += 16; + __m256i epi32lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 0)); + __m256i epi32hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_u16_f32_k(n, u16v, f32v, scale); + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epu16 = _mm256_packus_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)u16v, epu16); u16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_u16_k(n, f32v, u16v, scale); + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/src/aymo_convert_x86_sse41.c b/src/aymo_convert_x86_sse41.c new file mode 100644 index 0000000..56d2d49 --- /dev/null +++ 
b/src/aymo_convert_x86_sse41.c @@ -0,0 +1,796 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +static inline float reinterpret_f32_i32(int32_t i32) +{ + union { float f; int32_t i; } u; + u.i = i32; + return u.f; +} + + +#undef mm_extract_ps +#define mm_extract_ps(a, imm8) \ + (reinterpret_f32_i32(_mm_extract_epi32(_mm_castps_si128(a), (imm8)))) + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epi16 = _mm_loadu_si128((const void*)i16v); i16v += 8; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16); + epi16 = _mm_shuffle_epi32(epi16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epi32hi = _mm_cvtepi16_epi32(epi16); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + __m128 pshi = _mm_cvtepi32_ps(epi32hi); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epi16lo = _mm_loadl_epi64((const void*)i16v); i16v += 4; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16lo); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], i16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi32hi = _mm_cvtps_epi32(pshi); + __m128i epi16 = _mm_packs_epi32(epi32lo, epi32hi); + _mm_storeu_si128((void*)i16v, epi16); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + _mm_storel_epi64((void*)i16v, epi16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 
3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + i16v[2] = _mm_extract_epi16(epi16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epi16 = _mm_loadu_si128((const void*)i16v); i16v += 8; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16); + epi16 = _mm_shuffle_epi32(epi16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epi32hi = _mm_cvtepi16_epi32(epi16); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + __m128 pshi = _mm_cvtepi32_ps(epi32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epi16lo = _mm_loadl_epi64((const void*)i16v); i16v += 4; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16lo); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], i16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi32hi = _mm_cvtps_epi32(pshi); + __m128i epi16 = _mm_packs_epi32(epi32lo, epi32hi); + _mm_storeu_si128((void*)i16v, epi16); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + _mm_storel_epi64((void*)i16v, epi16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = 
_mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + i16v[2] = _mm_extract_epi16(epi16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epi16 = _mm_loadu_si128((const void*)i16v); i16v += 8; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16); + epi16 = _mm_shuffle_epi32(epi16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epi32hi = _mm_cvtepi16_epi32(epi16); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + __m128 pshi = _mm_cvtepi32_ps(epi32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epi16lo = _mm_loadl_epi64((const void*)i16v); i16v += 4; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16lo); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], i16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi32hi = _mm_cvtps_epi32(pshi); + __m128i epi16 = _mm_packs_epi32(epi32lo, epi32hi); + _mm_storeu_si128((void*)i16v, epi16); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + _mm_storel_epi64((void*)i16v, epi16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + 
__m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + i16v[2] = _mm_extract_epi16(epi16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epu16 = _mm_loadu_si128((const void*)u16v); u16v += 8; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16); + epu16 = _mm_shuffle_epi32(epu16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epu32hi = _mm_cvtepu16_epi32(epu16); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + __m128 pshi = _mm_cvtepi32_ps(epu32hi); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epu16lo = _mm_loadl_epi64((const void*)u16v); u16v += 4; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16lo); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], + (int32_t)(uint32_t)u16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu32hi = _mm_cvtps_epi32(pshi); + __m128i epu16 = _mm_packus_epi32(epu32lo, epu32hi); + _mm_storeu_si128((void*)u16v, epu16); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epu32lo, epu32lo); + _mm_storel_epi64((void*)u16v, epu16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = 
(uint16_t)_mm_extract_epi16(epu16lo, 1); + u16v[2] = (uint16_t)_mm_extract_epi16(epu16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epu16 = _mm_loadu_si128((const void*)u16v); u16v += 8; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16); + epu16 = _mm_shuffle_epi32(epu16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epu32hi = _mm_cvtepu16_epi32(epu16); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + __m128 pshi = _mm_cvtepi32_ps(epu32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epu16lo = _mm_loadl_epi64((const void*)u16v); u16v += 4; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16lo); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], + (int32_t)(uint32_t)u16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu32hi = _mm_cvtps_epi32(pshi); + __m128i epu16 = _mm_packus_epi32(epu32lo, epu32hi); + _mm_storeu_si128((void*)u16v, epu16); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epu32lo, epu32lo); + _mm_storel_epi64((void*)u16v, epu16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + 
pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + u16v[2] = (uint16_t)_mm_extract_epi16(epu16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epu16 = _mm_loadu_si128((const void*)u16v); u16v += 8; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16); + epu16 = _mm_shuffle_epi32(epu16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epu32hi = _mm_cvtepu16_epi32(epu16); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + __m128 pshi = _mm_cvtepi32_ps(epu32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epu16lo = _mm_loadl_epi64((const void*)u16v); u16v += 4; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16lo); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], + (int32_t)(uint32_t)u16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu32hi = _mm_cvtps_epi32(pshi); + __m128i epu16 = _mm_packus_epi32(epu32lo, epu32hi); + _mm_storeu_si128((void*)u16v, epu16); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = 
_mm_packus_epi32(epu32lo, epu32lo); + _mm_storel_epi64((void*)u16v, epu16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + u16v[2] = (uint16_t)_mm_extract_epi16(epu16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + break; + } + default: break; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/src/aymo_cpu.c b/src/aymo_cpu.c new file mode 100644 index 0000000..ffc2b1a --- /dev/null +++ b/src/aymo_cpu.c @@ -0,0 +1,38 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +void aymo_cpu_boot(void) +{ + #if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + aymo_cpu_x86_boot(); + #endif + + #if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + aymo_cpu_arm_boot(); + #endif +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_cpu_arm.c b/src/aymo_cpu_arm.c new file mode 100644 index 0000000..e8369fa --- /dev/null +++ b/src/aymo_cpu_arm.c @@ -0,0 +1,61 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
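All of the SSE4.1 and NEON conversion routines above share the same tail strategy: full-width groups first, then one half-width group, then a zero-padded vector for the last one to three samples. A short trace for the SSE4.1 i16-to-f32 routine with n == 11:

/* n == 11:
 *   nw = 11 / 8 = 1   -> one main-loop iteration converts samples 0..7
 *   n %= 8            -> 3, so the 4-sample branch is skipped
 *   switch (3)        -> samples 8..10 are widened into a zero-padded vector and stored lane by lane
 */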
+*/ + +#include "aymo_cpu.h" +#if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + +AYMO_CXX_EXTERN_C_BEGIN + + +static unsigned aymo_cpu_arm_extensions; + + +void aymo_cpu_arm_boot(void) +{ + unsigned mask = 0u; + +#ifdef AYMO_CPU_PRESUME_ARM_NEON + mask |= AYMO_CPU_ARM_EXT_NEON; +#endif +#ifdef AYMO_CPU_PRESUME_ARM_NEON64 + mask |= AYMO_CPU_ARM_EXT_NEON64; +#endif + + // FIXME: TODO: feature detection +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + mask |= AYMO_CPU_ARM_EXT_NEON; +#endif +#ifdef AYMO_CPU_SUPPORT_ARM_NEON64 + mask |= AYMO_CPU_ARM_EXT_NEON64; +#endif + + aymo_cpu_arm_extensions = mask; +} + + +unsigned aymo_cpu_arm_get_extensions(void) +{ + return aymo_cpu_arm_extensions; +} + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) diff --git a/src/aymo_cpu_x86.c b/src/aymo_cpu_x86.c new file mode 100644 index 0000000..5456c9f --- /dev/null +++ b/src/aymo_cpu_x86.c @@ -0,0 +1,119 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + +#ifdef AYMO_CPU_HAVE_CPUINFO + #if defined(AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID) + #include + #elif defined(AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUID) + #include + #endif +#endif // AYMO_CPU_HAVE_CPUINFO + +AYMO_CXX_EXTERN_C_BEGIN + +#define AYMO_CPU_X86_CPUID_SSE (1uL << 25u) // edx[25] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE2 (1uL << 26u) // edx[26] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE3 (1uL << 0u) // ecx[ 0] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSSE3 (1uL << 9u) // ecx[ 9] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE41 (1uL << 19u) // ecx[19] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE42 (1uL << 20u) // ecx[20] @ leaf 1 +#define AYMO_CPU_X86_CPUID_AVX (1uL << 28u) // ecx[28] @ leaf 1 +#define AYMO_CPU_X86_CPUID_AVX2 (1uL << 5u) // ebx[ 5] @ leaf 7.0 +#define AYMO_CPU_X86_CPUID_FMA (1uL << 12u) // ecx[12] @ leaf 1 + + +static unsigned aymo_cpu_x86_extensions; + + +void aymo_cpu_x86_boot(void) +{ + unsigned mask = 0u; + +#ifdef AYMO_CPU_PRESUME_X86_SSE + mask |= AYMO_CPU_X86_EXT_SSE; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE2 + mask |= AYMO_CPU_X86_EXT_SSE2; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE3 + mask |= AYMO_CPU_X86_EXT_SSE3; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSSE3 + mask |= AYMO_CPU_X86_EXT_SSSE3; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE41 + mask |= AYMO_CPU_X86_EXT_SSE41; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE42 + mask |= AYMO_CPU_X86_EXT_SSE42; +#endif +#ifdef AYMO_CPU_PRESUME_X86_AVX + mask |= AYMO_CPU_X86_EXT_AVX; +#endif +#ifdef AYMO_CPU_PRESUME_X86_AVX2 + mask |= AYMO_CPU_X86_EXT_AVX2; +#endif +#ifdef AYMO_CPU_PRESUME_X86_FMA3 + mask |= AYMO_CPU_X86_EXT_FMA3; +#endif + +#ifdef AYMO_CPU_HAVE_CPUINFO + unsigned e1[4] = { 0u, 0u, 0u, 0u }; + unsigned e7[4] = { 0u, 0u, 0u, 0u }; + + #if defined(AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID) + __cpuid(1u, 
e1[0], e1[1], e1[2], e1[3]); + #elif defined(AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUID) + __cpuid((int*)e1, 1); + #endif + + #if defined(AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID_COUNT) + __cpuid_count(7u, 0u, e7[0], e7[1], e7[2], e7[3]); + #elif defined(AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUIDEX) + __cpuidex((int*)e7, 7, 0); + #endif + + if (e1[3] & AYMO_CPU_X86_CPUID_SSE ) { mask |= AYMO_CPU_X86_EXT_SSE; } + if (e1[3] & AYMO_CPU_X86_CPUID_SSE2 ) { mask |= AYMO_CPU_X86_EXT_SSE2; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSE3 ) { mask |= AYMO_CPU_X86_EXT_SSE3; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSSE3) { mask |= AYMO_CPU_X86_EXT_SSSE3; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSE41) { mask |= AYMO_CPU_X86_EXT_SSE41; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSE42) { mask |= AYMO_CPU_X86_EXT_SSE42; } + if (e1[2] & AYMO_CPU_X86_CPUID_AVX ) { mask |= AYMO_CPU_X86_EXT_AVX; } + if (e7[1] & AYMO_CPU_X86_CPUID_AVX2 ) { mask |= AYMO_CPU_X86_EXT_AVX2; } + if (e1[2] & AYMO_CPU_X86_CPUID_FMA ) { mask |= AYMO_CPU_X86_EXT_FMA3; } +#endif // AYMO_CPU_HAVE_CPUINFO + + aymo_cpu_x86_extensions = mask; +} + + +unsigned aymo_cpu_x86_get_extensions(void) +{ + return aymo_cpu_x86_extensions; +} + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) diff --git a/src/aymo_empty.c b/src/aymo_empty.c new file mode 100644 index 0000000..439e361 --- /dev/null +++ b/src/aymo_empty.c @@ -0,0 +1 @@ +/* Just an empty file to make Meson happy :-) */ diff --git a/src/aymo_file.c b/src/aymo_file.c new file mode 100644 index 0000000..a0e7b83 --- /dev/null +++ b/src/aymo_file.c @@ -0,0 +1,133 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
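A standalone sketch of how the cached extension mask can drive backend selection; AYMO's own conversion front end presumably does this internally, and the AVX2 and plain-C function names below are assumed by analogy with the aymo_convert_x86_sse41_* symbols used earlier in this patch:

#include <stddef.h>
#include <stdint.h>
#include "aymo_cpu.h"
#include "aymo_convert_none.h"
#include "aymo_convert_x86_sse41.h"
#include "aymo_convert_x86_avx2.h"

typedef void (*i16_f32_fn)(size_t n, const int16_t i16v[], float f32v[]);

static i16_f32_fn pick_i16_f32(void)
{
    aymo_cpu_boot();  /* fills the extension mask queried below */
#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64))
    unsigned ext = aymo_cpu_x86_get_extensions();
    if (ext & AYMO_CPU_X86_EXT_AVX2)  { return aymo_convert_x86_avx2_i16_f32; }   /* assumed name */
    if (ext & AYMO_CPU_X86_EXT_SSE41) { return aymo_convert_x86_sse41_i16_f32; }
#endif
    return aymo_convert_none_i16_f32;  /* assumed name */
}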
+*/
+
+#include "aymo_file.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+AYMO_CXX_EXTERN_C_BEGIN
+
+
+static unsigned char aymo_file_chunk[AYMO_FILE_CHUNK_SIZE];
+
+
+int aymo_file_save(const char* pathp, const void* datap, size_t size)
+{
+    const char* chunkp = (const char*)datap;
+    FILE* filep = (FILE*)NULL;
+    size_t total = 0u;
+    size_t subsize = 0u;
+
+    assert(pathp != NULL);
+    assert(*pathp != '\0');
+    if (datap == NULL) {
+        size = 0u;
+    }
+
+    filep = fopen(pathp, "wb");
+    if (filep == NULL) {
+        perror("fopen()");
+        goto error_;
+    }
+
+    while (total < size) {
+        subsize = fwrite(chunkp, 1u, (size - total), filep);
+        if (subsize == 0u) {
+            perror("fwrite()");
+            goto error_;
+        }
+
+        total += subsize;
+        chunkp += subsize;
+    }
+    (void)fclose(filep);
+    return 0;
+
+error_:
+    if (filep != NULL) {
+        (void)fclose(filep);
+    }
+    return 1;
+}
+
+
+int aymo_file_load(const char* pathp, void** datapp, size_t* sizep)
+{
+    FILE* filep = (FILE*)NULL;
+    void* datap = NULL;
+    void* newp = NULL;
+    size_t total = 0u;
+    size_t subsize = 0u;
+
+    assert(pathp != NULL);
+    assert(*pathp != '\0');
+    assert(datapp != NULL);
+    assert(sizep != NULL);
+
+    *datapp = NULL;
+    *sizep = 0U;
+
+    filep = fopen(pathp, "rb");
+    if (filep == NULL) {
+        perror("fopen()");
+        goto error_;
+    }
+
+    datap = malloc(1u);
+    if (datap == NULL) {
+        perror("malloc()");
+        goto error_;
+    }
+
+    while (!feof(filep)) {
+        subsize = fread(&aymo_file_chunk[0], 1u, AYMO_FILE_CHUNK_SIZE, filep);
+        if (subsize == 0u) {
+            if (ferror(filep)) {
+                perror("fread()");
+                goto error_;
+            }
+            break;  // regular end-of-file
+        }
+
+        newp = realloc(datap, (total + subsize));
+        if (newp == NULL) {
+            perror("realloc()");
+            goto error_;
+        }
+        datap = newp;
+
+        (void)memcpy((unsigned char*)datap + total, &aymo_file_chunk[0], subsize);
+
+        total += subsize;
+    }
+    (void)fclose(filep);
+    *datapp = datap;
+    *sizep = total;
+    return 0;
+
+error_:
+    if (filep != NULL) {
+        (void)fclose(filep);
+    }
+    aymo_file_unload(datap);
+    return 1;
+}
+
+
+void aymo_file_unload(void* datap)
+{
+    if (datap) {
+        free(datap);
+    }
+}
+
+
+AYMO_CXX_EXTERN_C_END
diff --git a/src/aymo_score.c b/src/aymo_score.c
new file mode 100644
index 0000000..019a3fb
--- /dev/null
+++ b/src/aymo_score.c
@@ -0,0 +1,153 @@
+/*
+AYMO - Accelerated YaMaha Operator
+Copyright (c) 2023-2024 Andrea Zoppi.
+
+This file is part of AYMO.
+
+AYMO is free software: you can redistribute it and/or modify it under the
+terms of the GNU Lesser General Public License as published by the Free
+Software Foundation, either version 2.1 of the License, or (at your option)
+any later version.
+
+AYMO is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with AYMO. If not, see <https://www.gnu.org/licenses/>.
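A minimal usage sketch for the file helpers above (the file path is hypothetical; error handling is reduced to the return code):

#include <stddef.h>
#include "aymo_file.h"

static int demo_load(void)
{
    void* data = NULL;
    size_t size = 0u;
    if (aymo_file_load("score.dro", &data, &size)) {
        return 1;  /* fopen/fread/realloc failures were already reported via perror() */
    }
    /* ... hand (data, size) to a score loader ... */
    aymo_file_unload(data);
    return 0;
}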
+*/ + +#include "aymo_score.h" +#include "aymo_score_avd.h" +#include "aymo_score_dro.h" +#include "aymo_score_imf.h" +#include "aymo_score_raw.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +int aymo_score_ctor( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + return score->vt->ctor(score); +} + + +void aymo_score_dtor( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + score->vt->dtor(score); +} + + +int aymo_score_load( + struct aymo_score_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + assert(score->vt); + return score->vt->load(score, data, size); +} + + +void aymo_score_unload( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + score->vt->unload(score); +} + + +struct aymo_score_status* aymo_score_get_status( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + return score->vt->get_status(score); +} + + +void aymo_score_restart( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + score->vt->restart(score); +} + + +uint32_t aymo_score_tick( + struct aymo_score_instance* score, + uint32_t count +) +{ + assert(score); + assert(score->vt); + return score->vt->tick(score, count); +} + + +enum aymo_score_type aymo_score_ext_to_type( + const char *tag +) +{ + if (tag != NULL) { + if (((tag[0] == 'A') || (tag[0] == 'a')) && + ((tag[1] == 'V') || (tag[1] == 'v')) && + ((tag[2] == 'D') || (tag[2] == 'd')) && + (tag[3] == '\0')) { + return aymo_score_type_avd; + } + if (((tag[0] == 'D') || (tag[0] == 'd')) && + ((tag[1] == 'R') || (tag[1] == 'r')) && + ((tag[2] == 'O') || (tag[2] == 'o')) && + (tag[3] == '\0')) { + return aymo_score_type_dro; + } + if (((tag[0] == 'I') || (tag[0] == 'i')) && + ((tag[1] == 'M') || (tag[1] == 'm')) && + ((tag[2] == 'F') || (tag[2] == 'f')) && + (tag[3] == '\0')) { + return aymo_score_type_imf; + } + if (((tag[0] == 'R') || (tag[0] == 'r')) && + ((tag[1] == 'A') || (tag[1] == 'a')) && + ((tag[2] == 'W') || (tag[2] == 'w')) && + (tag[3] == '\0')) { + return aymo_score_type_raw; + } + } + return aymo_score_type_unknown; +} + + +const struct aymo_score_vt* aymo_score_type_to_vt( + enum aymo_score_type score_type +) +{ + switch (score_type) { + case aymo_score_type_avd: return &aymo_score_avd_vt; + case aymo_score_type_dro: return &aymo_score_dro_vt; + case aymo_score_type_imf: return &aymo_score_imf_vt; + case aymo_score_type_raw: return &aymo_score_raw_vt; + default: return NULL; + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_avd.c b/src/aymo_score_avd.c new file mode 100644 index 0000000..d3606a6 --- /dev/null +++ b/src/aymo_score_avd.c @@ -0,0 +1,174 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
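For instance, resolving a player from a file name extension is a two-step lookup over the functions defined above:

#include <stddef.h>
#include "aymo_score.h"

/* "dro" in any letter case resolves to &aymo_score_dro_vt; unknown tags yield NULL. */
static const struct aymo_score_vt* vt_for_ext(const char* ext)
{
    enum aymo_score_type type = aymo_score_ext_to_type(ext);
    return aymo_score_type_to_vt(type);
}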
+*/ + +#include "aymo_score_avd.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_avd_vt = { + "aymo_score_avd", + (aymo_score_ctor_f)aymo_score_avd_ctor, + (aymo_score_dtor_f)aymo_score_avd_dtor, + (aymo_score_load_f)aymo_score_avd_load, + (aymo_score_unload_f)aymo_score_avd_unload, + (aymo_score_get_status_f)aymo_score_avd_get_status, + (aymo_score_restart_f)aymo_score_avd_restart, + (aymo_score_tick_f)aymo_score_avd_tick +}; + + +int aymo_score_avd_ctor( + struct aymo_score_avd_instance* score +) +{ + assert(score); + + score->vt = &aymo_score_avd_vt; + + score->events = NULL; + score->length = 0u; + aymo_score_avd_restart(score); + return 0; +} + + +void aymo_score_avd_dtor( + struct aymo_score_avd_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_avd_load( + struct aymo_score_avd_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + + uint32_t length = (size / sizeof(struct aymo_score_avd_event)); + assert(!length || data); + + score->events = (const struct aymo_score_avd_event*)data; + score->length = length; + aymo_score_avd_restart(score); + return 0; +} + + +void aymo_score_avd_unload( + struct aymo_score_avd_instance* score +) +{ + aymo_score_avd_ctor(score); +} + + +struct aymo_score_status* aymo_score_avd_get_status( + struct aymo_score_avd_instance* score +) +{ + assert(score); + return &score->status; +} + + +void aymo_score_avd_restart( + struct aymo_score_avd_instance* score +) +{ + assert(score); + + score->index = 0u; + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->index >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +uint32_t aymo_score_avd_tick( + struct aymo_score_avd_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->index < score->length) { + const struct aymo_score_avd_event* event = &score->events[score->index++]; + + if (event->address_hi & 0x80u) { // delay tag + uint32_t delay = (((uint32_t)(event->address_hi & 0x7Fu) << 16u) | + ((uint32_t)event->address_lo << 8u) | event->value); + if (delay) { + score->status.delay = delay; + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + } + else { + score->status.address = (((uint16_t)event->address_hi << 8u) | event->address_lo); + score->status.value = event->value; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_dro.c b/src/aymo_score_dro.c new file mode 100644 index 0000000..e9ab2db --- /dev/null +++ b/src/aymo_score_dro.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
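As a worked example of the delay encoding handled by aymo_score_avd_tick() above, a 1000-tick (0x0003E8) pause packs into a single event; the field names match those read by the decoder, while building an event literal like this is only illustrative:

#include <stdint.h>
#include "aymo_score_avd.h"

static const struct aymo_score_avd_event avd_pause_1000 = {
    .address_hi = (0x80u | ((1000u >> 16) & 0x7Fu)),  /* 0x80: the top bit tags a delay */
    .address_lo = ((1000u >> 8) & 0xFFu),             /* 0x03 */
    .value      = (1000u & 0xFFu)                     /* 0xE8 */
};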
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_score_dro.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_dro_vt = { + "aymo_score_dro", + (aymo_score_ctor_f)aymo_score_dro_ctor, + (aymo_score_dtor_f)aymo_score_dro_dtor, + (aymo_score_load_f)aymo_score_dro_load, + (aymo_score_unload_f)aymo_score_dro_unload, + (aymo_score_get_status_f)aymo_score_dro_get_status, + (aymo_score_restart_f)aymo_score_dro_restart, + (aymo_score_tick_f)aymo_score_dro_tick +}; + + +static inline uint16_t make_u16le(uint8_t lo, uint8_t hi) +{ + return (uint16_t)((unsigned)lo | ((unsigned)hi << 8u)); +} + + +int aymo_score_dro_ctor_specific( + struct aymo_score_dro_instance* score, + uint32_t opl_rate +) +{ + assert(score); + assert(opl_rate); + + uint32_t division = (opl_rate / 1000u); // TODO: improve resolution via fixed point 24.8 + division += (uint32_t)(division == 0u); + + score->vt = &aymo_score_dro_vt; + + score->header = NULL; + score->v1_header = NULL; + score->v2_header = NULL; + score->codemap = NULL; + score->events = NULL; + + score->opl_rate = opl_rate; + score->division = division; + score->length = 0u; + score->offset = 0u; + score->address_hi = 0u; + + aymo_score_dro_restart(score); + return 0; +} + + +int aymo_score_dro_ctor( + struct aymo_score_dro_instance* score +) +{ + return aymo_score_dro_ctor_specific(score, AYMO_SCORE_OPL_RATE_DEFAULT); +} + + +void aymo_score_dro_dtor( + struct aymo_score_dro_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_dro_load( + struct aymo_score_dro_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + assert(data); + assert(size); + + score->header = NULL; + score->v1_header = NULL; + score->v2_header = NULL; + score->codemap = NULL; + score->events = NULL; + score->length = 0u; + + aymo_score_dro_restart(score); + + if (size < sizeof(struct aymo_score_dro_header)) { + return 1; + } + const uint8_t* ptr = (const uint8_t*)data; + const struct aymo_score_dro_header* header = NULL; + header = (const struct aymo_score_dro_header*)(const void*)ptr; + ptr += sizeof(struct aymo_score_dro_header); + size -= sizeof(struct aymo_score_dro_header); + const struct aymo_score_dro_v1_header* v1_header = NULL; + const struct aymo_score_dro_v2_header* v2_header = NULL; + const uint8_t* codemap = NULL; + const uint8_t* events = NULL; + uint32_t length = 0u; + + for (unsigned i = 0u; i < 8u; ++i) { + if (header->signature[i] != AYMO_DRO_SIGNATURE[i]) { + return 1; + } + } + + if ((((header->version_major == 0u) && (header->version_minor == 1u)) || + ((header->version_major == 1u) && (header->version_minor == 0u)))) { + if (size < sizeof(struct aymo_score_dro_v1_header)) { + return 1; + } + v1_header = (const struct aymo_score_dro_v1_header*)(const void*)ptr; + ptr += sizeof(struct aymo_score_dro_v1_header); + size -= sizeof(struct aymo_score_dro_v1_header); + if ((v1_header->hardware_extra[0] || + v1_header->hardware_extra[1] 
|| + v1_header->hardware_extra[2])) { + ptr -= 3u; + } + events = ptr; + length = v1_header->length_bytes; + } + else if ((header->version_major == 2u) && (header->version_minor == 0u)) { + if (size < sizeof(struct aymo_score_dro_v1_header)) { + return 1; + } + v2_header = (const struct aymo_score_dro_v2_header*)(const void*)ptr; + ptr += sizeof(struct aymo_score_dro_v2_header); + size -= sizeof(struct aymo_score_dro_v2_header); + if (v2_header->format != (uint8_t)aymo_score_dro_v2_format_interleaved) { + return 1; + } + if (v2_header->codemap_length > 128u) { + return 1; + } + if (size < v2_header->codemap_length) { + return 1; + } + codemap = ptr; + ptr += v2_header->codemap_length; + size -= v2_header->codemap_length; + events = ptr; + length = (v2_header->length_pairs * sizeof(struct aymo_score_dro_pair)); + } + else { + return 1; + } + + score->header = header; + score->v1_header = v1_header; + score->v2_header = v2_header; + score->codemap = codemap; + score->events = events; + score->length = length; + + aymo_score_dro_restart(score); + return 0; +} + + +void aymo_score_dro_unload( + struct aymo_score_dro_instance* score +) +{ + aymo_score_dro_restart(score); +} + + +void aymo_score_dro_restart( + struct aymo_score_dro_instance* score +) +{ + assert(score); + + score->offset = 0u; + score->address_hi = 0u; + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->offset >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +static void aymo_score_dro_decode_v1( + struct aymo_score_dro_instance* score +) +{ + const uint8_t* ptr = &(score->events[score->offset]); + + switch ((enum aymo_score_dro_v1_code)ptr[0]) { + case aymo_score_dro_v1_code_delay_byte: { + if ((score->offset + 1u) <= score->length) { + score->status.delay = ((ptr[1] + 1uL) * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + score->offset += 2u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + score->offset = score->length; + } + break; + } + case aymo_score_dro_v1_code_delay_word: { + if ((score->offset + 2u) <= score->length) { + score->status.delay = ((make_u16le(ptr[1], ptr[2]) + 1uL) * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + score->offset += 3u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + } + break; + } + case aymo_score_dro_v1_code_switch_low: { + score->address_hi = 0u; + score->offset += 1u; + break; + } + case aymo_score_dro_v1_code_switch_high: { + score->address_hi = 1u; + score->offset += 1u; + break; + } + case aymo_score_dro_v1_code_escape: { + if ((score->offset + 2u) <= score->length) { + score->status.address = make_u16le(ptr[1], score->address_hi); + score->status.value = ptr[2]; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + score->offset += 3u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + } + break; + } + case aymo_score_dro_v1_code_invalid: + default: { + if ((score->offset + 2u) <= score->length) { + score->status.address = make_u16le(ptr[0], score->address_hi); + score->status.value = ptr[1]; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + score->offset += 2u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + } + break; + } + } +} + + +static void aymo_score_dro_decode_v2( + struct aymo_score_dro_instance* score +) +{ + const struct aymo_score_dro_v2_header *v2_header = score->v2_header; + const uint8_t* 
ptr = &(score->events[score->offset]); + + if (ptr[0] == v2_header->short_delay_code) { + score->status.delay = (ptr[1] + 1uL); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (ptr[0] == v2_header->long_delay_code) { + score->status.delay = ((ptr[1] + 1uL) * 256u); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if ((ptr[0] & 0xFFu) < v2_header->codemap_length) { + score->address_hi = ((ptr[0] & 0x80u) >> 7u); + uint8_t address_lo = score->codemap[ptr[0] & 0xFFu]; + score->status.address = make_u16le(address_lo, score->address_hi); + score->status.value = ptr[1]; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + } + score->offset += 2u; +} + + +struct aymo_score_status* aymo_score_dro_get_status( + struct aymo_score_dro_instance* score +) +{ + assert(score); + return &score->status; +} + + +uint32_t aymo_score_dro_tick( + struct aymo_score_dro_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->offset < score->length) { + if (score->v2_header) { + aymo_score_dro_decode_v2(score); + } + else if (score->v1_header) { + aymo_score_dro_decode_v1(score); + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + break; + } + + if (score->status.flags & AYMO_SCORE_FLAG_EVENT) { + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_imf.c b/src/aymo_score_imf.c new file mode 100644 index 0000000..ec0d1f3 --- /dev/null +++ b/src/aymo_score_imf.c @@ -0,0 +1,266 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_score_imf.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_imf_vt = { + "aymo_score_imf", + (aymo_score_ctor_f)aymo_score_imf_ctor, + (aymo_score_dtor_f)aymo_score_imf_dtor, + (aymo_score_load_f)aymo_score_imf_load, + (aymo_score_unload_f)aymo_score_imf_unload, + (aymo_score_get_status_f)aymo_score_imf_get_status, + (aymo_score_restart_f)aymo_score_imf_restart, + (aymo_score_tick_f)aymo_score_imf_tick +}; + + +// See: https://moddingwiki.shikadi.net/wiki/IMF_Format +uint8_t aymo_score_imf_guess_type( + const void* data, + uint32_t size +) +{ + assert(data); + + if (size < 2u) { + return 0u; + } + + const uint8_t* ptr = (const uint8_t *)data; + uint16_t word = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + ptr += 2u; + if (!word || (word & 3u)) { + return 0u; + } + + uint32_t sum1 = 0u; + uint32_t sum2 = 0u; + uint16_t i = 42u; + + while ((size >= 4u) && i--) + { + word = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + ptr += 2u; + sum1 += word; + + word = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + ptr += 2u; + sum2 += word; + } + return (uint8_t)(sum1 > sum2); +} + + +int aymo_score_imf_ctor_specific( + struct aymo_score_imf_instance* score, + uint32_t imf_rate, + uint32_t opl_rate +) +{ + assert(score); + assert(opl_rate); + assert(imf_rate); + + uint32_t division = (opl_rate / imf_rate); // TODO: improve resolution via fixed point 24.8 + division += (uint32_t)(division == 0u); + + score->vt = &aymo_score_imf_vt; + + score->events = NULL; + score->imf_rate = imf_rate; + score->opl_rate = opl_rate; + score->division = division; + score->length = 0u; + score->type = 0u; + score->address_hi = 0u; + + aymo_score_imf_restart(score); + return 0; +} + + +int aymo_score_imf_ctor( + struct aymo_score_imf_instance* score +) +{ + return aymo_score_imf_ctor_specific(score, aymo_score_imf_rate_default, AYMO_SCORE_OPL_RATE_DEFAULT); +} + + +void aymo_score_imf_dtor( + struct aymo_score_imf_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_imf_load_specific( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size, + uint8_t type +) +{ + assert(score); + assert(data); + assert(size); + + score->type = type; + + if (type) { + const uint8_t* ptr = (const uint8_t*)data; + uint32_t length_by_header = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + length_by_header /= sizeof(struct aymo_score_imf_event); + score->length = length_by_header; + score->events = (const struct aymo_score_imf_event*)(const void*)&ptr[2]; + + uint32_t length_by_size = (uint32_t)(size - 2); + length_by_size /= sizeof(struct aymo_score_imf_event); + if (score->length > length_by_size) { + score->length = length_by_size; + } + } + else { + uint32_t length_by_size = (uint32_t)size; + length_by_size /= sizeof(struct aymo_score_imf_event); + score->length = length_by_size; + score->events = (const struct aymo_score_imf_event*)data; + } + + aymo_score_imf_restart(score); + return 0; +} + + +int aymo_score_imf_load( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size +) +{ + uint8_t type = aymo_score_imf_guess_type(data, size); + return aymo_score_imf_load_specific(score, data, size, type); +} + + +void aymo_score_imf_unload( + struct aymo_score_imf_instance* score +) +{ + aymo_score_imf_restart(score); +} + + +struct aymo_score_status* aymo_score_imf_get_status( + struct aymo_score_imf_instance* score +) +{ + assert(score); + return &score->status; +} + + +void aymo_score_imf_restart( + struct aymo_score_imf_instance* score +) +{ 
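+    // Rewind playback: reset the event index and the OPL high-address latch,
+    // and clear any pending delay/event status. An empty score raises
+    // AYMO_SCORE_FLAG_EOF right away, so the first tick() reports end-of-score.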
+ assert(score); + + score->index = 0u; + score->address_hi = 0u; + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->index >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +uint32_t aymo_score_imf_tick( + struct aymo_score_imf_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->index < score->length) { + const struct aymo_score_imf_event* event = &score->events[score->index++]; + + uint16_t delay = (((uint16_t)event->delay_hi << 8u) | event->delay_lo); + if (delay) { + score->status.delay = (delay * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + + // Override virtual register 0x05 to extend the address range for OPL3 + if AYMO_UNLIKELY(event->address_lo == 0x05u) { + score->address_hi = (event->value & 0x01u); + } + else { + score->status.address = ((uint16_t)(score->address_hi << 8u) | event->address_lo); + score->status.value = event->value; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_raw.c b/src/aymo_score_raw.c new file mode 100644 index 0000000..53bd148 --- /dev/null +++ b/src/aymo_score_raw.c @@ -0,0 +1,231 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_score_raw.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_raw_vt = { + "aymo_score_raw", + (aymo_score_ctor_f)aymo_score_raw_ctor, + (aymo_score_dtor_f)aymo_score_raw_dtor, + (aymo_score_load_f)aymo_score_raw_load, + (aymo_score_unload_f)aymo_score_raw_unload, + (aymo_score_get_status_f)aymo_score_raw_get_status, + (aymo_score_restart_f)aymo_score_raw_restart, + (aymo_score_tick_f)aymo_score_raw_tick +}; + + +static void aymo_score_raw_update_clock( + struct aymo_score_raw_instance* score +) +{ + score->clock += (uint16_t)(score->clock == 0u); + score->raw_rate = (AYMO_SCORE_RAW_REFCLK / score->clock); + score->division = (AYMO_SCORE_OPL_RATE_DEFAULT / score->raw_rate); // TODO: improve resolution via fixed point 24.8 + score->division += (uint32_t)(score->division == 0u); +} + + +int aymo_score_raw_ctor( + struct aymo_score_raw_instance* score +) +{ + assert(score); + + score->vt = &aymo_score_raw_vt; + + score->events = NULL; + score->raw_rate = AYMO_SCORE_RAW_REFCLK; + score->division = 1u; + score->length = 0u; + score->address_hi = 0u; + + aymo_score_raw_restart(score); + return 0; +} + + +void aymo_score_raw_dtor( + struct aymo_score_raw_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_raw_load( + struct aymo_score_raw_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + assert(data); + assert(size); + + if (size < sizeof(struct aymo_score_raw_header)) { + return 1; + } + const uint8_t* ptr = (const uint8_t*)data; + + if (((ptr[0] != 'R') || + (ptr[1] != 'A') || + (ptr[2] != 'W') || + (ptr[3] != 'A') || + (ptr[4] != 'D') || + (ptr[5] != 'A') || + (ptr[6] != 'T') || + (ptr[7] != 'A'))) { + return 1; + } + score->clock_initial = *(const uint16_t*)(const void*)&ptr[8]; + score->events = (const struct aymo_score_raw_event*)(const void*)&ptr[10]; + + uint32_t length_by_size = (uint32_t)(size - sizeof(struct aymo_score_raw_header)); + length_by_size /= sizeof(struct aymo_score_raw_event); + if (score->length > length_by_size) { + score->length = length_by_size; + } + + aymo_score_raw_restart(score); + return 0; +} + + +void aymo_score_raw_unload( + struct aymo_score_raw_instance* score +) +{ + aymo_score_raw_restart(score); +} + + +struct aymo_score_status* aymo_score_raw_get_status( + struct aymo_score_raw_instance* score +) +{ + assert(score); + return &score->status; +} + + +void aymo_score_raw_restart( + struct aymo_score_raw_instance* score +) +{ + assert(score); + + score->index = 0u; + score->address_hi = 0u; + score->clock = score->clock_initial; + aymo_score_raw_update_clock(score); + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->index >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +uint32_t aymo_score_raw_tick( + struct aymo_score_raw_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->index < score->length) { + const struct aymo_score_raw_event* event = &score->events[score->index++]; + + if (event->ctrl == 
0x00u) { + uint8_t delay = event->data; + if (delay) { + score->status.delay = (delay * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + } + else if (event->ctrl == 0x02u) { + if (event->ctrl == 0x00u) { + if ((score->index + 1u) < score->length) { + score->index++; + score->clock = *(const uint16_t*)(void*)++event; + aymo_score_raw_update_clock(score); + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } + else if (event->ctrl == 0x01u) { + score->address_hi = 0u; + } + else if (event->ctrl == 0x02u) { + score->address_hi = 1u; + } + } + else { + score->status.address = ((uint16_t)(score->address_hi << 8u) | event->ctrl); + score->status.value = event->data; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_sys_linux.c b/src/aymo_sys_linux.c new file mode 100644 index 0000000..01d943d --- /dev/null +++ b/src/aymo_sys_linux.c @@ -0,0 +1,19 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ diff --git a/src/aymo_sys_windows.c b/src/aymo_sys_windows.c new file mode 100644 index 0000000..902a949 --- /dev/null +++ b/src/aymo_sys_windows.c @@ -0,0 +1,71 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cc.h" +#if (defined(AYMO_CC_HOST_WINDOWS) || defined(AYMO_CC_HOST_CYGWIN)) + +#define WIN32_LEAN_AND_MEAN 1 +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +BOOL WINAPI DllMain( + _In_ HINSTANCE hinstDLL, + _In_ DWORD fdwReason, + _In_ LPVOID lpvReserved +) +{ + // Perform actions based on the reason for calling. + switch (fdwReason) + { + case DLL_PROCESS_ATTACH: { + // Initialize once for each new process. + // Return FALSE to fail DLL load. + + // Thread optimization. + DisableThreadLibraryCalls(hinstDLL); + break; + } + case DLL_THREAD_ATTACH: { + // Do thread-specific initialization. + break; + } + case DLL_THREAD_DETACH: { + // Do thread-specific cleanup. + break; + } + case DLL_PROCESS_DETACH: { + if (lpvReserved) { + // Do not do cleanup if process termination scenario. 
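+            // lpvReserved is non-NULL when the whole process is terminating:
+            // the OS reclaims all resources and dependent DLLs may already be
+            // unloaded, so skipping explicit cleanup here is the safe choice.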
+ break; + } + // Perform any necessary cleanup. + break; + } + default: break; + } + return TRUE; // Successful DLL_PROCESS_ATTACH. +} + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CC_HOST_WINDOWS) || defined(AYMO_CC_HOST_CYGWIN)) diff --git a/src/aymo_tda8425.c b/src/aymo_tda8425.c new file mode 100644 index 0000000..41981ff --- /dev/null +++ b/src/aymo_tda8425.c @@ -0,0 +1,172 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#include "aymo_tda8425.h" +#include "aymo_tda8425_arm_neon.h" +#include "aymo_tda8425_none.h" +#include "aymo_tda8425_x86_avx2.h" +#include "aymo_tda8425_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_tda8425_math* aymo_tda8425_math; + +static const struct aymo_tda8425_vt* aymo_tda8425_best_vt; + + +void aymo_tda8425_boot(const struct aymo_tda8425_math* math) +{ + assert(math); + + aymo_tda8425_math = math; + + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + aymo_tda8425_best_vt = aymo_tda8425_x86_avx2_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_tda8425_best_vt = aymo_tda8425_x86_sse41_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_tda8425_best_vt = aymo_tda8425_arm_neon_get_vt(); + return; + } + #endif + + aymo_tda8425_best_vt = aymo_tda8425_none_get_vt(); +} + + +const struct aymo_tda8425_vt* aymo_tda8425_get_vt(const char* cpu_ext) +{ + if (cpu_ext == NULL) { + return NULL; + } + + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (!aymo_strcmp(cpu_ext, "x86_avx2")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + return aymo_tda8425_x86_avx2_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (!aymo_strcmp(cpu_ext, "x86_sse41")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + return aymo_tda8425_x86_sse41_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (!aymo_strcmp(cpu_ext, "arm_neon")) { + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + return aymo_tda8425_arm_neon_get_vt(); + } + } + #endif + + if (!aymo_strcmp(cpu_ext, "none")) { + return aymo_tda8425_none_get_vt(); + } + return NULL; +} + + +const struct aymo_tda8425_vt* aymo_tda8425_get_best_vt(void) +{ + return aymo_tda8425_best_vt; +} + + +uint32_t aymo_tda8425_get_sizeof(struct aymo_tda8425_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_sizeof); + + return chip->vt->get_sizeof(); +} + + +void aymo_tda8425_ctor(struct aymo_tda8425_chip* chip, float sample_rate) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->ctor); + + chip->vt->ctor(chip, sample_rate); +} + + +void aymo_tda8425_dtor(struct aymo_tda8425_chip* chip) +{ + assert(chip); + 
assert(chip->vt); + assert(chip->vt->dtor); + + chip->vt->dtor(chip); +} + + +uint8_t aymo_tda8425_read(struct aymo_tda8425_chip* chip, uint16_t address) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->read); + + return chip->vt->read(chip, address); +} + + +void aymo_tda8425_write(struct aymo_tda8425_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->write); + + chip->vt->write(chip, address, value); +} + + +void aymo_tda8425_process_f32(struct aymo_tda8425_chip* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->process_f32); + + chip->vt->process_f32(chip, count, x, y); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_tda8425_arm_neon.c b/src/aymo_tda8425_arm_neon.c new file mode 100644 index 0000000..c2a56b6 --- /dev/null +++ b/src/aymo_tda8425_arm_neon.c @@ -0,0 +1,504 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_arm_neon.h" +#include "aymo_tda8425.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +#undef cos +#undef fabs +#undef log10 +#undef pow +#undef sqrt +#undef tan + +#define cos (aymo_tda8425_math->cos) +#define fabs (aymo_tda8425_math->fabs) +#define log10 (aymo_tda8425_math->log10) +#define pow (aymo_tda8425_math->pow) +#define sqrt (aymo_tda8425_math->sqrt) +#define tan (aymo_tda8425_math->tan) + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > 0.f); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Setup default parameters + chip->sample_rate = sample_rate; + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[0]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[0]; + + // Setup default registers + aymo_(write)(chip, 0x00u, 0xFCu); // VL: 0 dB + aymo_(write)(chip, 0x01u, 0xFCu); // VR: 0 dB + aymo_(write)(chip, 0x02u, 0xF6u); // BA: 0 dB + aymo_(write)(chip, 0x03u, 0xF6u); // TR: 0 dB + aymo_(write)(chip, 0x07u, 0xFCu); // PP: light pseudo + aymo_(write)(chip, 0x08u, 0xCEu); // SF: linear stereo, channel 1, unmuted +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +static void aymo_(apply_vl)(struct aymo_(chip)* chip) +{ + double db = 
(double)aymo_tda8425_reg_v_to_db[chip->reg_vl & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = vset_lane_f32((float)g, chip->kv, 0); +} + + +static void aymo_(apply_vr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vr & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = vset_lane_f32((float)g, chip->kv, 1); +} + + +static void aymo_(apply_ba)(struct aymo_(chip)* chip) +{ + double dbb = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double gb = pow(10., (dbb * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcb = 300.; // [Hz] + double wb = ((2. * pi) * fcb); + double kb = (tan(wb * (.5 / fs)) / wb); + + double a0 = ((kb * wb) + gb); + double a1 = ((kb * wb) - gb); + double a2 = 0.; + + double b0 = (((kb * wb) * (gb * gb)) + gb); + double b1 = (((kb * wb) * (gb * gb)) - gb); + double b2 = 0.; + + double ra0 = (1. / a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 2); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 2); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 2); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 2); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 2); +} + + +static void aymo_(apply_tr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_tr_to_db[chip->reg_tr & 0x0Fu]; + double gt = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcd = 10.; // [Hz] + double wd = ((2. * pi) * fcd); + double kd = ((chip->reg_sf & 0x40u) ? 0. : (tan(wd * (.5 / fs)) / wd)); + double fct = 4500.; // [Hz] + double wt = ((2. * pi) * fct); + double kt = (tan(wt * (.5 / fs)) / wt); + + double a0 = (((gt * kt * wt) * (kd * wd)) + ((gt * kt * wt) + (kd * wd)) + 1.); + double a1 = (((gt * kt * wt) * (kd * wd) * 2.) - 2.); + double a2 = (((gt * kt * wt) * (kd * wd)) - ((gt * kt * wt) + (kd * wd)) + 1.); + + double b0 = ((gt * gt) + (gt * kt * wt)); + double b1 = ((gt * gt) * -2.); + double b2 = ((gt * gt) - (gt * kt * wt)); + + double ra0 = (1. 
/ a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 1); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 1); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 1); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 1); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 1); +} + + +static void aymo_(apply_source_mode)(struct aymo_(chip)* chip) +{ + // Default mute + vf32x2_t klr = vdup_n_f32(0.f); + vf32x2_t krl = vdup_n_f32(0.f); + + uint8_t source = (chip->reg_sf & 0x07u); + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Forced mono + if (mode == 0x00u) { // process + switch (source) { + // Channel 1 + case 0x02u: + case 0x04u: + case 0x06u: { + klr = vdup_n_f32(1.f); + krl = vdup_n_f32(1.f); + break; + } + } + } + else { // not forced mono + switch (source) { + // Channel 1 + case 0x02u: { // mono left + klr = vset_lane_f32(1.f, klr, 0); + krl = vset_lane_f32(1.f, krl, 1); + break; + } + case 0x04u: { // mono right + klr = vset_lane_f32(1.f, klr, 1); + krl = vset_lane_f32(1.f, krl, 0); + break; + } + case 0x06u: { // stereo + klr = vdup_n_f32(1.f); + krl = vdup_n_f32(0.f); + break; + } + default: { + if (mode == 0x03u) { // spatial stereo + mode = 0x02u; // force linear stereo (mute) + } + break; + } + } + + // Spatial stereo + if (mode == 0x03u) { // process + const float xt = .52f; // cross-talk + vf32x2_t kx = vdup_n_f32(xt); + klr = vadd_f32(klr, kx); + krl = vsub_f32(krl, kx); + } + } // not forced mono + + chip->klr = klr; + chip->krl = krl; +} + + +static void aymo_(apply_pseudo)(struct aymo_(chip)* chip) +{ + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Pseudo stereo + if (mode == 0x02u) { // enabled + double c1 = (double)chip->pseudo_c1; + double c2 = (double)chip->pseudo_c2; + double r1 = 15000.; // [ohm] + double r2 = 15000.; // [ohm] + double t1 = (c1 * r1); + double t2 = (c2 * r2); + + double fs = (double)chip->sample_rate; + double k = (.5 / fs); + double kk = (k * k); + double t1_t2 = (t1 * t2); + double t1_t2_k = ((t1 + t2) * k); + + double a0 = (kk + t1_t2 + t1_t2_k); + double a1 = ((kk - t1_t2) * 2.); + double a2 = (kk + t1_t2 - t1_t2_k); + + double b0 = a2; + double b1 = a1; + double b2 = a0; + + double ra0 = (1. / a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 0); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 0); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 0); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 0); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 0); + } + else { // pass-through + chip->kb0 = vsetq_lane_f32(1.f, chip->kb0, 0); + chip->kb1 = vsetq_lane_f32(.0f, chip->kb1, 0); + chip->kb2 = vsetq_lane_f32(.0f, chip->kb2, 0); + + chip->ka1 = vsetq_lane_f32(.0f, chip->ka1, 0); + chip->ka2 = vsetq_lane_f32(.0f, chip->ka2, 0); + } +} + + +static void aymo_(apply_tfilter)(struct aymo_(chip)* chip) +{ + // T-filter + if (chip->reg_sf & 0x80u) { // pass-through + chip->kb0 = vsetq_lane_f32(1.f, chip->kb0, 3); + chip->kb1 = vsetq_lane_f32(.0f, chip->kb1, 3); + chip->kb2 = vsetq_lane_f32(.0f, chip->kb2, 3); + + chip->ka1 = vsetq_lane_f32(.0f, chip->ka1, 3); + chip->ka2 = vsetq_lane_f32(.0f, chip->ka2, 3); + } + else { // enabled + double db = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double g = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fc = 180.; // [Hz] + double w = ((2. 
* pi) * fc); + double k = (tan(w * (.5 / fs)) / w); + + double log10_g = log10(g); + double ang = (log10_g * .85); + double abs_sqrt_log10_g = sqrt(fabs(log10_g)); + double abs2_sqrt_log10_g = abs_sqrt_log10_g * abs_sqrt_log10_g; + double kw = (k * w); + double m_k2w2 = ((kw * kw) * -.05); + double sqrt_5 = 2.23606797749978980505147774238139391; + double ph = (pi * .75); + double h_sqrt_5_kw_abs_sqrt_log10_g = ((sqrt_5 * .2) * kw * abs_sqrt_log10_g); + double cosm = cos(ang - ph); + double cosp = cos(ang + ph); + + double a0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + double a1 = (((m_k2w2 + abs2_sqrt_log10_g)) * 2.); + double a2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + + double b0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + double b1 = a1; + double b2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + + double ra0 = (1. / a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 3); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 3); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 3); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 3); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 3); + } +} + + +static void aymo_(apply_pp)(struct aymo_(chip)* chip) +{ + uint8_t pseudo_preset = (chip->reg_pp & 0x03u); + if (pseudo_preset >= 3u) { + pseudo_preset = 0u; + } + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[pseudo_preset]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[pseudo_preset]; +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + switch (address) { + case 0x00u: { + return chip->reg_vl; + } + case 0x01u: { + return chip->reg_vr; + } + case 0x02u: { + return chip->reg_ba; + } + case 0x03u: { + return chip->reg_tr; + } + case 0x07u: { + return chip->reg_pp; + } + case 0x08u: { + return chip->reg_sf; + } + default: { + return 0xFFu; + } + } +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + switch (address) { + case 0x00u: { // VL + value |= 0xC0u; + chip->reg_vl = value; + aymo_(apply_vl)(chip); + break; + } + case 0x01u: { // VR + value |= 0xC0u; + chip->reg_vr = value; + aymo_(apply_vr)(chip); + break; + } + case 0x02u: { // BA + value |= 0xF0u; + chip->reg_ba = value; + aymo_(apply_ba)(chip); + break; + } + case 0x03u: { // TR + value |= 0xF0u; + chip->reg_tr = value; + aymo_(apply_tr)(chip); + break; + } + case 0x07u: { // PP + value |= 0xFCu; + chip->reg_pp = value; + aymo_(apply_pp)(chip); + aymo_(apply_pseudo)(chip); + break; + } + case 0x08u: { // SF + chip->reg_sf = value; + aymo_(apply_source_mode)(chip); + aymo_(apply_pseudo)(chip); + aymo_(apply_tfilter)(chip); + aymo_(apply_vl)(chip); + aymo_(apply_vr)(chip); + aymo_(apply_tr)(chip); + break; + } + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + vf32x4_t b2l = chip->hb1l; + vf32x4_t b2r = chip->hb1r; + vf32x4_t a2l = chip->ha1l; + vf32x4_t a2r = chip->ha1r; + + const float* xe = &x[count * 2u]; + + while AYMO_LIKELY(x != xe) { + vf32x4_t y2l = vaddq_f32(vmulq_f32(b2l, chip->kb2), vmulq_f32(a2l, chip->ka2)); + vf32x4_t y2r = vaddq_f32(vmulq_f32(b2r, chip->kb2), vmulq_f32(a2r, chip->ka2)); + chip->hb2l = b2l; + chip->hb2r = b2r; + chip->ha2l = a2l; + chip->ha2r = a2r; + + vf32x4_t b1l = chip->hb0l; + vf32x4_t b1r = chip->hb0r; + 
vf32x4_t a1l = chip->ha0l; + vf32x4_t a1r = chip->ha0r; + vf32x4_t y1l = vaddq_f32(vmulq_f32(b1l, chip->kb1), vmulq_f32(a1l, chip->ka1)); + vf32x4_t y1r = vaddq_f32(vmulq_f32(b1r, chip->kb1), vmulq_f32(a1r, chip->ka1)); + chip->hb1l = b1l; + chip->hb1r = b1r; + chip->ha1l = a1l; + chip->ha1r = a1r; + + vf32x4_t yyl = vaddq_f32(y2l, y1l); + vf32x4_t yyr = vaddq_f32(y2r, y1r); + + vf32x2_t xlr = vld1_f32(x); x += 2u; + vf32x2_t xrl = vrev64_f32(xlr); + vf32x2_t wx = vadd_f32(vmul_f32(xlr, chip->klr), vmul_f32(xrl, chip->krl)); + vf32x4_t xx = vcombine_f32(wx, wx); + + vf32x4_t xl = vrev64q_f32(xx); + vf32x4_t b0l = vextq_f32(xl, a1l, 3); + vf32x4_t b0r = vextq_f32(xx, a1r, 3); + yyl = vaddq_f32(yyl, vmulq_f32(b0l, chip->kb0)); + yyr = vaddq_f32(yyr, vmulq_f32(b0r, chip->kb0)); + chip->hb0l = b0l; + chip->hb0r = b0r; + + chip->ha0l = yyl; + chip->ha0r = yyr; + + vf32x2_t ylh = vget_high_f32(yyl); + vf32x2_t yrh = vget_high_f32(yyr); + vf32x2_t yy = vext_f32(ylh, vrev64_f32(yrh), 1); + yy = vmul_f32(yy, chip->kv); + + b2l = chip->hb1l; + b2r = chip->hb1r; + a2l = chip->ha1l; + a2r = chip->ha1r; + + vst1_f32(y, yy); y += 2u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_tda8425_common.c b/src/aymo_tda8425_common.c new file mode 100644 index 0000000..0745270 --- /dev/null +++ b/src/aymo_tda8425_common.c @@ -0,0 +1,150 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_tda8425_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +const int8_t aymo_tda8425_reg_v_to_db[64] = +{ + -90, // 0 + -90, // 1 + -90, // 2 + -90, // 3 + -90, // 4 + -90, // 5 + -90, // 6 + -90, // 7 + -90, // 8 + -90, // 9 + -90, // 10 + -90, // 11 + -90, // 12 + -90, // 13 + -90, // 14 + -90, // 15 + -90, // 16 + -90, // 17 + -90, // 18 + -90, // 19 + -90, // 20 + -90, // 21 + -90, // 22 + -90, // 23 + -90, // 24 + -90, // 25 + -90, // 26 + -90, // 27 + -64, // 28 + -62, // 29 + -60, // 30 + -58, // 31 + -56, // 32 + -54, // 33 + -52, // 34 + -50, // 35 + -48, // 36 + -46, // 37 + -44, // 38 + -42, // 39 + -40, // 40 + -38, // 41 + -36, // 42 + -34, // 43 + -32, // 44 + -30, // 45 + -28, // 46 + -26, // 47 + -24, // 48 + -22, // 49 + -20, // 50 + -18, // 51 + -16, // 52 + -14, // 53 + -12, // 54 + -10, // 55 + - 8, // 56 + - 6, // 57 + - 4, // 58 + - 2, // 59 + + 0, // 60 + + 2, // 61 + + 4, // 62 + + 6 // 63 +}; + +const int8_t aymo_tda8425_reg_ba_to_db[16] = +{ + -12, // 0 + -12, // 1 + -12, // 2 + - 9, // 3 + - 6, // 4 + - 3, // 5 + + 0, // 6 + + 3, // 7 + + 6, // 8 + + 9, // 9 + +12, // 10 + +15, // 11 + +15, // 12 + +15, // 13 + +15, // 14 + +15 // 15 +}; + +const int8_t aymo_tda8425_reg_tr_to_db[16] = +{ + -12, // 0 + -12, // 1 + -12, // 2 + - 9, // 3 + - 6, // 4 + - 3, // 5 + + 0, // 6 + + 3, // 7 + + 6, // 8 + + 9, // 9 + +12, // 10 + +12, // 11 + +12, // 12 + +12, // 13 + +12, // 14 + +12 // 15 +}; + + +const float aymo_tda8425_pseudo_preset_c1[3] = +{ + 15.e-9f, + 5.6e-9f, + 5.6e-9f +}; + +const float aymo_tda8425_pseudo_preset_c2[3] = +{ + 15.e-9f, + 47.e-9f, + 68.e-9f +}; + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_tda8425_none.c b/src/aymo_tda8425_none.c new file mode 100644 index 0000000..44f4fd2 --- /dev/null +++ b/src/aymo_tda8425_none.c @@ -0,0 +1,148 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_cpu.h" + +#include "aymo_tda8425_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_none.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > .0f); + + for (int i = 0; i < AYMO_(DELAY); ++i) { + chip->yh[i][0] = .0f; + chip->yh[i][1] = .0f; + } + + TDA8425_Chip* emu = &chip->emu; + TDA8425_Chip_Ctor(emu); + + TDA8425_Chip_Setup( + emu, + (TDA8425_Float)sample_rate, + (TDA8425_Float)TDA8425_Pseudo_C1_Table[TDA8425_Pseudo_Preset_1], + (TDA8425_Float)TDA8425_Pseudo_C2_Table[TDA8425_Pseudo_Preset_1], + TDA8425_Tfilter_Mode_Disabled + ); + + TDA8425_Chip_Reset(emu); + + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_VL, 0xFCu); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_VR, 0xFCu); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_BA, 0xF6u); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_TR, 0xF6u); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_SF, 0xCEu); + + TDA8425_Chip_Start(emu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address <= (uint16_t)TDA8425_Reg_SF) { + return TDA8425_Chip_Read(&chip->emu, (TDA8425_Address)address); + } + return 0xFFu; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)TDA8425_Reg_SF) { + TDA8425_Chip_Write(&chip->emu, (TDA8425_Address)address, value); + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + TDA8425_Chip* emu = &chip->emu; + TDA8425_Chip_Process_Data data; + data.inputs[TDA8425_Source_2][TDA8425_Stereo_L] = (TDA8425_Float)0.f; + data.inputs[TDA8425_Source_2][TDA8425_Stereo_R] = (TDA8425_Float)0.f; + + while (count--) { + data.inputs[TDA8425_Source_1][TDA8425_Stereo_L] = (TDA8425_Float)*x++; + data.inputs[TDA8425_Source_1][TDA8425_Stereo_R] = (TDA8425_Float)*x++; + + TDA8425_Chip_Process(emu, &data); + + for (int i = (AYMO_(DELAY) - 1); i > 0; --i) { + chip->yh[i][0] = chip->yh[i-1][0]; + chip->yh[i][1] = chip->yh[i-1][1]; + } + chip->yh[0][0] = (float)data.outputs[TDA8425_Stereo_L]; + chip->yh[0][1] = (float)data.outputs[TDA8425_Stereo_R]; + + *y++ = chip->yh[AYMO_(DELAY)-1][0]; + *y++ = chip->yh[AYMO_(DELAY)-1][1]; + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_tda8425_x86_avx2.c b/src/aymo_tda8425_x86_avx2.c new file mode 100644 index 0000000..769e63f --- /dev/null +++ b/src/aymo_tda8425_x86_avx2.c @@ -0,0 +1,499 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_tda8425.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_avx2.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +#undef cos +#undef fabs +#undef log10 +#undef pow +#undef sqrt +#undef tan + +#define cos (aymo_tda8425_math->cos) +#define fabs (aymo_tda8425_math->fabs) +#define log10 (aymo_tda8425_math->log10) +#define pow (aymo_tda8425_math->pow) +#define sqrt (aymo_tda8425_math->sqrt) +#define tan (aymo_tda8425_math->tan) + + +#undef mm256_alignr_ps +#define mm256_alignr_ps(a, b, imm8) \ + (_mm256_castsi256_ps(_mm256_alignr_epi8(_mm256_castps_si256(a), _mm256_castps_si256(b), ((imm8) * 4)))) + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > 0.f); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Setup default parameters + chip->sample_rate = sample_rate; + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[0]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[0]; + + // Setup default registers + aymo_(write)(chip, 0x00u, 0xFCu); // VL: 0 dB + aymo_(write)(chip, 0x01u, 0xFCu); // VR: 0 dB + aymo_(write)(chip, 0x02u, 0xF6u); // BA: 0 dB + aymo_(write)(chip, 0x03u, 0xF6u); // TR: 0 dB + aymo_(write)(chip, 0x07u, 0xFCu); // PP: light pseudo + aymo_(write)(chip, 0x08u, 0xCEu); // SF: linear stereo, channel 1, unmuted +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +static void aymo_(apply_vl)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vl & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + vf32x4_t kvlo = _mm_set_ps((float)g, .0f, .0f, .0f); + chip->kv = _mm256_insertf128_ps(chip->kv, kvlo, 0); +} + + +static void aymo_(apply_vr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vr & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + vf32x4_t kvhi = _mm_set_ps((float)g, .0f, .0f, .0f); + chip->kv = _mm256_insertf128_ps(chip->kv, kvhi, 1); +} + + +static void aymo_(apply_ba)(struct aymo_(chip)* chip) +{ + double dbb = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double gb = pow(10., (dbb * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + 
double fcb = 300.; // [Hz] + double wb = ((2. * pi) * fcb); + double kb = (tan(wb * (.5 / fs)) / wb); + + double a0 = ((kb * wb) + gb); + double a1 = ((kb * wb) - gb); + double a2 = 0.; + + double b0 = (((kb * wb) * (gb * gb)) + gb); + double b1 = (((kb * wb) * (gb * gb)) - gb); + double b2 = 0.; + + double ra0 = (1. / a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x44); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x44); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x44); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x44); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x44); +} + + +static void aymo_(apply_tr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_tr_to_db[chip->reg_tr & 0x0Fu]; + double gt = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcd = 10.; // [Hz] + double wd = ((2. * pi) * fcd); + double kd = ((chip->reg_sf & 0x40u) ? 0. : (tan(wd * (.5 / fs)) / wd)); + double fct = 4500.; // [Hz] + double wt = ((2. * pi) * fct); + double kt = (tan(wt * (.5 / fs)) / wt); + + double a0 = (((gt * kt * wt) * (kd * wd)) + ((gt * kt * wt) + (kd * wd)) + 1.); + double a1 = (((gt * kt * wt) * (kd * wd) * 2.) - 2.); + double a2 = (((gt * kt * wt) * (kd * wd)) - ((gt * kt * wt) + (kd * wd)) + 1.); + + double b0 = ((gt * gt) + (gt * kt * wt)); + double b1 = ((gt * gt) * -2.); + double b2 = ((gt * gt) - (gt * kt * wt)); + + double ra0 = (1. / a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x22); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x22); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x22); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x22); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x22); +} + + +static void aymo_(apply_source_mode)(struct aymo_(chip)* chip) +{ + // Default mute + vf32x8_t klr = _mm256_setzero_ps(); + vf32x8_t krl = _mm256_setzero_ps(); + + uint8_t source = (chip->reg_sf & 0x07u); + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Forced mono + if (mode == 0x00u) { // process + switch (source) { + // Channel 1 + case 0x02u: + case 0x04u: + case 0x06u: { + klr = _mm256_set_ps(1.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + krl = _mm256_set_ps(1.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + break; + } + } + } + else { // not forced mono + switch (source) { + // Channel 1 + case 0x02u: { // mono left + klr = _mm256_set_ps(0.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + krl = _mm256_set_ps(1.f, .0f, .0f, .0f, 0.f, .0f, .0f, .0f); + break; + } + case 0x04u: { // mono right + klr = _mm256_set_ps(1.f, .0f, .0f, .0f, 0.f, .0f, .0f, .0f); + krl = _mm256_set_ps(0.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + break; + } + case 0x06u: { // stereo + klr = _mm256_set_ps(1.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + krl = _mm256_set_ps(0.f, .0f, .0f, .0f, 0.f, .0f, .0f, .0f); + break; + } + default: { + if (mode == 0x03u) { // spatial stereo + mode = 0x02u; // force linear stereo (mute) + } + break; + } + } + + // Spatial stereo + if (mode == 0x03u) { // process + const float xt = .52f; // cross-talk + __m256 kx = _mm256_set_ps(xt, .0f, .0f, .0f, xt, .0f, .0f, .0f); + klr = _mm256_add_ps(klr, kx); + krl = _mm256_sub_ps(krl, kx); + } + } // not forced mono + + 
chip->klr = klr; + chip->krl = krl; +} + + +static void aymo_(apply_pseudo)(struct aymo_(chip)* chip) +{ + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Pseudo stereo + if (mode == 0x02u) { // enabled + double c1 = (double)chip->pseudo_c1; + double c2 = (double)chip->pseudo_c2; + double r1 = 15000.; // [ohm] + double r2 = 15000.; // [ohm] + double t1 = (c1 * r1); + double t2 = (c2 * r2); + + double fs = (double)chip->sample_rate; + double k = (.5 / fs); + double kk = (k * k); + double t1_t2 = (t1 * t2); + double t1_t2_k = ((t1 + t2) * k); + + double a0 = (kk + t1_t2 + t1_t2_k); + double a1 = ((kk - t1_t2) * 2.); + double a2 = (kk + t1_t2 - t1_t2_k); + + double b0 = a2; + double b1 = a1; + double b2 = a0; + + double ra0 = (1. / a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x11); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x11); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x11); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x11); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x11); + } + else { // pass-through + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps(1.f), 0x11); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps(.0f), 0x11); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps(.0f), 0x11); + + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps(.0f), 0x11); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps(.0f), 0x11); + } +} + + +static void aymo_(apply_tfilter)(struct aymo_(chip)* chip) +{ + // T-filter + if (chip->reg_sf & 0x80u) { // pass-through + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps(1.f), 0x88); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps(.0f), 0x88); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps(.0f), 0x88); + + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps(.0f), 0x88); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps(.0f), 0x88); + } + else { // enabled + double db = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double g = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fc = 180.; // [Hz] + double w = ((2. * pi) * fc); + double k = (tan(w * (.5 / fs)) / w); + + double log10_g = log10(g); + double ang = (log10_g * .85); + double abs_sqrt_log10_g = sqrt(fabs(log10_g)); + double abs2_sqrt_log10_g = abs_sqrt_log10_g * abs_sqrt_log10_g; + double kw = (k * w); + double m_k2w2 = ((kw * kw) * -.05); + double sqrt_5 = 2.23606797749978980505147774238139391; + double ph = (pi * .75); + double h_sqrt_5_kw_abs_sqrt_log10_g = ((sqrt_5 * .2) * kw * abs_sqrt_log10_g); + double cosm = cos(ang - ph); + double cosp = cos(ang + ph); + + double a0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + double a1 = (((m_k2w2 + abs2_sqrt_log10_g)) * 2.); + double a2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + + double b0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + double b1 = a1; + double b2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + + double ra0 = (1. 
/ a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x88); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x88); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x88); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x88); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x88); + } +} + + +static void aymo_(apply_pp)(struct aymo_(chip)* chip) +{ + uint8_t pseudo_preset = (chip->reg_pp & 0x03u); + if (pseudo_preset >= 3u) { + pseudo_preset = 0u; + } + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[pseudo_preset]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[pseudo_preset]; +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + switch (address) { + case 0x00u: { + return chip->reg_vl; + } + case 0x01u: { + return chip->reg_vr; + } + case 0x02u: { + return chip->reg_ba; + } + case 0x03u: { + return chip->reg_tr; + } + case 0x07u: { + return chip->reg_pp; + } + case 0x08u: { + return chip->reg_sf; + } + default: { + return 0xFFu; + } + } +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + switch (address) { + case 0x00u: { // VL + value |= 0xC0u; + chip->reg_vl = value; + aymo_(apply_vl)(chip); + break; + } + case 0x01u: { // VR + value |= 0xC0u; + chip->reg_vr = value; + aymo_(apply_vr)(chip); + break; + } + case 0x02u: { // BA + value |= 0xF0u; + chip->reg_ba = value; + aymo_(apply_ba)(chip); + break; + } + case 0x03u: { // TR + value |= 0xF0u; + chip->reg_tr = value; + aymo_(apply_tr)(chip); + break; + } + case 0x07u: { // PP + value |= 0xFCu; + chip->reg_pp = value; + aymo_(apply_pp)(chip); + aymo_(apply_pseudo)(chip); + break; + } + case 0x08u: { // SF + chip->reg_sf = value; + aymo_(apply_source_mode)(chip); + aymo_(apply_pseudo)(chip); + aymo_(apply_tfilter)(chip); + aymo_(apply_vl)(chip); + aymo_(apply_vr)(chip); + aymo_(apply_tr)(chip); + break; + } + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + float AYMO_ALIGN_V256 xlrv[8]; + float AYMO_ALIGN_V256 xrlv[8]; + float AYMO_ALIGN_V256 yyv[8]; + + vf32x8_t b2 = chip->hb1; + vf32x8_t a2 = chip->ha1; + + const float* xe = &x[count * 2u]; + + while AYMO_LIKELY(x != xe) { + vf32x8_t y2 = _mm256_add_ps(_mm256_mul_ps(b2, chip->kb2), _mm256_mul_ps(a2, chip->ka2)); + chip->hb2 = b2; + chip->ha2 = a2; + + vf32x8_t b1 = chip->hb0; + vf32x8_t a1 = chip->ha0; + vf32x8_t y1 = _mm256_add_ps(_mm256_mul_ps(b1, chip->kb1), _mm256_mul_ps(a1, chip->ka1)); + chip->hb1 = b1; + chip->ha1 = a1; + + vf32x8_t yy = _mm256_add_ps(y2, y1); + + xrlv[7] = xlrv[3] = *x++; + xrlv[3] = xlrv[7] = *x++; + _mm_sfence(); + vf32x8_t xlr = _mm256_load_ps(xlrv); + vf32x8_t xrl = _mm256_load_ps(xrlv); + vf32x8_t xx = _mm256_add_ps(_mm256_mul_ps(xlr, chip->klr), _mm256_mul_ps(xrl, chip->krl)); + + vf32x8_t b0 = mm256_alignr_ps(chip->ha0, xx, 3); + yy = _mm256_add_ps(yy, _mm256_mul_ps(b0, chip->kb0)); + chip->hb0 = b0; + + chip->ha0 = yy; + + yy = _mm256_mul_ps(yy, chip->kv); + _mm256_store_ps(yyv, yy); + + b2 = chip->hb1; + a2 = chip->ha1; + + _mm_sfence(); + *y++ = yyv[3]; + *y++ = yyv[7]; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/src/aymo_tda8425_x86_sse41.c b/src/aymo_tda8425_x86_sse41.c new file mode 100644 index 0000000..ebfb44e --- /dev/null +++ 
b/src/aymo_tda8425_x86_sse41.c @@ -0,0 +1,512 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_tda8425.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +#undef cos +#undef fabs +#undef log10 +#undef pow +#undef sqrt +#undef tan + +#define cos (aymo_tda8425_math->cos) +#define fabs (aymo_tda8425_math->fabs) +#define log10 (aymo_tda8425_math->log10) +#define pow (aymo_tda8425_math->pow) +#define sqrt (aymo_tda8425_math->sqrt) +#define tan (aymo_tda8425_math->tan) + + +#undef mm_insert_ps +#define mm_insert_ps(a, b, imm8) \ + (_mm_blend_ps((a), _mm_set1_ps(b), (1 << (imm8)))) + + +#undef mm_alignr_ps +#define mm_alignr_ps(a, b, imm8) \ + (_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), ((imm8) * 4)))) + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > 0.f); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Setup default parameters + chip->sample_rate = sample_rate; + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[0]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[0]; + + // Setup default registers + aymo_(write)(chip, 0x00u, 0xFCu); // VL: 0 dB + aymo_(write)(chip, 0x01u, 0xFCu); // VR: 0 dB + aymo_(write)(chip, 0x02u, 0xF6u); // BA: 0 dB + aymo_(write)(chip, 0x03u, 0xF6u); // TR: 0 dB + aymo_(write)(chip, 0x07u, 0xFCu); // PP: light pseudo + aymo_(write)(chip, 0x08u, 0xCEu); // SF: linear stereo, channel 1, unmuted +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +static void aymo_(apply_vl)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vl & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = mm_insert_ps(chip->kv, (float)g, 2); +} + + +static void aymo_(apply_vr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vr & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = mm_insert_ps(chip->kv, (float)g, 3); +} + + +static void aymo_(apply_ba)(struct aymo_(chip)* chip) +{ + double dbb = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + 
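// Bass control: a first-order low shelf cornered around 300 Hz, obtained from
// the analog prototype through a prewarped bilinear transform (kb*wb equals
// tan(wb / (2*fs))).  The resulting response has gain gb*gb = 10^(dB/20) at DC
// and unity gain at Nyquist.  As in the other sections, the coefficients are
// stored pre-divided by a0, with ka1/ka2 negated, so the sample loop only
// needs multiply-add operations.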
double gb = pow(10., (dbb * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcb = 300.; // [Hz] + double wb = ((2. * pi) * fcb); + double kb = (tan(wb * (.5 / fs)) / wb); + + double a0 = ((kb * wb) + gb); + double a1 = ((kb * wb) - gb); + double a2 = 0.; + + double b0 = (((kb * wb) * (gb * gb)) + gb); + double b1 = (((kb * wb) * (gb * gb)) - gb); + double b2 = 0.; + + double ra0 = (1. / a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 2); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 2); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 2); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 2); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 2); +} + + +static void aymo_(apply_tr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_tr_to_db[chip->reg_tr & 0x0Fu]; + double gt = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcd = 10.; // [Hz] + double wd = ((2. * pi) * fcd); + double kd = ((chip->reg_sf & 0x40u) ? 0. : (tan(wd * (.5 / fs)) / wd)); + double fct = 4500.; // [Hz] + double wt = ((2. * pi) * fct); + double kt = (tan(wt * (.5 / fs)) / wt); + + double a0 = (((gt * kt * wt) * (kd * wd)) + ((gt * kt * wt) + (kd * wd)) + 1.); + double a1 = (((gt * kt * wt) * (kd * wd) * 2.) - 2.); + double a2 = (((gt * kt * wt) * (kd * wd)) - ((gt * kt * wt) + (kd * wd)) + 1.); + + double b0 = ((gt * gt) + (gt * kt * wt)); + double b1 = ((gt * gt) * -2.); + double b2 = ((gt * gt) - (gt * kt * wt)); + + double ra0 = (1. / a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 1); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 1); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 1); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 1); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 1); +} + + +static void aymo_(apply_source_mode)(struct aymo_(chip)* chip) +{ + // Default mute + vf32x4_t klr = _mm_setzero_ps(); + vf32x4_t krl = _mm_setzero_ps(); + + uint8_t source = (chip->reg_sf & 0x07u); + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Forced mono + if (mode == 0x00u) { // process + switch (source) { + // Channel 1 + case 0x02u: + case 0x04u: + case 0x06u: { + klr = _mm_set_ps(1.f, 1.f, .0f, .0f); + krl = _mm_set_ps(1.f, 1.f, .0f, .0f); + break; + } + } + } + else { // not forced mono + switch (source) { + // Channel 1 + case 0x02u: { // mono left + klr = _mm_set_ps(0.f, 1.f, .0f, .0f); + krl = _mm_set_ps(1.f, 0.f, .0f, .0f); + break; + } + case 0x04u: { // mono right + klr = _mm_set_ps(1.f, 0.f, .0f, .0f); + krl = _mm_set_ps(0.f, 1.f, .0f, .0f); + break; + } + case 0x06u: { // stereo + klr = _mm_set_ps(1.f, 1.f, .0f, .0f); + krl = _mm_set_ps(0.f, 0.f, .0f, .0f); + break; + } + default: { + if (mode == 0x03u) { // spatial stereo + mode = 0x02u; // force linear stereo (mute) + } + break; + } + } + + // Spatial stereo + if (mode == 0x03u) { // process + const float xt = .52f; // cross-talk + vf32x4_t kx = _mm_set_ps(xt, xt, .0f, .0f); + klr = _mm_add_ps(klr, kx); + krl = _mm_sub_ps(krl, kx); + } + } // not forced mono + + chip->klr = klr; + chip->krl = krl; +} + + +static void aymo_(apply_pseudo)(struct aymo_(chip)* chip) +{ + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Pseudo stereo + if (mode == 0x02u) { // enabled + double c1 = (double)chip->pseudo_c1; + double c2 = (double)chip->pseudo_c2; + double r1 
= 15000.; // [ohm] + double r2 = 15000.; // [ohm] + double t1 = (c1 * r1); + double t2 = (c2 * r2); + + double fs = (double)chip->sample_rate; + double k = (.5 / fs); + double kk = (k * k); + double t1_t2 = (t1 * t2); + double t1_t2_k = ((t1 + t2) * k); + + double a0 = (kk + t1_t2 + t1_t2_k); + double a1 = ((kk - t1_t2) * 2.); + double a2 = (kk + t1_t2 - t1_t2_k); + + double b0 = a2; + double b1 = a1; + double b2 = a0; + + double ra0 = (1. / a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 0); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 0); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 0); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 0); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 0); + } + else { // pass-through + chip->kb0 = mm_insert_ps(chip->kb0, 1.f, 0); + chip->kb1 = mm_insert_ps(chip->kb1, .0f, 0); + chip->kb2 = mm_insert_ps(chip->kb2, .0f, 0); + + chip->ka1 = mm_insert_ps(chip->ka1, .0f, 0); + chip->ka2 = mm_insert_ps(chip->ka2, .0f, 0); + } +} + + +static void aymo_(apply_tfilter)(struct aymo_(chip)* chip) +{ + // T-filter + if (chip->reg_sf & 0x80u) { // pass-through + chip->kb0 = mm_insert_ps(chip->kb0, 1.f, 3); + chip->kb1 = mm_insert_ps(chip->kb1, .0f, 3); + chip->kb2 = mm_insert_ps(chip->kb2, .0f, 3); + + chip->ka1 = mm_insert_ps(chip->ka1, .0f, 3); + chip->ka2 = mm_insert_ps(chip->ka2, .0f, 3); + } + else { // enabled + double db = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double g = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fc = 180.; // [Hz] + double w = ((2. * pi) * fc); + double k = (tan(w * (.5 / fs)) / w); + + double log10_g = log10(g); + double ang = (log10_g * .85); + double abs_sqrt_log10_g = sqrt(fabs(log10_g)); + double abs2_sqrt_log10_g = abs_sqrt_log10_g * abs_sqrt_log10_g; + double kw = (k * w); + double m_k2w2 = ((kw * kw) * -.05); + double sqrt_5 = 2.23606797749978980505147774238139391; + double ph = (pi * .75); + double h_sqrt_5_kw_abs_sqrt_log10_g = ((sqrt_5 * .2) * kw * abs_sqrt_log10_g); + double cosm = cos(ang - ph); + double cosp = cos(ang + ph); + + double a0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + double a1 = (((m_k2w2 + abs2_sqrt_log10_g)) * 2.); + double a2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + + double b0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + double b1 = a1; + double b2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + + double ra0 = (1. 
/ a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 3); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 3); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 3); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 3); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 3); + } +} + + +static void aymo_(apply_pp)(struct aymo_(chip)* chip) +{ + uint8_t pseudo_preset = (chip->reg_pp & 0x03u); + if (pseudo_preset >= 3u) { + pseudo_preset = 0u; + } + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[pseudo_preset]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[pseudo_preset]; +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + switch (address) { + case 0x00u: { + return chip->reg_vl; + } + case 0x01u: { + return chip->reg_vr; + } + case 0x02u: { + return chip->reg_ba; + } + case 0x03u: { + return chip->reg_tr; + } + case 0x07u: { + return chip->reg_pp; + } + case 0x08u: { + return chip->reg_sf; + } + default: { + return 0xFFu; + } + } +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + switch (address) { + case 0x00u: { // VL + value |= 0xC0u; + chip->reg_vl = value; + aymo_(apply_vl)(chip); + break; + } + case 0x01u: { // VR + value |= 0xC0u; + chip->reg_vr = value; + aymo_(apply_vr)(chip); + break; + } + case 0x02u: { // BA + value |= 0xF0u; + chip->reg_ba = value; + aymo_(apply_ba)(chip); + break; + } + case 0x03u: { // TR + value |= 0xF0u; + chip->reg_tr = value; + aymo_(apply_tr)(chip); + break; + } + case 0x07u: { // PP + value |= 0xFCu; + chip->reg_pp = value; + aymo_(apply_pp)(chip); + aymo_(apply_pseudo)(chip); + break; + } + case 0x08u: { // SF + chip->reg_sf = value; + aymo_(apply_source_mode)(chip); + aymo_(apply_pseudo)(chip); + aymo_(apply_tfilter)(chip); + aymo_(apply_vl)(chip); + aymo_(apply_vr)(chip); + aymo_(apply_tr)(chip); + break; + } + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + vf32x4_t b2l = chip->hb1l; + vf32x4_t b2r = chip->hb1r; + vf32x4_t a2l = chip->ha1l; + vf32x4_t a2r = chip->ha1r; + + const float* xe = &x[count * 2u]; + + while AYMO_LIKELY(x != xe) { + vf32x4_t y2l = _mm_add_ps(_mm_mul_ps(b2l, chip->kb2), _mm_mul_ps(a2l, chip->ka2)); + vf32x4_t y2r = _mm_add_ps(_mm_mul_ps(b2r, chip->kb2), _mm_mul_ps(a2r, chip->ka2)); + chip->hb2l = b2l; + chip->hb2r = b2r; + chip->ha2l = a2l; + chip->ha2r = a2r; + + vf32x4_t b1l = chip->hb0l; + vf32x4_t b1r = chip->hb0r; + vf32x4_t a1l = chip->ha0l; + vf32x4_t a1r = chip->ha0r; + vf32x4_t y1l = _mm_add_ps(_mm_mul_ps(b1l, chip->kb1), _mm_mul_ps(a1l, chip->ka1)); + vf32x4_t y1r = _mm_add_ps(_mm_mul_ps(b1r, chip->kb1), _mm_mul_ps(a1r, chip->ka1)); + chip->hb1l = b1l; + chip->hb1r = b1r; + chip->ha1l = a1l; + chip->ha1r = a1r; + + vf32x4_t yyl = _mm_add_ps(y2l, y1l); + vf32x4_t yyr = _mm_add_ps(y2r, y1r); + + vf32x4_t xlr = _mm_loadh_pi(_mm_undefined_ps(), (const void*)x); x += 2u; + vf32x4_t xrl = _mm_shuffle_ps(xlr, xlr, _MM_SHUFFLE(2, 3, 0, 1)); // "23.." + vf32x4_t xx = _mm_add_ps(_mm_mul_ps(xlr, chip->klr), _mm_mul_ps(xrl, chip->krl)); + + vf32x4_t xl = _mm_shuffle_ps(xx, xx, _MM_SHUFFLE(2, 3, 0, 1)); // "2..." 
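// Each SSE lane runs one filter section in Direct Form I:
//   y[n] = kb0*x[n] + kb1*x[n-1] + kb2*x[n-2] + ka1*y[n-1] + ka2*y[n-2]
// with every coefficient pre-divided by a0 and ka1/ka2 already negated.
// Lane 0 holds the pseudo-stereo all-pass (or a pass-through), lane 1 the
// treble/DC-blocking section, lane 2 the bass shelf, and lane 3 the T-filter.
// The alignr below shifts each section's previous output into the next
// section's input, so the four sections behave as a cascade with one sample
// of delay per stage, and the final stereo output is taken from lane 3.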
+ vf32x4_t b0l = mm_alignr_ps(a1l, xl, 3); + vf32x4_t b0r = mm_alignr_ps(a1r, xx, 3); + yyl = _mm_add_ps(yyl, _mm_mul_ps(b0l, chip->kb0)); + yyr = _mm_add_ps(yyr, _mm_mul_ps(b0r, chip->kb0)); + chip->hb0l = b0l; + chip->hb0r = b0r; + + chip->ha0l = yyl; + chip->ha0r = yyr; + + yyl = _mm_shuffle_ps(yyl, yyl, _MM_SHUFFLE(2, 3, 0, 1)); // ".3.." + vf32x4_t yy = _mm_blend_ps(yyl, yyr, 0x8); // "1000" + yy = _mm_mul_ps(yy, chip->kv); + + b2l = chip->hb1l; + b2r = chip->hb1r; + a2l = chip->ha1l; + a2r = chip->ha1r; + + _mm_storeh_pi((void*)y, yy); y += 2u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/src/aymo_wave.c b/src/aymo_wave.c new file mode 100644 index 0000000..4cb1606 --- /dev/null +++ b/src/aymo_wave.c @@ -0,0 +1,79 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_wave.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_wave_heading_setup( + struct aymo_wave_heading* heading, + uint16_t wave_fmt_type, + uint16_t channel_count, + uint16_t sample_bits, + uint32_t sample_rate, + uint32_t sample_count +) +{ + assert(heading); + assert(channel_count > 0u); + assert(sample_bits > 0u); + assert(sample_rate > 0u); + + uint16_t sample_byte_size = (sample_bits / 8u); + uint32_t sample_data_size = (sample_count * channel_count * sample_byte_size); + assert(sample_data_size < (UINT32_MAX - 32u)); + + heading->riff_fourcc[0] = 'R'; + heading->riff_fourcc[1] = 'I'; + heading->riff_fourcc[2] = 'F'; + heading->riff_fourcc[3] = 'F'; + heading->riff_size = (32u + sample_data_size); + + heading->wave_fourcc[0] = 'W'; + heading->wave_fourcc[1] = 'A'; + heading->wave_fourcc[2] = 'V'; + heading->wave_fourcc[3] = 'E'; + + heading->wave_fmt_fourcc[0] = 'f'; + heading->wave_fmt_fourcc[1] = 'm'; + heading->wave_fmt_fourcc[2] = 't'; + heading->wave_fmt_fourcc[3] = ' '; + heading->wave_fmt_size = 16u; + + heading->wave_fmt_type = wave_fmt_type; + heading->wave_fmt_channel_count = channel_count; + heading->wave_fmt_sample_rate = sample_rate; + heading->wave_fmt_byte_rate = (sample_byte_size * sample_rate); + heading->wave_fmt_block_align = (sample_byte_size * channel_count); + heading->wave_fmt_sample_bits = sample_bits; + + heading->wave_data_fourcc[0] = 'd'; + heading->wave_data_fourcc[1] = 'a'; + heading->wave_data_fourcc[2] = 't'; + heading->wave_data_fourcc[3] = 'a'; + heading->wave_data_size = sample_data_size; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128.c b/src/aymo_ym7128.c new file mode 100644 index 0000000..da2c169 --- /dev/null +++ b/src/aymo_ym7128.c @@ -0,0 +1,148 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
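For reference, the header writer above (src/aymo_wave.c) fills a canonical RIFF/WAVE header in one call; a hypothetical caller could dump a rendered buffer as in the sketch below. The file name, the PCM format tag value 1, and the direct fwrite of the heading struct (assuming it is packed and little-endian) are illustrative assumptions, not part of AYMO.

#include <stdio.h>
#include <stdint.h>
#include "aymo_wave.h"

// Illustrative only: save `frames` interleaved stereo 16-bit samples at 49716 Hz.
static int save_wav(const char* path, const int16_t* samples, uint32_t frames)
{
    struct aymo_wave_heading heading;
    aymo_wave_heading_setup(&heading, 1u /* PCM */, 2u, 16u, 49716u, frames);

    FILE* file = fopen(path, "wb");
    if (file == NULL) {
        return -1;
    }
    fwrite(&heading, sizeof(heading), 1u, file);
    fwrite(samples, sizeof(int16_t), (size_t)frames * 2u, file);
    fclose(file);
    return 0;
}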
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#include "aymo_ym7128.h" +#include "aymo_ym7128_arm_neon.h" +#include "aymo_ym7128_none.h" +#include "aymo_ym7128_x86_sse41.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static const struct aymo_ym7128_vt* aymo_ym7128_best_vt; + + +void aymo_ym7128_boot(void) +{ + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_ym7128_best_vt = aymo_ym7128_x86_sse41_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_ym7128_best_vt = aymo_ym7128_arm_neon_get_vt(); + return; + } + #endif + + aymo_ym7128_best_vt = aymo_ym7128_none_get_vt(); +} + + +const struct aymo_ym7128_vt* aymo_ym7128_get_vt(const char* cpu_ext) +{ + if (cpu_ext == NULL) { + return NULL; + } + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (!aymo_strcmp(cpu_ext, "x86_sse41")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + return aymo_ym7128_x86_sse41_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (!aymo_strcmp(cpu_ext, "arm_neon")) { + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + return aymo_ym7128_arm_neon_get_vt(); + } + } + #endif + + if (!aymo_strcmp(cpu_ext, "none")) { + return aymo_ym7128_none_get_vt(); + } + return NULL; +} + + +const struct aymo_ym7128_vt* aymo_ym7128_get_best_vt(void) +{ + return aymo_ym7128_best_vt; +} + + +uint32_t aymo_ym7128_get_sizeof(struct aymo_ym7128_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_sizeof); + + return chip->vt->get_sizeof(); +} + + +void aymo_ym7128_ctor(struct aymo_ym7128_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->ctor); + + chip->vt->ctor(chip); +} + + +void aymo_ym7128_dtor(struct aymo_ym7128_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->dtor); + + chip->vt->dtor(chip); +} + + +uint8_t aymo_ym7128_read(struct aymo_ym7128_chip* chip, uint16_t address) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->read); + + return chip->vt->read(chip, address); +} + + +void aymo_ym7128_write(struct aymo_ym7128_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->write); + + chip->vt->write(chip, address, value); +} + + +void aymo_ym7128_process_i16(struct aymo_ym7128_chip* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->process_i16); + + chip->vt->process_i16(chip, count, x, y); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128_arm_neon.c b/src/aymo_ym7128_arm_neon.c new file mode 100644 index 0000000..00b717c --- /dev/null +++ b/src/aymo_ym7128_arm_neon.c @@ -0,0 +1,270 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
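To put the dispatcher in src/aymo_ym7128.c above in context, a hypothetical caller might drive it as sketched here. The explicit chip->vt assignment before the constructor and the count*4 output sizing (two channels times the 2x oversampling of process_i16) are inferred from the code above, so treat both as assumptions; error handling is minimal.

#include <stdlib.h>
#include "aymo_ym7128.h"

// Illustrative sketch: run `count` mono samples through the surround processor.
static int16_t* ym7128_render(const int16_t* x, uint32_t count)
{
    aymo_ym7128_boot();  // assumes CPU feature detection is already initialized
    const struct aymo_ym7128_vt* vt = aymo_ym7128_get_best_vt();

    struct aymo_ym7128_chip* chip = malloc(vt->get_sizeof());
    if (chip == NULL) {
        return NULL;
    }
    chip->vt = vt;  // assumed to be the caller's responsibility
    aymo_ym7128_ctor(chip);

    // Each input sample yields two oversampled stereo frames: count * 4 outputs.
    int16_t* y = malloc((size_t)count * 4u * sizeof(int16_t));
    if (y != NULL) {
        aymo_ym7128_process_i16(chip, count, x, y);
    }

    aymo_ym7128_dtor(chip);
    free(chip);
    return y;
}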
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include "aymo_cpu_arm_neon_inline.h" +#include "aymo_ym7128_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_arm_neon.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ym7128_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ym7128_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ym7128_ctor_f)&(aymo_(ctor)), + (aymo_ym7128_dtor_f)&(aymo_(dtor)), + (aymo_ym7128_read_f)&(aymo_(read)), + (aymo_ym7128_write_f)&(aymo_(write)), + (aymo_ym7128_process_i16_f)&(aymo_(process_i16)) +}; + + +const struct aymo_ym7128_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Initialize input stage coefficients (-1 as a placeholder for computed values) + chip->xxv[2] = 1; + chip->xxv[3] = 1; + + chip->kk1 = vseta(0, -1, -1, -1, -0x8000, -0x8000, -0x8000, -0x8000); + chip->kk2 = vseta(0, -1, -0x8000, 0, 0, 0, 0x8000, -0x8000); + chip->kkm = vseta(0, 0x7FFF, 0x7FFF, 0, 0, 0, AYMO_YM7128_DELAY_LENGTH, AYMO_YM7128_DELAY_LENGTH); + + // Initialize oversampler coefficients + const int16_t* k = aymo_ym7128_kernel_linear; + chip->ka = vseta(k[ 6], k[ 6], k[ 4], k[ 4], k[ 2], k[ 2], k[ 0], k[ 0]); + chip->kb = vseta(k[ 7], k[ 7], k[ 5], k[ 5], k[ 3], k[ 3], k[ 1], k[ 1]); + chip->kc = vseta(k[14], k[14], k[12], k[12], k[10], k[10], k[ 8], k[ 8]); + chip->kd = vseta(k[15], k[15], k[13], k[13], k[11], k[11], k[ 9], k[ 9]); + chip->ke = vseta( 0, 0, 0, 0, k[18], k[18], k[16], k[16]); + chip->kf = vseta( 0, 0, 0, 0, 0, 0, k[17], k[17]); + + // Initialize as pass-through + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gl1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gr1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vm, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vl, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vr, 0x3Fu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + return chip->regs[address]; + } + return 0x00u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)aymo_ym7128_reg_gl8) { + value &= 0x3Fu; + int16_t gl = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gl1); + chip->kgl = vinsertn(chip->kgl, gl, i); + } + else if (address <= (uint16_t)aymo_ym7128_reg_gr8) { + value &= 0x3Fu; + int16_t gr = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gr1); + chip->kgr = vinsertn(chip->kgr, gr, i); + } + else if (address <= (uint16_t)aymo_ym7128_reg_vr) 
{ + value &= 0x3Fu; + int16_t v = aymo_ym7128_gain[value]; + if (address == (uint16_t)aymo_ym7128_reg_vm) { + chip->kk1 = vinsert(chip->kk1, -v, 5); + } + else if (address == (uint16_t)aymo_ym7128_reg_vc) { + chip->kk2 = vinsert(chip->kk2, v, 6); + } + else if (address == (uint16_t)aymo_ym7128_reg_vl) { + chip->kv = vinsert(chip->kv, v, 6); + } + else { + chip->kv = vinsert(chip->kv, v, 7); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_c1) { + value &= 0x3Fu; + int16_t v = ((int16_t)value << (16 - AYMO_YM7128_COEFF_BITS)); + if (address == (uint16_t)aymo_ym7128_reg_c0) { + chip->kk1 = vinsert(chip->kk1, v, 4); + } + else { + chip->kk1 = vinsert(chip->kk1, v, 6); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_t8) { + value &= 0x1Fu; + int16_t t = aymo_ym7128_tap[value]; + int16_t hi = chip->xxv[1]; // hi + t = (hi - t); + if (t < 0) { + t += AYMO_YM7128_DELAY_LENGTH; + } + if (address == (uint16_t)aymo_ym7128_reg_t0) { + chip->xxv[0] = t; // ti0 + } + else { + uint16_t i = (address - (uint16_t)aymo_ym7128_reg_t1); + chip->tiv[i] = t; + } + } + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + chip->regs[address] = value; + } +} + + +void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(x); + assert(y); + if AYMO_UNLIKELY(!count) return; + + int16_t AYMO_ALIGN_V128 vv[8] = {0}; + + int16_t ti0 = chip->xxv[0]; + int16_t t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + + const int16_t* xe = &x[count]; + + while AYMO_LIKELY(x != xe) { + chip->xxv[5] = (*x++ & AYMO_YM7128_SIGNAL_MASK); + + vsfence(); + vi16x8_t xx = vload(chip->xxv); + chip->xxv[6] = t0; // t0d = t0 + xx = vmulhrs(xx, chip->kk1); + xx = vaddsi(xx, vrevv(xx)); + xx = vmulhrs(xx, chip->kk2); + xx = vand(xx, vcmpgt(chip->kkm, xx)); + xx = vaddsi(xx, vrev64q_s16(xx)); + vstore(vv, xx); + vi16x8_t ti = vload(chip->tiv); + vi16x8_t tj = vsub(ti, vset1(-1)); + vi16x8_t tm = vcmpgt(vset1(AYMO_YM7128_DELAY_LENGTH - 1), ti); // tj < DL + vstore(chip->tiv, vand(tj, tm)); + vsfence(); + + chip->xxv[0] = vv[0]; // ti0' + int16_t hj = vv[1]; + chip->xxv[1] = hj; // hi' + int16_t u = vv[5]; + chip->uh[hj] = u; + int16_t AYMO_ALIGN_V128 tuv[8]; + for (unsigned i = 0u; i < 8u; ++i) { + tuv[i] = chip->uh[chip->tiv[i]]; + } + vsfence(); + vi16x8_t tu = vload(tuv); + + vi16x8_t gl = vmulhrs(tu, chip->kgl); + vi16x8_t gr = vmulhrs(tu, chip->kgr); + vi32x4_t ggl = vpaddlq_s16(gl); + vi32x4_t ggr = vpaddlq_s16(gr); + ggl = vvadd(ggl, vvext(ggl, ggl, 2)); + ggr = vvadd(ggr, vvext(ggr, ggr, 2)); + ggl = vvadd(ggl, vvrev(ggl)); + ggr = vvadd(ggr, vvrev(ggr)); + vi16x8_t ggrl = vvpacks(ggr, ggl); + vi16x8_t gglr = vext(ggrl, ggrl, 1); + vi16x8_t vlr = vmulhrs(gglr, chip->kv); + + vi16x8_t zc = chip->zc; + vi16x8_t zb = chip->zb; + zc = vext(zb, zc, 6); // '543210..' + chip->zc = zc; + + vi16x8_t y1 = vmulhrs(zc, chip->kf); + vi16x8_t y0 = vmulhrs(zc, chip->ke); + + vi16x8_t za = chip->za; + zb = vext(za, zb, 6); // '543210..' + chip->zb = zb; + + y1 = vaddsi(y1, vmulhrs(zb, chip->kd)); + y0 = vaddsi(y0, vmulhrs(zb, chip->kc)); + + za = vext(vlr, za, 6); // '543210..' 
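// Output oversampler: ka..kf hold the 19-tap interpolation kernel split into
// even/odd polyphase branches, so y0 and y1 accumulate the two 2x-oversampled
// output frames produced for every input sample.  za/zb/zc form a sliding
// history of recent wet stereo samples, advanced by one (L, R) pair per tick.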
+ chip->za = za; + + y1 = vaddsi(y1, vmulhrs(za, chip->kb)); + y0 = vaddsi(y0, vmulhrs(za, chip->ka)); + + vi16x4_t yy0 = vqadd_s16(vgetlo(y0), vgethi(y0)); + vi16x4_t yy1 = vqadd_s16(vgetlo(y1), vgethi(y1)); + yy0 = vqadd_s16(yy0, vext_s16(yy0, yy0, 2)); + yy1 = vqadd_s16(yy1, vext_s16(yy1, yy1, 2)); + + vi16x4_t yy = vext_s16(yy0, yy1, 2); + yy = vand_s16(yy, vdup_n_s16((int16_t)AYMO_YM7128_SIGNAL_MASK)); + vst1_s16(y, yy); y += 4u; + + ti0 = chip->xxv[0]; + t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_ym7128_common.c b/src/aymo_ym7128_common.c new file mode 100644 index 0000000..e5482b4 --- /dev/null +++ b/src/aymo_ym7128_common.c @@ -0,0 +1,192 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_ym7128_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +#define PGAIN(x) ((int16_t)((double)(x) * (double)AYMO_YM7128_GAIN_UNIT) \ + & (int16_t)AYMO_YM7128_GAIN_MASK) + +#define NGAIN(x) ((int16_t)(~(int32_t)PGAIN(x) & (int32_t)0xFFFF)) // pseudo-negative + +const int16_t aymo_ym7128_gain[64u] = +{ + NGAIN(0.000000000000000000), // -oo dB- + NGAIN(0.001000000000000000), // -60 dB- + NGAIN(0.001258925411794167), // -58 dB- + NGAIN(0.001584893192461114), // -56 dB- + NGAIN(0.001995262314968879), // -54 dB- + NGAIN(0.002511886431509579), // -52 dB- + NGAIN(0.003162277660168379), // -50 dB- + NGAIN(0.003981071705534973), // -48 dB- + NGAIN(0.005011872336272725), // -46 dB- + NGAIN(0.006309573444801930), // -44 dB- + NGAIN(0.007943282347242814), // -42 dB- + NGAIN(0.010000000000000000), // -40 dB- + NGAIN(0.012589254117941675), // -38 dB- + NGAIN(0.015848931924611134), // -36 dB- + NGAIN(0.019952623149688799), // -34 dB- + NGAIN(0.025118864315095794), // -32 dB- + NGAIN(0.031622776601683791), // -30 dB- + NGAIN(0.039810717055349734), // -28 dB- + NGAIN(0.050118723362727220), // -26 dB- + NGAIN(0.063095734448019331), // -24 dB- + NGAIN(0.079432823472428138), // -22 dB- + NGAIN(0.100000000000000006), // -20 dB- + NGAIN(0.125892541179416728), // -18 dB- + NGAIN(0.158489319246111343), // -16 dB- + NGAIN(0.199526231496887974), // -14 dB- + NGAIN(0.251188643150958013), // -12 dB- + NGAIN(0.316227766016837941), // -10 dB- + NGAIN(0.398107170553497203), // - 8 dB- + NGAIN(0.501187233627272244), // - 6 dB- + NGAIN(0.630957344480193250), // - 4 dB- + NGAIN(0.794328234724281490), // - 2 dB- + NGAIN(1.000000000000000000), // - 0 dB- + + PGAIN(0.000000000000000000), // -oo dB+ + PGAIN(0.001000000000000000), // -60 dB+ + PGAIN(0.001258925411794167), // -58 dB+ + PGAIN(0.001584893192461114), // -56 dB+ + PGAIN(0.001995262314968879), // -54 dB+ + PGAIN(0.002511886431509579), // -52 dB+ + PGAIN(0.003162277660168379), // -50 dB+ + PGAIN(0.003981071705534973), // -48 dB+ + PGAIN(0.005011872336272725), // -46 dB+ + PGAIN(0.006309573444801930), // -44 dB+ + 
PGAIN(0.007943282347242814), // -42 dB+ + PGAIN(0.010000000000000000), // -40 dB+ + PGAIN(0.012589254117941675), // -38 dB+ + PGAIN(0.015848931924611134), // -36 dB+ + PGAIN(0.019952623149688799), // -34 dB+ + PGAIN(0.025118864315095794), // -32 dB+ + PGAIN(0.031622776601683791), // -30 dB+ + PGAIN(0.039810717055349734), // -28 dB+ + PGAIN(0.050118723362727220), // -26 dB+ + PGAIN(0.063095734448019331), // -24 dB+ + PGAIN(0.079432823472428138), // -22 dB+ + PGAIN(0.100000000000000006), // -20 dB+ + PGAIN(0.125892541179416728), // -18 dB+ + PGAIN(0.158489319246111343), // -16 dB+ + PGAIN(0.199526231496887974), // -14 dB+ + PGAIN(0.251188643150958013), // -12 dB+ + PGAIN(0.316227766016837941), // -10 dB+ + PGAIN(0.398107170553497203), // - 8 dB+ + PGAIN(0.501187233627272244), // - 6 dB+ + PGAIN(0.630957344480193250), // - 4 dB+ + PGAIN(0.794328234724281490), // - 2 dB+ + PGAIN(1.000000000000000000) // - 0 dB+ +}; + + +#define TAP(i) ((int16_t)(((i) * (AYMO_YM7128_DELAY_LENGTH - 1)) / (AYMO_YM7128_TAP_COUNT - 1))) + +const int16_t aymo_ym7128_tap[32u] = +{ + TAP( 0), // 0.0 ms + TAP( 1), // 3.2 ms + TAP( 2), // 6.5 ms + TAP( 3), // 9.7 ms + TAP( 4), // 12.9 ms + TAP( 5), // 16.1 ms + TAP( 6), // 19.3 ms + TAP( 7), // 22.6 ms + TAP( 8), // 25.8 ms + TAP( 9), // 29.0 ms + TAP(10), // 32.3 ms + TAP(11), // 35.5 ms + TAP(12), // 38.7 ms + TAP(13), // 41.9 ms + TAP(14), // 45.2 ms + TAP(15), // 48.4 ms + TAP(16), // 51.6 ms + TAP(17), // 54.9 ms + TAP(18), // 58.1 ms + TAP(19), // 61.3 ms + TAP(20), // 64.5 ms + TAP(21), // 67.8 ms + TAP(22), // 71.0 ms + TAP(23), // 74.2 ms + TAP(24), // 77.4 ms + TAP(25), // 80.7 ms + TAP(26), // 83.9 ms + TAP(27), // 87.1 ms + TAP(28), // 90.4 ms + TAP(29), // 93.6 ms + TAP(30), // 96.8 ms + TAP(31) // 100.0 ms +}; + + +#undef KERNEL +#define KERNEL(x) ((int16_t)((double)(x) * (double)AYMO_YM7128_GAIN_UNIT) \ + & (int16_t)AYMO_YM7128_GAIN_MASK) + +const int16_t aymo_ym7128_kernel_linear[19u] = +{ + KERNEL(+0.005969087803865891), + KERNEL(-0.003826518613910499), + KERNEL(-0.016623943725986926), + KERNEL(+0.007053928712894589), + KERNEL(+0.038895802111020034), + KERNEL(-0.010501507751597486), + KERNEL(-0.089238395139830201), + KERNEL(+0.013171814880420758), + KERNEL(+0.312314472963171053), + KERNEL(+0.485820312497107776), + KERNEL(+0.312314472963171053), + KERNEL(+0.013171814880420758), + KERNEL(-0.089238395139830201), + KERNEL(-0.010501507751597486), + KERNEL(+0.038895802111020034), + KERNEL(+0.007053928712894589), + KERNEL(-0.016623943725986926), + KERNEL(-0.003826518613910499), + KERNEL(+0.005969087803865891) +}; + + +const int16_t aymo_ym7128_kernel_minphase[19u] = +{ + KERNEL(+0.073585247514714749), + KERNEL(+0.269340051166713890), + KERNEL(+0.442535202999738531), + KERNEL(+0.350129745841520346), + KERNEL(+0.026195691646307945), + KERNEL(-0.178423532471468610), + KERNEL(-0.081176763571493171), + KERNEL(+0.083194010466739091), + KERNEL(+0.067960765530891545), + KERNEL(-0.035840063980478287), + KERNEL(-0.044393769145659796), + KERNEL(+0.013156688603347873), + KERNEL(+0.023451305043275420), + KERNEL(-0.004374029821991059), + KERNEL(-0.009480786001493536), + KERNEL(+0.002700502551912207), + KERNEL(+0.003347671274177581), + KERNEL(-0.002391896275498628), + KERNEL(+0.000483958628744376) +}; + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128_none.c b/src/aymo_ym7128_none.c new file mode 100644 index 0000000..7d7dc48 --- /dev/null +++ b/src/aymo_ym7128_none.c @@ -0,0 +1,130 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. 
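// Note on the tables above: aymo_ym7128_gain[] is indexed by the 6-bit gain
// registers, with bit 5 selecting the phase and bits 0..4 the level (entries 0
// and 32 are mute, then -60 dB .. 0 dB in 2 dB steps).  Negative-phase entries
// are stored as the bitwise complement of the positive ones ("pseudo-negative"),
// which presumably lets the SIMD code apply the sign without a separate mask.
// aymo_ym7128_tap[] spreads the 32 tap positions linearly across the delay
// line, i.e. roughly 0 .. 100 ms of delay.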
+ +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_ym7128_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_none.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ym7128_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ym7128_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ym7128_ctor_f)&(aymo_(ctor)), + (aymo_ym7128_dtor_f)&(aymo_(dtor)), + (aymo_ym7128_read_f)&(aymo_(read)), + (aymo_ym7128_write_f)&(aymo_(write)), + (aymo_ym7128_process_i16_f)&(aymo_(process_i16)) +}; + + +const struct aymo_ym7128_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + YM7128B_ChipFixed* emu = &chip->emu; + YM7128B_ChipFixed_Ctor(emu); + YM7128B_ChipFixed_Reset(emu); + + // Initialize as pass-through + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_GL1, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_GR1, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_VM, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_VL, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_VR, 0x3Fu); + + YM7128B_ChipFixed_Start(emu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + assert(chip); + + YM7128B_ChipFixed* emu = &chip->emu; + YM7128B_ChipFixed_Stop(emu); + YM7128B_ChipFixed_Dtor(emu); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address <= (uint16_t)YM7128B_Address_Max) { + return YM7128B_ChipFixed_Read(&chip->emu, (YM7128B_Address)address); + } + return 0x00u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)YM7128B_Address_Max) { + YM7128B_ChipFixed_Write(&chip->emu, (YM7128B_Address)address, value); + } +} + + +void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(x); + assert(y); + if AYMO_UNLIKELY(!count) return; + + YM7128B_ChipFixed* emu = &chip->emu; + YM7128B_ChipFixed_Process_Data data; + + const int16_t* xe = &x[count]; + + while AYMO_LIKELY(x != xe) { + data.inputs[YM7128B_InputChannel_Mono] = *x++; + + YM7128B_ChipFixed_Process(emu, &data); + + for (int k = 0; k < YM7128B_Oversampler_Factor; ++k) { + for (int c = 0; c < YM7128B_OutputChannel_Count; ++c) { + *y++ = data.outputs[c][k]; + } + } + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128_x86_sse41.c b/src/aymo_ym7128_x86_sse41.c new file mode 100644 index 0000000..67f7a0a --- /dev/null +++ b/src/aymo_ym7128_x86_sse41.c @@ -0,0 +1,270 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_cpu_x86_sse41_inline.h" +#include "aymo_ym7128_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ym7128_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ym7128_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ym7128_ctor_f)&(aymo_(ctor)), + (aymo_ym7128_dtor_f)&(aymo_(dtor)), + (aymo_ym7128_read_f)&(aymo_(read)), + (aymo_ym7128_write_f)&(aymo_(write)), + (aymo_ym7128_process_i16_f)&(aymo_(process_i16)) +}; + + +const struct aymo_ym7128_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Initialize input stage coefficients (-1 as a placeholder for computed values) + chip->xxv[2] = 1; + chip->xxv[3] = 1; + + chip->kk1 = vseta(0, -1, -1, -1, -0x8000, -0x8000, -0x8000, -0x8000); + chip->kk2 = vseta(0, -1, -0x8000, 0, 0, 0, 0x8000, -0x8000); + chip->kkm = vseta(0, 0x7FFF, 0x7FFF, 0, 0, 0, AYMO_YM7128_DELAY_LENGTH, AYMO_YM7128_DELAY_LENGTH); + + // Initialize oversampler coefficients + const int16_t* k = aymo_ym7128_kernel_linear; + chip->ka = vseta(k[ 6], k[ 6], k[ 4], k[ 4], k[ 2], k[ 2], k[ 0], k[ 0]); + chip->kb = vseta(k[ 7], k[ 7], k[ 5], k[ 5], k[ 3], k[ 3], k[ 1], k[ 1]); + chip->kc = vseta(k[14], k[14], k[12], k[12], k[10], k[10], k[ 8], k[ 8]); + chip->kd = vseta(k[15], k[15], k[13], k[13], k[11], k[11], k[ 9], k[ 9]); + chip->ke = vseta( 0, 0, 0, 0, k[18], k[18], k[16], k[16]); + chip->kf = vseta( 0, 0, 0, 0, 0, 0, k[17], k[17]); + + // Initialize as pass-through + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gl1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gr1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vm, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vl, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vr, 0x3Fu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + return chip->regs[address]; + } + return 0x00u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)aymo_ym7128_reg_gl8) { + value &= 0x3Fu; + int16_t gl = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gl1); + chip->kgl = vinsertn(chip->kgl, gl, i); + } + else if (address <= (uint16_t)aymo_ym7128_reg_gr8) { + value &= 0x3Fu; + int16_t gr = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gr1); + chip->kgr = vinsertn(chip->kgr, gr, i); + } + else if (address <= 
(uint16_t)aymo_ym7128_reg_vr) { + value &= 0x3Fu; + int16_t v = aymo_ym7128_gain[value]; + if (address == (uint16_t)aymo_ym7128_reg_vm) { + chip->kk1 = vinsert(chip->kk1, -v, 5); + } + else if (address == (uint16_t)aymo_ym7128_reg_vc) { + chip->kk2 = vinsert(chip->kk2, v, 6); + } + else if (address == (uint16_t)aymo_ym7128_reg_vl) { + chip->kv = vinsert(chip->kv, v, 6); + } + else { + chip->kv = vinsert(chip->kv, v, 7); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_c1) { + value &= 0x3Fu; + int16_t v = ((int16_t)value << (16 - AYMO_YM7128_COEFF_BITS)); + if (address == (uint16_t)aymo_ym7128_reg_c0) { + chip->kk1 = vinsert(chip->kk1, v, 4); + } + else { + chip->kk1 = vinsert(chip->kk1, v, 6); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_t8) { + value &= 0x1Fu; + int16_t t = aymo_ym7128_tap[value]; + int16_t hi = chip->xxv[1]; // hi + t = (hi - t); + if (t < 0) { + t += AYMO_YM7128_DELAY_LENGTH; + } + if (address == (uint16_t)aymo_ym7128_reg_t0) { + chip->xxv[0] = t; // ti0 + } + else { + uint16_t i = (address - (uint16_t)aymo_ym7128_reg_t1); + chip->tiv[i] = t; + } + } + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + chip->regs[address] = value; + } +} + + +void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(x); + assert(y); + if AYMO_UNLIKELY(!count) return; + + int16_t AYMO_ALIGN_V128 vv[8] = {0}; + + int16_t ti0 = chip->xxv[0]; + int16_t t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + + const int16_t* xe = &x[count]; + + while AYMO_LIKELY(x != xe) { + chip->xxv[5] = (*x++ & AYMO_YM7128_SIGNAL_MASK); + + vsfence(); + vi16x8_t xx = vload((void*)chip->xxv); + chip->xxv[6] = t0; // t0d = t0 + xx = vmulhrs(xx, chip->kk1); + xx = vaddsi(xx, vvshuffle(xx, KSHUFFLE(2, 3, 0, 1))); // "2301" + xx = vmulhrs(xx, chip->kk2); + xx = vand(xx, vcmpgt(chip->kkm, xx)); + xx = vaddsi(xx, valignr(xx, xx, 2)); + vstore((void*)vv, xx); + vi16x8_t ti = vload((void*)chip->tiv); + vi16x8_t tj = vsub(ti, vset1(-1)); + vi16x8_t tm = vcmpgt(vset1(AYMO_YM7128_DELAY_LENGTH - 1), ti); // tj < DL + vstore((void*)chip->tiv, vand(tj, tm)); + vsfence(); + + chip->xxv[0] = vv[7]; // ti0' + int16_t hj = vv[1]; + chip->xxv[1] = hj; // hi' + int16_t u = vv[5]; + chip->uh[hj] = u; + int16_t AYMO_ALIGN_V128 tuv[8]; + for (unsigned i = 0u; i < 8u; ++i) { + tuv[i] = chip->uh[chip->tiv[i]]; + } + vsfence(); + vi16x8_t tu = vload((void*)tuv); + + vi16x8_t gl = vmulhrs(tu, chip->kgl); + vi16x8_t gr = vmulhrs(tu, chip->kgr); + vi32x4_t ggl = vmadd(gl, vset1(1)); + vi32x4_t ggr = vmadd(gr, vset1(1)); + ggl = vvadd(ggl, vvshuffle(ggl, KSHUFFLE(1, 0, 3, 2))); // "1032" + ggr = vvadd(ggr, vvshuffle(ggr, KSHUFFLE(1, 0, 3, 2))); // "1032" + ggl = vvadd(ggl, vvshuffle(ggl, KSHUFFLE(2, 3, 0, 1))); // "2301" + ggr = vvadd(ggr, vvshuffle(ggr, KSHUFFLE(2, 3, 0, 1))); // "2301" + vi16x8_t ggrl = vvpacks(ggr, ggl); + vi16x8_t gglr = valignr(ggrl, ggrl, 2); + vi16x8_t vlr = vmulhrs(gglr, chip->kv); + + vi16x8_t zc = chip->zc; + vi16x8_t zb = chip->zb; + zc = valignr(zc, zb, 12); // '543210..' + chip->zc = zc; + + vi16x8_t y1 = vmulhrs(zc, chip->kf); + vi16x8_t y0 = vmulhrs(zc, chip->ke); + + vi16x8_t za = chip->za; + zb = valignr(zb, za, 12); // '543210..' + chip->zb = zb; + + y1 = vaddsi(y1, vmulhrs(zb, chip->kd)); + y0 = vaddsi(y0, vmulhrs(zb, chip->kc)); + + za = valignr(za, vlr, 12); // '543210..' 
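// The gl/gr products above scale the eight delay-line taps by the GL/GR gain
// registers; the madd-by-one and the two shuffles then reduce them to a single
// left and a single right sum, which kv scales into the wet stereo pair vlr.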
+ chip->za = za; + + y1 = vaddsi(y1, vmulhrs(za, chip->kb)); + y0 = vaddsi(y0, vmulhrs(za, chip->ka)); + + y0 = vaddsi(y0, vvshuffle(y0, KSHUFFLE(1, 0, 3, 2))); // "1032" + y1 = vaddsi(y1, vvshuffle(y1, KSHUFFLE(1, 0, 3, 2))); // "1032" + y0 = vaddsi(y0, vvshuffle(y0, KSHUFFLE(2, 3, 0, 1))); // "2301" + y1 = vaddsi(y1, vvshuffle(y1, KSHUFFLE(2, 3, 0, 1))); // "2301" + + vi16x8_t yy = vblendi(y0, y1, 0xCC); // '1100''1100' + yy = vand(yy, vset1((int16_t)AYMO_YM7128_SIGNAL_MASK)); + vstorelo((void*)y, yy); y += 4u; + + ti0 = chip->xxv[0]; + t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/src/aymo_ymf262.c b/src/aymo_ymf262.c new file mode 100644 index 0000000..c2f707c --- /dev/null +++ b/src/aymo_ymf262.c @@ -0,0 +1,250 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include +#include "aymo_cpu.h" +#include "aymo_ymf262.h" +#include "aymo_ymf262_arm_neon.h" +#include "aymo_ymf262_none.h" +#include "aymo_ymf262_x86_sse41.h" +#include "aymo_ymf262_x86_avx.h" +#include "aymo_ymf262_x86_avx2.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static const struct aymo_ymf262_vt* aymo_ymf262_best_vt; + + +void aymo_ymf262_boot(void) +{ + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + aymo_ymf262_best_vt = aymo_ymf262_x86_avx2_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_AVX + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX) { + aymo_ymf262_best_vt = aymo_ymf262_x86_avx_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_ymf262_best_vt = aymo_ymf262_x86_sse41_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_ymf262_best_vt = aymo_ymf262_arm_neon_get_vt(); + return; + } + #endif + + aymo_ymf262_best_vt = aymo_ymf262_none_get_vt(); +} + + +const struct aymo_ymf262_vt* aymo_ymf262_get_vt(const char* cpu_ext) +{ + if (cpu_ext == NULL) { + return NULL; + } + + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (!aymo_strcmp(cpu_ext, "x86_avx2")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + return aymo_ymf262_x86_avx2_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_AVX + if (!aymo_strcmp(cpu_ext, "x86_avx")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX) { + return aymo_ymf262_x86_avx_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (!aymo_strcmp(cpu_ext, "x86_sse41")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + return aymo_ymf262_x86_sse41_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (!aymo_strcmp(cpu_ext, "arm_neon")) { + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + return aymo_ymf262_arm_neon_get_vt(); + } + } + #endif + 
if (!aymo_strcmp(cpu_ext, "none")) { + return aymo_ymf262_none_get_vt(); + } + return NULL; +} + + +const struct aymo_ymf262_vt* aymo_ymf262_get_best_vt(void) +{ + return aymo_ymf262_best_vt; +} + + +uint32_t aymo_ymf262_get_sizeof(struct aymo_ymf262_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_sizeof); + + return chip->vt->get_sizeof(); +} + + +void aymo_ymf262_ctor(struct aymo_ymf262_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->ctor); + + chip->vt->ctor(chip); +} + + +void aymo_ymf262_dtor(struct aymo_ymf262_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->dtor); + + chip->vt->dtor(chip); +} + + +uint8_t aymo_ymf262_read(struct aymo_ymf262_chip* chip, uint16_t address) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->read); + + return chip->vt->read(chip, address); +} + + +void aymo_ymf262_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->write); + + chip->vt->write(chip, address, value); +} + + +int aymo_ymf262_enqueue_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->enqueue_write); + + return chip->vt->enqueue_write(chip, address, value); +} + + +int aymo_ymf262_enqueue_delay(struct aymo_ymf262_chip* chip, uint32_t count) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->enqueue_delay); + + return chip->vt->enqueue_delay(chip, count); +} + + +int16_t aymo_ymf262_get_output(struct aymo_ymf262_chip* chip, uint8_t channel) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_output); + + return chip->vt->get_output(chip, channel); +} + + +void aymo_ymf262_tick(struct aymo_ymf262_chip* chip, uint32_t count) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->tick); + + chip->vt->tick(chip, count); +} + + +void aymo_ymf262_generate_i16x2(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_i16x2); + + chip->vt->generate_i16x2(chip, count, y); +} + + +void aymo_ymf262_generate_i16x4(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_i16x4); + + chip->vt->generate_i16x4(chip, count, y); +} + + +void aymo_ymf262_generate_f32x2(struct aymo_ymf262_chip* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_f32x2); + + chip->vt->generate_f32x2(chip, count, y); +} + + +void aymo_ymf262_generate_f32x4(struct aymo_ymf262_chip* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_f32x4); + + chip->vt->generate_f32x4(chip, count, y); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ymf262_arm_neon.c b/src/aymo_ymf262_arm_neon.c new file mode 100644 index 0000000..52a8f57 --- /dev/null +++ b/src/aymo_ymf262_arm_neon.c @@ -0,0 +1,1688 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
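The YMF262 dispatcher above (src/aymo_ymf262.c) follows the same vtable pattern; a hypothetical rendering snippet is sketched below. The direct chip->vt assignment, the example register write (0x105 is the OPL3 "NEW" bit on the real chip), and the assumption that generate_i16x2 emits two interleaved int16 samples per tick are illustrative, not guaranteed by this patch.

#include <stdlib.h>
#include "aymo_ymf262.h"

// Illustrative sketch: render `frames` stereo samples from a freshly built chip.
static int16_t* opl3_render(uint32_t frames)
{
    aymo_ymf262_boot();  // assumes CPU feature detection is already initialized
    const struct aymo_ymf262_vt* vt = aymo_ymf262_get_best_vt();

    struct aymo_ymf262_chip* chip = malloc(vt->get_sizeof());
    if (chip == NULL) {
        return NULL;
    }
    chip->vt = vt;  // assumed to be the caller's responsibility
    aymo_ymf262_ctor(chip);

    aymo_ymf262_write(chip, 0x105u, 0x01u);  // e.g. enable OPL3 ("NEW") mode

    int16_t* y = malloc((size_t)frames * 2u * sizeof(int16_t));
    if (y != NULL) {
        aymo_ymf262_generate_i16x2(chip, frames, y);
    }

    aymo_ymf262_dtor(chip);
    free(chip);
    return y;
}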
See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include +#include "aymo_cpu_arm_neon_inline.h" +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_arm_neon.h" + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[8] = +{ + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[8] = +{ + 0, 1, 2, 3, 0, 1, 2, 3 +}; + + +const int16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 3) | (1 << 2) | (1 << 1) | (0 << 0)), + ((1 << 3) | (0 << 2) | (0 << 1) | (0 << 0)), + ((1 << 3) | (1 << 2) | (0 << 1) | (0 << 0)), + ((0 << 3) | (0 << 2) | (0 << 1) | (0 << 0)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 0, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 0, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 0, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 0, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 0, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 3, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } + }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8, + 0xF8, + 0xF8, + 0xFF, + 0xF8, + 0xFF, + 0xF8, + 0xFF +}; + + +// Slot mask output delay for outputs B and D +const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0x88, + 0xF8, + 0x88, + 0xF8, + 0x88, + 0xFF, + 0x88, + 0xFF +}; + + +// Updates phase generator 
+static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + vi16_t range = vand(fnum, vset1(7 << 7)); + range = vand(sg->pg_vib, vsllv(range, chip->pg_vib_shs)); + range = vmullo(range, chip->pg_vib_sign); + fnum = vadd(fnum, range); + + vi32_t fnum_lo = vunpacklo(fnum); + vi32_t fnum_hi = vunpackhi(fnum); + vi32_t block_sll_lo = vunpacklo(cg->pg_block); + vi32_t block_sll_hi = vunpackhi(cg->pg_block); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 1); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 1); + + sg->pg_phase_out = phase; + } +} + + +// Updates rhythm manager, slot group 3 +static inline +void aymo_(rm_update_sg3)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[3]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) 
| + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + vi16_t phase = sg->pg_phase_out; + + // Update SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 1); + + // Update TC + uint32_t phase17 = vextract(phase, 2); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 2); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpack(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_shl = vblendv(vset1(AYMO_(EG_GEN_SHL_ATTACK)), sg->eg_gen_shl, notreset); + vi16_t reg_rate = vsllv(sg->eg_adsr, eg_gen_shl); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset); + vi32_t notreset_hi = vunpackhi(notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vu2i(vsubsu(vi2u(vset1(15)), vi2u(eg_shift)))); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vadd(sg->wg_out, sg->wg_prout); + vi16_t fbsum_sh = vsllv(fbsum, sg->wg_fb_shs); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vi16_t incstep_ge12 = vand(vsrlv(chip->eg_incstep, rate_lo), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = vsllv(phase, sg->wg_phase_shl); + vi16_t phase_gate = vcmpz(vand(phase_sped, 
sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg->eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg->eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg->eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_shl = vslli(eg_gen, 2); + +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = 
modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi32x4_t sum_a = vpaddlq_s16(chip->og_acc_a); + vi32x4_t sum_b = vpaddlq_s16(chip->og_acc_b); + vi32x4_t sum_c = vpaddlq_s16(chip->og_acc_c); + vi32x4_t sum_d = vpaddlq_s16(chip->og_acc_d); + + sum_a = vaddq_s32(sum_a, vrev64q_s32(sum_a)); + sum_b = vaddq_s32(sum_b, vrev64q_s32(sum_b)); + sum_c = vaddq_s32(sum_c, vrev64q_s32(sum_c)); + sum_d = vaddq_s32(sum_d, vrev64q_s32(sum_d)); + + vi32x2_t tot_a = vadd_s32(vget_low_s32(sum_a), vget_high_s32(sum_a)); + vi32x2_t tot_b = vadd_s32(vget_low_s32(sum_b), vget_high_s32(sum_b)); + vi32x2_t tot_c = vadd_s32(vget_low_s32(sum_c), vget_high_s32(sum_c)); + vi32x2_t tot_d = vadd_s32(vget_low_s32(sum_d), vget_high_s32(sum_d)); + + vi32x2_t tot_ab = vext_s32(tot_a, tot_b, 1); + vi32x2_t tot_cd = vext_s32(tot_c, tot_d, 1); + vi16x4_t sat_abcd = vqmovn_s32(vcombine_s32(tot_ab, tot_cd)); + + vu16x4_t sel_old = vcreate_u16(0x0000FFFF0000FFFFuLL); + vi16x4_t out_abcd = vbsl_s16(sel_old, chip->og_old, sat_abcd); + + chip->og_out = out_abcd; + chip->og_old = sat_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_shs = -7; + int16_t pg_vib_sign = +1; + + if (!(vibpos & 3)) { + pg_vib_shs = +16; + } + else if (vibpos & 1) { + pg_vib_shs -= 1; + } + pg_vib_shs -= (int16_t)(uint16_t)chip->eg_vibshift; + + if (vibpos & 4) { + pg_vib_sign = -1; + } + chip->pg_vib_shs = vset1(pg_vib_shs); + chip->pg_vib_sign = vset1(pg_vib_sign); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + int16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vset1(eg_incstep); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 
0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 4 + sgi = 4; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 6 + sgi = 6; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg1)(chip); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg3)(chip); + + if AYMO_UNLIKELY(chip->process_all_slots) { + // Process slot group 5 + sgi = 5; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 7 + sgi = 7; + aymo_(sg_update)(chip, &chip->sg[sgi]); + } + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) { + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct 
aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = 
(chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, (vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void 
aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? (reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, 
aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = ®_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct 
aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, ®_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 
0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_shl, wave->wg_phase_shl, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, ®_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 0x30) 
& 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_shs = (reg_C0h->fb ? 
-(int16_t)(9u - reg_C0h->fb) : +16); + vinsertv(sg0->wg_fb_shs, fb_shs, sgo); + vinsertv(sg1->wg_fb_shs, fb_shs, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->wg_fb_shs = vset1(16); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_shl = vset1(AYMO_(EG_GEN_SHL_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_shl = vset1(wave->wg_phase_shl); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, 
address, value); + break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return vget_lane_s16(chip->og_out, 0); + case 1u: return vget_lane_s16(chip->og_out, 1); + case 2u: return vget_lane_s16(chip->og_out, 2); + case 3u: return vget_lane_s16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)(void*)y = vget_lane_s32(vreinterpret_s32_s16(chip->og_out), 0); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vst1_s16(y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t s32 = vmovl_s16(chip->og_out); + vf32x2_t f32 = vcvt_f32_s32(vget_low_s32(s32)); + vst1_f32(y, f32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t s32 = vmovl_s16(chip->og_out); + vf32x4_t f32 = vcvtq_f32_s32(s32); + vst1q_f32(y, f32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_ymf262_common.c b/src/aymo_ymf262_common.c new file mode 100644 index 0000000..5fbdc66 --- /dev/null +++ b/src/aymo_ymf262_common.c @@ -0,0 +1,263 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_ymf262_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// TODO: common tables + + +// Exponential look-up table +// Values are pre-multiplied by 2 +const int16_t AYMO_ALIGN(4) aymo_ymf262_exp_x2_table[256 + 4] = +{ + 0x0FF4, 0x0FEA, 0x0FDE, 0x0FD4, 0x0FC8, 0x0FBE, 0x0FB4, 0x0FA8, + 0x0F9E, 0x0F92, 0x0F88, 0x0F7E, 0x0F72, 0x0F68, 0x0F5C, 0x0F52, + 0x0F48, 0x0F3E, 0x0F32, 0x0F28, 0x0F1E, 0x0F14, 0x0F08, 0x0EFE, + 0x0EF4, 0x0EEA, 0x0EE0, 0x0ED4, 0x0ECA, 0x0EC0, 0x0EB6, 0x0EAC, + 0x0EA2, 0x0E98, 0x0E8E, 0x0E84, 0x0E7A, 0x0E70, 0x0E66, 0x0E5C, + 0x0E52, 0x0E48, 0x0E3E, 0x0E34, 0x0E2A, 0x0E20, 0x0E16, 0x0E0C, + 0x0E04, 0x0DFA, 0x0DF0, 0x0DE6, 0x0DDC, 0x0DD2, 0x0DCA, 0x0DC0, + 0x0DB6, 0x0DAC, 0x0DA4, 0x0D9A, 0x0D90, 0x0D88, 0x0D7E, 0x0D74, + 0x0D6A, 0x0D62, 0x0D58, 0x0D50, 0x0D46, 0x0D3C, 0x0D34, 0x0D2A, + 0x0D22, 0x0D18, 0x0D10, 0x0D06, 0x0CFE, 0x0CF4, 0x0CEC, 0x0CE2, + 0x0CDA, 0x0CD0, 0x0CC8, 0x0CBE, 0x0CB6, 0x0CAE, 0x0CA4, 0x0C9C, + 0x0C92, 0x0C8A, 0x0C82, 0x0C78, 0x0C70, 0x0C68, 0x0C60, 0x0C56, + 0x0C4E, 0x0C46, 0x0C3C, 0x0C34, 0x0C2C, 0x0C24, 0x0C1C, 0x0C12, + 0x0C0A, 0x0C02, 0x0BFA, 0x0BF2, 0x0BEA, 0x0BE0, 0x0BD8, 0x0BD0, + 0x0BC8, 0x0BC0, 0x0BB8, 0x0BB0, 0x0BA8, 0x0BA0, 0x0B98, 0x0B90, + 0x0B88, 0x0B80, 0x0B78, 0x0B70, 0x0B68, 0x0B60, 0x0B58, 0x0B50, + 0x0B48, 0x0B40, 0x0B38, 0x0B32, 0x0B2A, 0x0B22, 0x0B1A, 0x0B12, + 0x0B0A, 0x0B02, 0x0AFC, 0x0AF4, 0x0AEC, 0x0AE4, 0x0ADE, 0x0AD6, + 0x0ACE, 0x0AC6, 0x0AC0, 0x0AB8, 0x0AB0, 0x0AA8, 0x0AA2, 0x0A9A, + 0x0A92, 0x0A8C, 0x0A84, 0x0A7C, 0x0A76, 0x0A6E, 0x0A68, 0x0A60, + 0x0A58, 0x0A52, 0x0A4A, 0x0A44, 0x0A3C, 0x0A36, 0x0A2E, 0x0A28, + 0x0A20, 0x0A18, 0x0A12, 0x0A0C, 0x0A04, 0x09FE, 0x09F6, 0x09F0, + 0x09E8, 0x09E2, 0x09DA, 0x09D4, 0x09CE, 0x09C6, 0x09C0, 0x09B8, + 0x09B2, 0x09AC, 0x09A4, 0x099E, 0x0998, 0x0990, 0x098A, 0x0984, + 0x097C, 0x0976, 0x0970, 0x096A, 0x0962, 0x095C, 0x0956, 0x0950, + 0x0948, 0x0942, 0x093C, 0x0936, 0x0930, 0x0928, 0x0922, 0x091C, + 0x0916, 0x0910, 0x090A, 0x0904, 0x08FC, 0x08F6, 0x08F0, 0x08EA, + 0x08E4, 0x08DE, 0x08D8, 0x08D2, 0x08CC, 0x08C6, 0x08C0, 0x08BA, + 0x08B4, 0x08AE, 0x08A8, 0x08A2, 0x089C, 0x0896, 0x0890, 0x088A, + 0x0884, 0x087E, 0x0878, 0x0872, 0x086C, 0x0866, 0x0860, 0x085A, + 0x0854, 0x0850, 0x084A, 0x0844, 0x083E, 0x0838, 0x0832, 0x082C, + 0x0828, 0x0822, 0x081C, 0x0816, 0x0810, 0x080C, 0x0806, 0x0800, + 0x0800, 0x0800, 0x0800, 0x0800 +}; + + +// Logsin look-up table +const int16_t AYMO_ALIGN(4) aymo_ymf262_logsin_table[256 + 4] = +{ + 0x0859, 0x06C3, 0x0607, 0x058B, 0x052E, 0x04E4, 0x04A6, 0x0471, + 0x0443, 0x041A, 0x03F5, 0x03D3, 0x03B5, 0x0398, 0x037E, 0x0365, + 0x034E, 0x0339, 0x0324, 0x0311, 0x02FF, 0x02ED, 0x02DC, 0x02CD, + 0x02BD, 0x02AF, 0x02A0, 0x0293, 0x0286, 0x0279, 0x026D, 0x0261, + 0x0256, 0x024B, 0x0240, 0x0236, 0x022C, 0x0222, 0x0218, 0x020F, + 0x0206, 0x01FD, 0x01F5, 0x01EC, 0x01E4, 0x01DC, 0x01D4, 0x01CD, + 0x01C5, 0x01BE, 0x01B7, 0x01B0, 0x01A9, 0x01A2, 0x019B, 0x0195, + 0x018F, 0x0188, 0x0182, 0x017C, 0x0177, 0x0171, 0x016B, 0x0166, + 0x0160, 0x015B, 0x0155, 0x0150, 0x014B, 0x0146, 0x0141, 0x013C, + 0x0137, 0x0133, 0x012E, 0x0129, 0x0125, 0x0121, 0x011C, 0x0118, + 0x0114, 0x010F, 0x010B, 0x0107, 0x0103, 0x00FF, 0x00FB, 0x00F8, + 0x00F4, 0x00F0, 0x00EC, 0x00E9, 0x00E5, 0x00E2, 0x00DE, 0x00DB, + 0x00D7, 0x00D4, 0x00D1, 0x00CD, 0x00CA, 0x00C7, 0x00C4, 0x00C1, + 0x00BE, 0x00BB, 0x00B8, 0x00B5, 0x00B2, 0x00AF, 0x00AC, 0x00A9, + 0x00A7, 0x00A4, 0x00A1, 0x009F, 0x009C, 0x0099, 0x0097, 0x0094, + 0x0092, 0x008F, 0x008D, 0x008A, 0x0088, 0x0086, 0x0083, 0x0081, + 0x007F, 0x007D, 
0x007A, 0x0078, 0x0076, 0x0074, 0x0072, 0x0070, + 0x006E, 0x006C, 0x006A, 0x0068, 0x0066, 0x0064, 0x0062, 0x0060, + 0x005E, 0x005C, 0x005B, 0x0059, 0x0057, 0x0055, 0x0053, 0x0052, + 0x0050, 0x004E, 0x004D, 0x004B, 0x004A, 0x0048, 0x0046, 0x0045, + 0x0043, 0x0042, 0x0040, 0x003F, 0x003E, 0x003C, 0x003B, 0x0039, + 0x0038, 0x0037, 0x0035, 0x0034, 0x0033, 0x0031, 0x0030, 0x002F, + 0x002E, 0x002D, 0x002B, 0x002A, 0x0029, 0x0028, 0x0027, 0x0026, + 0x0025, 0x0024, 0x0023, 0x0022, 0x0021, 0x0020, 0x001F, 0x001E, + 0x001D, 0x001C, 0x001B, 0x001A, 0x0019, 0x0018, 0x0017, 0x0017, + 0x0016, 0x0015, 0x0014, 0x0014, 0x0013, 0x0012, 0x0011, 0x0011, + 0x0010, 0x000F, 0x000F, 0x000E, 0x000D, 0x000D, 0x000C, 0x000C, + 0x000B, 0x000A, 0x000A, 0x0009, 0x0009, 0x0008, 0x0008, 0x0007, + 0x0007, 0x0007, 0x0006, 0x0006, 0x0005, 0x0005, 0x0005, 0x0004, + 0x0004, 0x0004, 0x0003, 0x0003, 0x0003, 0x0002, 0x0002, 0x0002, + 0x0002, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000 +}; + + +// Word index to Slot index +const int8_t aymo_ymf262_word_to_slot[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 48, 18, 19, 20, 52, + 12, 13, 14, 56, 30, 31, 32, 60, + 3, 4, 5, 49, 21, 22, 23, 53, + 15, 16, 17, 57, 33, 34, 35, 61, + 6, 7, 8, 50, 24, 25, 26, 54, + 42, 43, 44, 58, 36, 37, 38, 62, + 9, 10, 11, 51, 27, 28, 29, 55, + 45, 46, 47, 59, 39, 40, 41, 63 +}; + +// Slot index to Word index +const int8_t aymo_ymf262_slot_to_word[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 16, 17, 18, 32, 33, + 34, 48, 49, 50, 8, 9, 10, 24, + 25, 26, 4, 5, 6, 20, 21, 22, + 36, 37, 38, 52, 53, 54, 12, 13, + 14, 28, 29, 30, 44, 45, 46, 60, + 61, 62, 40, 41, 42, 56, 57, 58, + 3, 19, 35, 51, 7, 23, 39, 55, + 11, 27, 43, 59, 15, 31, 47, 63 +}; + + +// Word index to Channel_2xOP index +const int8_t aymo_ymf262_word_to_ch2x[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 24, 9, 10, 11, 26, + 6, 7, 8, 28, 15, 16, 17, 30, + 0, 1, 2, 24, 9, 10, 11, 26, + 6, 7, 8, 28, 15, 16, 17, 30, + 3, 4, 5, 25, 12, 13, 14, 27, + 21, 22, 23, 29, 18, 19, 20, 31, + 3, 4, 5, 25, 12, 13, 14, 27, + 21, 22, 23, 29, 18, 19, 20, 31 +}; + +// Channel_2xOP index to Word index +const int8_t aymo_ymf262_ch2x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 2][2/* slot */] = +{ + { 0, 16 }, { 1, 17 }, { 2, 18 }, { 32, 48 }, + { 33, 49 }, { 34, 50 }, { 8, 24 }, { 9, 25 }, + { 10, 26 }, { 4, 20 }, { 5, 21 }, { 6, 22 }, + { 36, 52 }, { 37, 53 }, { 38, 54 }, { 12, 28 }, + { 13, 29 }, { 14, 30 }, { 44, 60 }, { 45, 61 }, + { 46, 62 }, { 40, 56 }, { 41, 57 }, { 42, 58 }, + { 3, 19 }, { 35, 51 }, { 7, 23 }, { 39, 55 }, + { 11, 27 }, { 43, 59 }, { 15, 31 }, { 47, 63 } +}; + + +// Word index to Channel_4xOP index +const int8_t aymo_ymf262_word_to_ch4x[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15, + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15, + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15, + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15 +}; + +// Channel_4xOP index to Word index +const int8_t aymo_ymf262_ch4x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 4][4/* slot */] = +{ + { 0, 16, 32, 48 }, { 1, 17, 33, 49 }, + { 2, 18, 34, 50 }, { 4, 20, 36, 52 }, + { 5, 21, 37, 53 }, { 6, 22, 38, 54 }, + { 8, 24, 40, 56 }, { 9, 25, 41, 57 }, + { 10, 26, 42, 58 }, { 12, 28, 44, 60 }, + { 13, 29, 45, 61 }, { 14, 30, 46, 62 }, + { 3, 19, 35, 51 }, { 7, 23, 39, 55 }, + { 11, 27, 43, 59 }, { 15, 31, 47, 63 } +}; + +// Channel_4xOP index to Channel_2xOP index pairs +const 
int8_t aymo_ymf262_ch4x_to_pair[AYMO_YMF262_CHANNEL_NUM_MAX / 2][2/* slot */] = +{ + { 0, 3 }, { 1, 4 }, { 2, 5 }, + { 9, 12 }, { 10, 13 }, { 11, 14 }, + { 6, 21 }, { 7, 22 }, { 8, 23 }, + { 15, 18 }, { 16, 19 }, { 17, 20 }, + { 24, 25 }, { 26, 27 }, { 28, 29 }, { 30, 31 } +}; + +// Paired Channel_2xOP index +const int8_t aymo_ymf262_ch2x_paired[AYMO_YMF262_CHANNEL_NUM_MAX] = +{ + 3, 4, 5, + 0, 1, 2, + 21, 22, 23, + 12, 13, 14, + 9, 10, 11, + 18, 19, 20, + 15, 16, 17, + 6, 7, 8, + 25, 24, 27, 26, + 29, 28, 31, 30 +}; + + +// Sub-address to Slot index +const int8_t aymo_ymf262_subaddr_to_slot[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 3, 4, 5, 48, 49, + 6, 7, 8, 9, 10, 11, 50, 51, + 12, 13, 14, 15, 16, 17, 52, 53, + 36, 37, 38, 39, 40, 41, 54, 55, + + 18, 19, 20, 21, 22, 23, 56, 57, + 24, 25, 26, 27, 28, 29, 58, 59, + 30, 31, 32, 33, 34, 35, 60, 61, + 42, 43, 44, 45, 46, 47, 62, 63 +}; + + +// TODO: slot_to_addr[] + + +// Sub-addres to Channel_2xOP index +const int8_t aymo_ymf262_subaddr_to_ch2x[AYMO_YMF262_CHANNEL_NUM_MAX] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 18, 19, 20, 21, 22, 23, 24, + + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 25, 26, 27, 28, 29, 30, 31 +}; + + +// TODO: ch2x_to_addr[] + + +const int8_t aymo_ymf262_pg_mult_x2_table[16] = +{ + 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 20, 24, 24, 30, 30 +}; + + +const int8_t aymo_ymf262_eg_ksl_table[16] = +{ + 0, 32, 40, 45, 48, 51, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64 +}; + +const int8_t aymo_ymf262_eg_kslsh_table[4] = +{ + 8, 1, 2, 0 +}; + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ymf262_none.c b/src/aymo_ymf262_none.c new file mode 100644 index 0000000..ce3c5e7 --- /dev/null +++ b/src/aymo_ymf262_none.c @@ -0,0 +1,200 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_none.h" + +#include "opl3.h" + +#include <assert.h> + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + OPL3_Reset(&chip->opl3, (uint32_t)AYMO_YMF262_SAMPLE_RATE); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + OPL3_WriteReg(&chip->opl3, address, value); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + // not checked + OPL3_WriteRegBuffered(&chip->opl3, address, value); + return 1; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + // not supported + (void)chip; + (void)count; + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + if (channel < 4u) { + return chip->outs[channel]; + } + return 0; +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = chip->outs[0]; + y[1] = chip->outs[1]; + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = chip->outs[0]; + y[1] = chip->outs[1]; + y[2] = chip->outs[2]; + y[3] = chip->outs[3]; + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = (float)chip->outs[0]; + y[1] = (float)chip->outs[1]; + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = (float)chip->outs[0]; + y[1] = (float)chip->outs[1]; + y[2] = (float)chip->outs[2]; + y[3] = (float)chip->outs[3]; + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END diff --git 
a/src/aymo_ymf262_x86_avx.c b/src/aymo_ymf262_x86_avx.c new file mode 100644 index 0000000..0bdcd88 --- /dev/null +++ b/src/aymo_ymf262_x86_avx.c @@ -0,0 +1,1691 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include +#include "aymo_cpu_x86_sse41_inline.h" // actually using SSE4.1 +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_avx.h" + +#ifdef AYMO_CPU_SUPPORT_X86_AVX + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[8] = +{ + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[8] = +{ + 0, 1, 2, 3, 0, 1, 2, 3 +}; + + +const uint16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (1 << 13)), + ((0 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (0 << 13)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 1, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 8, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } 
+ }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8, + 0xF8, + 0xF8, + 0xFF, + 0xF8, + 0xFF, + 0xF8, + 0xFF +}; + + +// Slot mask output delay for outputs B and D +const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0x88, + 0xF8, + 0x88, + 0xF8, + 0x88, + 0xFF, + 0x88, + 0xFF +}; + + +// Updates phase generator +static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + vi16_t range = vand(fnum, vset1(7 << 7)); + range = vmulihi(range, vand(sg->pg_vib, chip->pg_vib_mulhi)); + range = vsub(vxor(range, chip->pg_vib_neg), chip->pg_vib_neg); // flip sign + fnum = vadd(fnum, range); + + vi32_t zero = vsetz(); + vi32_t fnum_lo = vunpacklo(fnum, zero); + vi32_t fnum_hi = vunpackhi(fnum, zero); + vi32_t block_sll_lo = vunpacklo(cg->pg_block, zero); + vi32_t block_sll_hi = vunpackhi(cg->pg_block, zero); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2, zero); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2, zero); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 1); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 1); + + sg->pg_phase_out = phase; + } +} + + +// 
Updates rhythm manager, slot group 3 +static inline +void aymo_(rm_update_sg3)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[3]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + vi16_t phase = sg->pg_phase_out; + + // Update SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 1); + + // Update TC + uint32_t phase17 = vextract(phase, 2); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 2); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpackus(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_mullo = vblendv(vset1(AYMO_(EG_GEN_MULLO_ATTACK)), sg->eg_gen_mullo, notreset); + vi16_t reg_rate = vu2i(vmululo(vi2u(sg->eg_adsr), vi2u(eg_gen_mullo))); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset, notreset); + vi32_t notreset_hi = vunpackhi(notreset, notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vsubsu(vset1(15), eg_shift)); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vslli(vadd(sg->wg_out, sg->wg_prout), 1); + vi16_t fbsum_sh = vmulihi(fbsum, sg->wg_fb_mulhi); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = 
vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vu16_t rate_lo_muluhi = vi2u(vslli(vpow2m1lt4(rate_lo), 1)); + vi16_t incstep_ge12 = vand(vu2i(vmuluhi(chip->eg_incstep, rate_lo_muluhi)), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = vu2i(vmululo(vi2u(phase), sg->wg_phase_mullo)); + vi16_t phase_gate = vcmpz(vand(phase_sped, sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg_eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg_eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg_eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = 
vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_mullo = vsllv(vset1(1), vslli(eg_gen, 2)); + +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi16x8_t one = _mm_set1_epi16(1); + vi32x4_t tot_a = _mm_madd_epi16(chip->og_acc_a, one); + vi32x4_t tot_b = _mm_madd_epi16(chip->og_acc_b, one); + vi32x4_t tot_c = _mm_madd_epi16(chip->og_acc_c, one); + vi32x4_t tot_d = _mm_madd_epi16(chip->og_acc_d, one); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(2, 3, 0, 1))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(2, 3, 0, 1))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(2, 3, 0, 1))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(2, 3, 0, 1))); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(1, 0, 3, 2))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(1, 0, 3, 2))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(1, 0, 3, 2))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(1, 0, 3, 2))); + + vi32x4_t tot_ab = _mm_blend_epi16(tot_a, tot_b, 0xCC); + vi32x4_t tot_cd = _mm_blend_epi16(tot_c, tot_d, 0x33); + vi32x4_t tot_abcd = _mm_blend_epi16(tot_ab, tot_cd, 0xF0); + vi16x8_t sat_abcd = _mm_packs_epi32(tot_abcd, tot_abcd); + + vi16x8_t old_abcd = _mm_shuffle_epi32(chip->og_out, _MM_SHUFFLE(1, 0, 3, 2)); + vi16x8_t out_abcd = _mm_blend_epi16(old_abcd, sat_abcd, 0xF5); + + chip->og_out = out_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_mulhi = (0x10000 >> 7); + int16_t pg_vib_neg = 0; + + if (!(vibpos & 3)) { + pg_vib_mulhi = 0; + } + else if (vibpos & 
1) { + pg_vib_mulhi >>= 1; + } + pg_vib_mulhi >>= chip->eg_vibshift; + pg_vib_mulhi &= 0x7F80; + + if (vibpos & 4) { + pg_vib_neg = -1; + } + chip->pg_vib_mulhi = vset1(pg_vib_mulhi); + chip->pg_vib_neg = vset1(pg_vib_neg); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + uint16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vi2u(vset1((int16_t)eg_incstep)); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 4 + sgi = 4; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 6 + sgi = 6; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg1)(chip); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg3)(chip); + + if AYMO_UNLIKELY(chip->process_all_slots) { + // Process slot group 5 + sgi = 5; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 7 + sgi = 7; + aymo_(sg_update)(chip, &chip->sg[sgi]); + } + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = 
aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) { + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int 
word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % 
AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, (vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? 
(reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, 
aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = &reg_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, &reg_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + 
vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_mullo, wave->wg_phase_mullo, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, &reg_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 
0x30) & 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_mulhi = (reg_C0h->fb ? 
(0x0040 << reg_C0h->fb) : 0); + vinsertv(sg0->wg_fb_mulhi, fb_mulhi, sgo); + vinsertv(sg1->wg_fb_mulhi, fb_mulhi, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_mullo = vset1(AYMO_(EG_GEN_MULLO_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_mullo = vset1(wave->wg_phase_mullo); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, address, value); + 
break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return _mm_extract_epi16(chip->og_out, 0); + case 1u: return _mm_extract_epi16(chip->og_out, 1); + case 2u: return _mm_extract_epi16(chip->og_out, 2); + case 3u: return _mm_extract_epi16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)y = _mm_cvtsi128_si32(chip->og_out); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + _mm_storel_epi64((void*)y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_storel_pi((void*)y, vf32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_store_ps(y, vf32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX diff --git a/src/aymo_ymf262_x86_avx2.c b/src/aymo_ymf262_x86_avx2.c new file mode 100644 index 0000000..30e19e0 --- /dev/null +++ b/src/aymo_ymf262_x86_avx2.c @@ -0,0 +1,1683 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include +#include "aymo_cpu_x86_avx2_inline.h" +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_avx2.h" + +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[16] = +{ + 0, 0, 0, 0, 1, 1, 1, 1, + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[16] = +{ + 0, 1, 2, 3, 0, 1, 2, 3, + 4, 5, 6, 7, 4, 5, 6, 7 +}; + + +const uint16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (1 << 13)), + ((0 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (0 << 13)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 1, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 8, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } + }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint16_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8F8, + 0xFFF8, + 0xFFF8, + 0xFFF8 +}; + + +// Slot mask output delay for outputs B and D +const uint16_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF888, + 0xF888, + 0xFF88, + 0xFF88 +}; + + +// Updates phase generator +static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + 
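+    // Vibrato: FNUM bits 7..9 are scaled by the global vibrato depth
+    // (pg_vib_mulhi), gated per slot by pg_vib, and sign-flipped on the
+    // negative half of the vibrato cycle (pg_vib_neg) before being added
+    // back to FNUM. The result then goes through the block shift and the
+    // MULT scaling to form the per-slot phase increment (lo/hi halves).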
vi16_t range = vand(fnum, vset1(7 << 7)); + range = vmulihi(range, vand(sg->pg_vib, chip->pg_vib_mulhi)); + range = vsub(vxor(range, chip->pg_vib_neg), chip->pg_vib_neg); // flip sign + fnum = vadd(fnum, range); + + vi32_t zero = vsetz(); + vi32_t fnum_lo = vunpacklo(fnum, zero); + vi32_t fnum_hi = vunpackhi(fnum, zero); + vi32_t block_sll_lo = vunpacklo(cg->pg_block, zero); + vi32_t block_sll_hi = vunpackhi(cg->pg_block, zero); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2, zero); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2, zero); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 0 +static inline +void aymo_(rm_update_sg0)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[0]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 9); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 9); + + sg->pg_phase_out = phase; + } +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); 
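+        // The snare drum phase below is rebuilt from hi-hat bit 8 and the
+        // noise LFSR, while the top cymbal phase is rebuilt from the shared
+        // rm_xor term computed above.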
+ vi16_t phase = sg->pg_phase_out; + + // Update SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 9); + + // Update TC + uint32_t phase17 = vextract(phase, 10); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 10); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpackus(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_mullo = vblendv(vset1(AYMO_(EG_GEN_MULLO_ATTACK)), sg->eg_gen_mullo, notreset); + vi16_t reg_rate = vu2i(vmululo(vi2u(sg->eg_adsr), vi2u(eg_gen_mullo))); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset, notreset); + vi32_t notreset_hi = vunpackhi(notreset, notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vsubsu(vset1(15), eg_shift)); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vslli(vadd(sg->wg_out, sg->wg_prout), 1); + vi16_t fbsum_sh = vmulihi(fbsum, sg->wg_fb_mulhi); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vu16_t rate_lo_muluhi = vi2u(vslli(vpow2m1lt4(rate_lo), 1)); + vi16_t incstep_ge12 = vand(vu2i(vmuluhi(chip->eg_incstep, rate_lo_muluhi)), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = 
vu2i(vmululo(vi2u(phase), sg->wg_phase_mullo)); + vi16_t phase_gate = vcmpz(vand(phase_sped, sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg_eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg_eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg_eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_mullo = vsllv(vset1(1), vslli(eg_gen, 2)); 
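+    // The cached multiplier above equals (1 << (eg_gen * 4)): on the next
+    // tick it moves the ADSR rate nibble of the active envelope stage into
+    // the top nibble of reg_rate.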
+ +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi16x16_t one = _mm256_set1_epi16(1); + vi32x8_t sum_a = _mm256_madd_epi16(chip->og_acc_a, one); + vi32x8_t sum_b = _mm256_madd_epi16(chip->og_acc_b, one); + vi32x8_t sum_c = _mm256_madd_epi16(chip->og_acc_c, one); + vi32x8_t sum_d = _mm256_madd_epi16(chip->og_acc_d, one); + + vi32x4_t sum_a_lo = _mm256_castsi256_si128(sum_a); + vi32x4_t sum_a_hi = _mm256_extracti128_si256(sum_a, 1); + vi32x4_t tot_a = _mm_add_epi32(sum_a_lo, sum_a_hi); + + vi32x4_t sum_b_lo = _mm256_castsi256_si128(sum_b); + vi32x4_t sum_b_hi = _mm256_extracti128_si256(sum_b, 1); + vi32x4_t tot_b = _mm_add_epi32(sum_b_lo, sum_b_hi); + + vi32x4_t sum_c_lo = _mm256_castsi256_si128(sum_c); + vi32x4_t sum_c_hi = _mm256_extracti128_si256(sum_c, 1); + vi32x4_t tot_c = _mm_add_epi32(sum_c_lo, sum_c_hi); + + vi32x4_t sum_d_lo = _mm256_castsi256_si128(sum_d); + vi32x4_t sum_d_hi = _mm256_extracti128_si256(sum_d, 1); + vi32x4_t tot_d = _mm_add_epi32(sum_d_lo, sum_d_hi); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(2, 3, 0, 1))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(2, 3, 0, 1))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(2, 3, 0, 1))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(2, 3, 0, 1))); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(1, 0, 3, 2))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(1, 0, 3, 2))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(1, 0, 3, 2))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(1, 0, 3, 2))); + + vi32x4_t tot_ab = _mm_blend_epi32(tot_a, tot_b, 0xA); + vi32x4_t tot_cd = _mm_blend_epi32(tot_c, tot_d, 0x5); + vi32x4_t tot_abcd = _mm_blend_epi32(tot_ab, tot_cd, 0xC); + vi16x8_t sat_abcd = _mm_packs_epi32(tot_abcd, tot_abcd); + + vi16x8_t old_abcd = _mm_shuffle_epi32(chip->og_out, _MM_SHUFFLE(1, 0, 3, 2)); + vi16x8_t out_abcd = _mm_blend_epi16(old_abcd, sat_abcd, 0xF5); + + chip->og_out = out_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_mulhi = (0x10000 >> 7); + int16_t pg_vib_neg = 0; + + if (!(vibpos & 3)) { + pg_vib_mulhi = 0; + } + else if (vibpos & 1) { + pg_vib_mulhi >>= 1; + } + pg_vib_mulhi >>= chip->eg_vibshift; + pg_vib_mulhi &= 0x7F80; + + if (vibpos & 4) { + pg_vib_neg = -1; + } + chip->pg_vib_mulhi = vset1(pg_vib_mulhi); + 
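+        // Broadcast the sign mask for the negative half of the vibrato
+        // cycle, then recompute every slot group's phase increment with
+        // the new vibrato settings.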
chip->pg_vib_neg = vset1(pg_vib_neg); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + uint16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vi2u(vset1((int16_t)eg_incstep)); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg0)(chip); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg1)(chip); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) 
{ + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + 
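+    // EG_KEY_NORMAL and EG_KEY_DRUM are independent key-on bits; a slot
+    // enters the release state only once both of them have been cleared.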
vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, 
(vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? (reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm 
mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = ®_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + 
*(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, ®_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 
0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_mullo, wave->wg_phase_mullo, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, ®_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 
0x30) & 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_mulhi = (reg_C0h->fb ? 
(0x0040 << reg_C0h->fb) : 0); + vinsertv(sg0->wg_fb_mulhi, fb_mulhi, sgo); + vinsertv(sg1->wg_fb_mulhi, fb_mulhi, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_mullo = vset1(AYMO_(EG_GEN_MULLO_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_mullo = vset1(wave->wg_phase_mullo); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, address, value); + 
break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return _mm_extract_epi16(chip->og_out, 0); + case 1u: return _mm_extract_epi16(chip->og_out, 1); + case 2u: return _mm_extract_epi16(chip->og_out, 2); + case 3u: return _mm_extract_epi16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)y = _mm_cvtsi128_si32(chip->og_out); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + _mm_storel_epi64((void*)y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_storel_pi((void*)y, vf32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_store_ps(y, vf32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/src/aymo_ymf262_x86_sse41.c b/src/aymo_ymf262_x86_sse41.c new file mode 100644 index 0000000..f3eba29 --- /dev/null +++ b/src/aymo_ymf262_x86_sse41.c @@ -0,0 +1,1691 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include +#include "aymo_cpu_x86_sse41_inline.h" +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_sse41.h" + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[8] = +{ + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[8] = +{ + 0, 1, 2, 3, 0, 1, 2, 3 +}; + + +const uint16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (1 << 13)), + ((0 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (0 << 13)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 1, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 8, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } + }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8, + 0xF8, + 0xF8, + 0xFF, + 0xF8, + 0xFF, + 0xF8, + 0xFF +}; + + +// Slot mask output delay for outputs B and D +const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0x88, + 0xF8, + 0x88, + 0xF8, + 0x88, + 0xFF, + 0x88, + 0xFF +}; + + +// Updates phase generator +static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + 
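+    // Vibrato: FNUM bits 7..9 are scaled by the global vibrato depth
+    // (pg_vib_mulhi), gated per slot by pg_vib, and sign-flipped on the
+    // negative half of the vibrato cycle (pg_vib_neg) before being added
+    // back to FNUM. The result then goes through the block shift and the
+    // MULT scaling to form the per-slot phase increment (lo/hi halves).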
vi16_t range = vand(fnum, vset1(7 << 7)); + range = vmulihi(range, vand(sg->pg_vib, chip->pg_vib_mulhi)); + range = vsub(vxor(range, chip->pg_vib_neg), chip->pg_vib_neg); // flip sign + fnum = vadd(fnum, range); + + vi32_t zero = vsetz(); + vi32_t fnum_lo = vunpacklo(fnum, zero); + vi32_t fnum_hi = vunpackhi(fnum, zero); + vi32_t block_sll_lo = vunpacklo(cg->pg_block, zero); + vi32_t block_sll_hi = vunpackhi(cg->pg_block, zero); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2, zero); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2, zero); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 1); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 1); + + sg->pg_phase_out = phase; + } +} + + +// Updates rhythm manager, slot group 3 +static inline +void aymo_(rm_update_sg3)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[3]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + vi16_t phase = sg->pg_phase_out; + + // Update 
SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 1); + + // Update TC + uint32_t phase17 = vextract(phase, 2); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 2); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpackus(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_mullo = vblendv(vset1(AYMO_(EG_GEN_MULLO_ATTACK)), sg->eg_gen_mullo, notreset); + vi16_t reg_rate = vu2i(vmululo(vi2u(sg->eg_adsr), vi2u(eg_gen_mullo))); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset, notreset); + vi32_t notreset_hi = vunpackhi(notreset, notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vsubsu(vset1(15), eg_shift)); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vslli(vadd(sg->wg_out, sg->wg_prout), 1); + vi16_t fbsum_sh = vmulihi(fbsum, sg->wg_fb_mulhi); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vu16_t rate_lo_muluhi = vi2u(vslli(vpow2m1lt4(rate_lo), 1)); + vi16_t incstep_ge12 = vand(vu2i(vmuluhi(chip->eg_incstep, rate_lo_muluhi)), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = vu2i(vmululo(vi2u(phase), sg->wg_phase_mullo)); + 
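// WG: the wave-shape gates used below (wg_phase_zero / wg_phase_flip / wg_phase_mask / wg_phase_neg and wg_sine_gate) come from aymo_(wave_table), selected per slot by register E0h; they reshape the phase index so the shared logsin/exp lookup tables reproduce all eight OPL3 waveforms. + 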
vi16_t phase_gate = vcmpz(vand(phase_sped, sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg_eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg_eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg_eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_mullo = vsllv(vset1(1), vslli(eg_gen, 2)); + +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + 
sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi16x8_t one = _mm_set1_epi16(1); + vi32x4_t tot_a = _mm_madd_epi16(chip->og_acc_a, one); + vi32x4_t tot_b = _mm_madd_epi16(chip->og_acc_b, one); + vi32x4_t tot_c = _mm_madd_epi16(chip->og_acc_c, one); + vi32x4_t tot_d = _mm_madd_epi16(chip->og_acc_d, one); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(2, 3, 0, 1))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(2, 3, 0, 1))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(2, 3, 0, 1))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(2, 3, 0, 1))); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(1, 0, 3, 2))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(1, 0, 3, 2))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(1, 0, 3, 2))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(1, 0, 3, 2))); + + vi32x4_t tot_ab = _mm_blend_epi16(tot_a, tot_b, 0xCC); + vi32x4_t tot_cd = _mm_blend_epi16(tot_c, tot_d, 0x33); + vi32x4_t tot_abcd = _mm_blend_epi16(tot_ab, tot_cd, 0xF0); + vi16x8_t sat_abcd = _mm_packs_epi32(tot_abcd, tot_abcd); + + vi16x8_t old_abcd = _mm_shuffle_epi32(chip->og_out, _MM_SHUFFLE(1, 0, 3, 2)); + vi16x8_t out_abcd = _mm_blend_epi16(old_abcd, sat_abcd, 0xF5); + + chip->og_out = out_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_mulhi = (0x10000 >> 7); + int16_t pg_vib_neg = 0; + + if (!(vibpos & 3)) { + pg_vib_mulhi = 0; + } + else if (vibpos & 1) { + pg_vib_mulhi >>= 1; + } + pg_vib_mulhi >>= chip->eg_vibshift; + pg_vib_mulhi &= 0x7F80; + + if (vibpos & 4) { + pg_vib_neg = -1; + } + chip->pg_vib_mulhi = vset1(pg_vib_mulhi); + chip->pg_vib_neg = vset1(pg_vib_neg); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + uint16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vi2u(vset1((int16_t)eg_incstep)); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 
0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 4 + sgi = 4; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 6 + sgi = 6; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg1)(chip); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg3)(chip); + + if AYMO_UNLIKELY(chip->process_all_slots) { + // Process slot group 5 + sgi = 5; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 7 + sgi = 7; + aymo_(sg_update)(chip, &chip->sg[sgi]); + } + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) { + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct 
aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = 
(chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, (vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void 
aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? (reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, 
aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = &reg_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct 
aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, &reg_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 
0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_mullo, wave->wg_phase_mullo, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, &reg_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 
0x30) & 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_mulhi = (reg_C0h->fb ? 
(0x0040 << reg_C0h->fb) : 0); + vinsertv(sg0->wg_fb_mulhi, fb_mulhi, sgo); + vinsertv(sg1->wg_fb_mulhi, fb_mulhi, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_mullo = vset1(AYMO_(EG_GEN_MULLO_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_mullo = vset1(wave->wg_phase_mullo); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, address, value); + 
break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return _mm_extract_epi16(chip->og_out, 0); + case 1u: return _mm_extract_epi16(chip->og_out, 1); + case 2u: return _mm_extract_epi16(chip->og_out, 2); + case 3u: return _mm_extract_epi16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)y = _mm_cvtsi128_si32(chip->og_out); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + _mm_storel_epi64((void*)y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_storel_pi((void*)y, vf32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_store_ps(y, vf32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/aymo_testing.c b/tests/aymo_testing.c new file mode 100644 index 0000000..5c40e08 --- /dev/null +++ b/tests/aymo_testing.c @@ -0,0 +1,110 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_testing.h" + +#include <assert.h> +#include <ctype.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +AYMO_CXX_EXTERN_C_BEGIN + + +char* aymo_test_args_to_str( + int first, + int last, + char** argv, + const char *prefix, + const char *suffix +) +{ + assert(argv); + + size_t prefix_len = 0u; + if (prefix) { + prefix_len = strlen(prefix); + } + size_t suffix_len = 0u; + if (suffix) { + suffix_len = strlen(suffix); + } + + size_t line_len = (prefix_len + (size_t)(last - first) + suffix_len); + + if (first == 0) { + char* fwd = strrchr(argv[0], '/'); + if (fwd) { + argv[0] = (fwd + 1); + } + char* bwd = strrchr(argv[0], '\\'); + if (bwd) { + argv[0] = (bwd + 1); + } + } + + for (int i = first; i <= last; ++i) { + assert(argv[i]); + size_t arg_len = strlen(argv[i]); + line_len += arg_len; + } + + char *line = malloc(line_len + 1u); + assert(line); + size_t offset = 0u; + + if (prefix) { + memcpy(&line[offset], prefix, prefix_len); + offset += prefix_len; + } + for (int i = first; i <= last; ++i) { + size_t arg_len = strlen(argv[i]); + memcpy(&line[offset], argv[i], arg_len); + offset += arg_len; + if (i < last) { + line[offset++] = '_'; + } + } + if (suffix) { + memcpy(&line[offset], suffix, suffix_len); + offset += suffix_len; + } + + line[offset] = '\0'; + + for (offset = 0u; offset < line_len; ++offset) { + char c = line[offset]; + if (!isgraph(c) || (c == '/') || (c == '\\')) { + line[offset] = '_'; + } + } + + return line; +} + + +void aymo_test_free_args_str(char* line) +{ + free(line); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/aymo_testing.h b/tests/aymo_testing.h new file mode 100644 index 0000000..0860c53 --- /dev/null +++ b/tests/aymo_testing.h @@ -0,0 +1,54 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ +#ifndef _include_aymo_testing_h +#define _include_aymo_testing_h + +#include "aymo_cc.h" + + +// Test exit code status values +#define TEST_STATUS_PASS (0) // Test passed successfully +#define TEST_STATUS_FAIL (1) // Test failed; usually with error text +#define TEST_STATUS_SKIP (77) // Test skipped +#define TEST_STATUS_HARD (99) // Test failed with hard error + + +// Macros to build test name lookup tables +typedef void (*aymo_testing_test_f)(void); // using globals as test status variables + +struct aymo_testing_entry { + const char* name; + aymo_testing_test_f func; +}; + +#define AYMO_TEST_ENTRY(name) { AYMO_STRINGIFY2(name), name } + + +AYMO_PUBLIC char* aymo_test_args_to_str( + int first, + int last, + char** argv, + const char *prefix, + const char *suffix +); +AYMO_PUBLIC void aymo_test_free_args_str(char* line); + + +#endif // _include_aymo_testing_h diff --git a/tests/aymo_testing_epilogue_inline.h b/tests/aymo_testing_epilogue_inline.h new file mode 100644 index 0000000..68eb553 --- /dev/null +++ b/tests/aymo_testing_epilogue_inline.h @@ -0,0 +1,41 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. 
+ +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ + + +int main(int argc, char** argv) +{ + app_return = TEST_STATUS_PASS; + + if (argc != 2) { + fprintf(stderr, "USAGE:\t%s TESTNAME\n", (argc ? argv[0] : "test_exe")); + return TEST_STATUS_HARD; + } + + for (unsigned i = 0; i < AYMO_VECTOR_LENGTH(unit_tests); ++i) { + if (!strcmp(unit_tests[i].name, argv[1])) { + (unit_tests[i].func)(); + break; + } + } + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/meson.build b/tests/meson.build new file mode 100644 index 0000000..6551089 --- /dev/null +++ b/tests/meson.build @@ -0,0 +1,392 @@ +test_includes = include_directories( + '.', +) + +test_common_sources = files( + 'aymo_testing.c', +) + + +test_names = [ +] + +test_names_none = [ + 'test_convert_none', + 'test_tda8425_none_sweep', + 'test_ym7128_none_sweep', + 'test_ymf262_none_compare', +] + + +test_names_x86 = [ +] + +test_names_x86_sse41 = [ + 'test_convert_x86_sse41', + 'test_tda8425_x86_sse41_sweep', + 'test_ym7128_x86_sse41_sweep', + 'test_ymf262_x86_sse41_compare', +] + +test_names_x86_avx2 = [ + 'test_convert_x86_avx2', + 'test_tda8425_x86_avx2_sweep', + 'test_ymf262_x86_avx2_compare', +] + + +test_names_arm = [ +] + +test_names_arm_neon = [ + 'test_convert_arm_neon', + 'test_tda8425_arm_neon_sweep', + 'test_ym7128_arm_neon_sweep', + 'test_ymf262_arm_neon_compare', +] + + +# ===================================================================== + +fs = import('fs') + + +# Generic +foreach test_name : test_names + test_c = '@0@.c'.format(test_name) + if fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + test(test_name, test_exe) + endif +endforeach + + +# x86 specific +if host_cpu_family in ['x86', 'x86_64'] + foreach test_name : test_names_x86 + test_c = '@0@.c'.format(test_name) + if fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + test(test_name, test_exe) + endif + endforeach +endif + + +# ARM specific
if host_cpu_family in ['arm', 'aarch64'] + foreach test_name : test_names_arm + test_c = '@0@.c'.format(test_name) + if fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + test(test_name, test_exe) + endif + endforeach +endif + + +# CPU-ext specific +foreach intr_name : ['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_names = get_variable('test_names_@0@'.format(intr_name)) + intr_args = get_variable('aymo_@0@_args'.format(intr_name), []) + foreach test_name : test_names + test_c = '@0@.c'.format(test_name) + if 
fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + c_args: intr_args, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + set_variable('@0@_exe'.format(test_name), test_exe) + endif + endforeach + endif +endforeach + + +# ===================================================================== +# convert + +# function_name +aymo_convert_suite = [ + 'test_aymo_convert_@0@_i16_f32', + 'test_aymo_convert_@0@_f32_i16', + 'test_aymo_convert_@0@_i16_f32_1', + 'test_aymo_convert_@0@_f32_i16_1', + 'test_aymo_convert_@0@_i16_f32_k', + 'test_aymo_convert_@0@_f32_i16_k', + 'test_aymo_convert_@0@_u16_f32', + 'test_aymo_convert_@0@_f32_u16', + 'test_aymo_convert_@0@_u16_f32_1', + 'test_aymo_convert_@0@_f32_u16_1', + 'test_aymo_convert_@0@_u16_f32_k', + 'test_aymo_convert_@0@_f32_u16_k', +] + +foreach intr_name : ['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_suite = 'test_convert_@0@'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_suite)) + foreach t : aymo_convert_suite + test_name = t.format(intr_name) + test(test_name, test_exe, args: test_name) + endforeach + endif +endforeach + + +# ===================================================================== +# TDA8425 + +seconds = '8' +samplerate = '48000' +aymo_tda8425_sweep_suite = { + 'stereo_ab_1_384000': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '384000', seconds], + 'stereo_ab_1_96000': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '96000', seconds], + 'stereo_ab_1_48000': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '48000', seconds], + 'stereo_ab_1_44100': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '44100', seconds], + + 'volume_lr_max_min': ['0xFF', '0xF0', '0xF6', '0xF6', '0xFF', '0xCE', samplerate, seconds], + 'volume_lr_min_max': ['0xF0', '0xFF', '0xF6', '0xF6', '0xFF', '0xCE', samplerate, seconds], + + 'bass_min': ['0xFC', '0xFC', '0xF0', '0xF6', '0xFF', '0xCE', samplerate, seconds], + 'bass_max': ['0xFC', '0xFC', '0xFF', '0xF6', '0xFF', '0xCE', samplerate, seconds], + + 'treble_min': ['0xFC', '0xFC', '0xF6', '0xF0', '0xFF', '0xCE', samplerate, seconds], + 'treble_max': ['0xFC', '0xFC', '0xF6', '0xFF', '0xFF', '0xCE', samplerate, seconds], + + 'stereo_none_1': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC8', samplerate, seconds], + 'stereo_none_2': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC9', samplerate, seconds], + 'stereo_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCA', samplerate, seconds], + 'stereo_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCC', samplerate, seconds], + 'stereo_1_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', samplerate, seconds], + 'stereo_2_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCB', samplerate, seconds], + 'stereo_2_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCD', samplerate, seconds], + 'stereo_2_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCF', samplerate, seconds], + + 'mono_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC2', samplerate, seconds], + 'mono_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC4', samplerate, seconds], + 'mono_1_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC6', samplerate, seconds], + + 'pseudo_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xD2', samplerate, seconds], + 'pseudo_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xD4', samplerate, seconds], + # TODO: Preset 1 + # TODO: Preset 2 + # TODO: Preset 3 + + 
'spatial_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xDE', samplerate, seconds], + 'spatial_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xDE', samplerate, seconds], + 'spatial_1_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xDE', samplerate, seconds], + + 'mute': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xEE', samplerate, seconds], + + # TODO: T-filter extremes +} + +foreach intr_name : ['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_suite = 'test_tda8425_@0@_sweep'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_suite)) + foreach test_name, test_args : aymo_tda8425_sweep_suite + test_name = ('_'.join([test_suite] + [test_name])).underscorify() + test(test_name, test_exe, args: test_args) + endforeach + endif +endforeach + + +# ===================================================================== +# YM7128 + +seconds = '20' +# name: [seconds, reg...] +aymo_ym7128_sweep_suite = { + 'off': [seconds, + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', + '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00' + ], + 'direct': [seconds, + '0x3F', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x3F', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x3F', '0x00', '0x3F', '0x3F', + '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00' + ], + 'dune/arrakis': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1A', '0x1A', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/baghdad': [seconds, + '0x1F', '0x00', '0x1B', '0x00', '0x17', '0x00', '0x33', '0x00', + '0x00', '0x1D', '0x00', '0x19', '0x00', '0x15', '0x00', '0x11', + '0x1D', '0x1D', '0x1D', '0x1D', + '0x13', '0x13', + '0x06', '0x02', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', + ], + 'dune/morning': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1B', '0x1B', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/sequence': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1C', '0x1C', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/sietch': [seconds, + '0x1F', '0x00', '0x1B', '0x00', '0x17', '0x00', '0x33', '0x00', + '0x00', '0x1D', '0x00', '0x19', '0x00', '0x15', '0x00', '0x11', + '0x1D', '0x1D', '0x1D', '0x1D', + '0x13', '0x13', + '0x06', '0x02', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', + ], + 'dune/warsong': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1C', '0x1C', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/water': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1A', '0x1A', + '0x16', '0x16', + '0x1F', '0x03', '0x07', 
'0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/wormintro': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x18', '0x18', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/wormsuit': [seconds, + '0x18', '0x00', '0x1A', '0x00', '0x1C', '0x00', '0x1E', '0x00', + '0x00', '0x19', '0x00', '0x1B', '0x00', '0x1D', '0x00', '0x1F', + '0x1B', '0x1F', '0x17', '0x17', + '0x12', '0x08', + '0x1F', '0x07', '0x0A', '0x0D', '0x10', '0x13', '0x16', '0x19', '0x1C', + ], + 'gold/cavern': [seconds, + '0x1F', '0x00', '0x1D', '0x00', '0x1B', '0x00', '0x19', '0x00', + '0x20', '0x3E', '0x20', '0x3C', '0x20', '0x3A', '0x20', '0x38', + '0x3C', '0x3E', '0x1C', '0x1C', + '0x11', '0x0A', + '0x12', '0x10', '0x0E', '0x0C', '0x0A', '0x08', '0x06', '0x04', '0x02', + ], + 'gold/chapel': [seconds, + '0x1F', '0x1E', '0x1D', '0x1C', '0x1B', '0x1A', '0x19', '0x18', + '0x3F', '0x3E', '0x3D', '0x3C', '0x3B', '0x3A', '0x39', '0x38', + '0x38', '0x3D', '0x1B', '0x1B', + '0x10', '0x10', + '0x1F', '0x1F', '0x1D', '0x1B', '0x19', '0x17', '0x15', '0x13', '0x11', + ], + 'gold/concert_hall': [seconds, + '0x31', '0x00', '0x15', '0x00', '0x39', '0x00', '0x1D', '0x00', + '0x00', '0x33', '0x00', '0x17', '0x00', '0x3B', '0x00', '0x1F', + '0x1A', '0x1C', '0x1D', '0x1D', + '0x16', '0x16', + '0x1F', '0x1C', '0x19', '0x16', '0x13', '0x10', '0x0D', '0x0A', '0x07', + ], + 'gold/deep_space': [seconds, + '0x18', '0x00', '0x1A', '0x00', '0x1C', '0x00', '0x1E', '0x00', + '0x00', '0x19', '0x00', '0x1B', '0x00', '0x1D', '0x00', '0x1F', + '0x1B', '0x1F', '0x1C', '0x1C', + '0x12', '0x08', + '0x1F', '0x07', '0x0A', '0x0D', '0x10', '0x13', '0x16', '0x19', '0x1C', + ], + 'gold/jazz_club': [seconds, + '0x1F', '0x1B', '0x37', '0x13', '0x2F', '0x0B', '0x27', '0x03', + '0x1F', '0x3B', '0x17', '0x33', '0x0F', '0x2B', '0x07', '0x23', + '0x1C', '0x1F', '0x1B', '0x1B', + '0x0C', '0x0C', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'gold/movie_theater': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1C', '0x1C', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'gold/recital_hall': [seconds, + '0x1F', '0x3E', '0x1D', '0x3C', '0x1B', '0x3A', '0x19', '0x38', + '0x3F', '0x1E', '0x3D', '0x1C', '0x3B', '0x1A', '0x39', '0x18', + '0x18', '0x1C', '0x1C', '0x1C', + '0x15', '0x15', + '0x14', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', '0x12', + ], + 'gold/stadium': [seconds, + '0x1F', '0x00', '0x1B', '0x00', '0x17', '0x00', '0x33', '0x00', + '0x00', '0x1D', '0x00', '0x19', '0x00', '0x15', '0x00', '0x11', + '0x1D', '0x1D', '0x3D', '0x3D', + '0x13', '0x13', + '0x06', '0x02', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', + ], +} + +foreach intr_name : ['none', 'x86_sse41', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_suite = 'test_ym7128_@0@_sweep'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_suite)) + foreach test_name, test_args : aymo_ym7128_sweep_suite + test_name = ('_'.join([test_suite] + [test_name])).underscorify() + test(test_name, test_exe, args: test_args) + endforeach + endif +endforeach + + +# ===================================================================== +# YMF262 + +foreach intr_name : 
['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + # TODO: improve testing scores + test_name = 'test_ymf262_@0@_compare'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_name)) + test(test_name, test_exe, args: ['avd', '../tests/scores/DUNE.avd']) + endif +endforeach diff --git a/tests/test_convert_arm_neon.c b/tests/test_convert_arm_neon.c new file mode 100644 index 0000000..4d02e36 --- /dev/null +++ b/tests/test_convert_arm_neon.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_arm_neon.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_arm_neon_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_arm_neon_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if 
(compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_arm_neon_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_arm_neon_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, 
ref_u16_f32, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_arm_neon_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_arm_neon_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. 
/ K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_u16_k) +}; + + +#include "aymo_testing_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/tests/test_convert_none.c b/tests/test_convert_none.c new file mode 100644 index 0000000..20bf38f --- /dev/null +++ b/tests/test_convert_none.c @@ -0,0 +1,371 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_none.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_none_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_none_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_none_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_none_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, 
(int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_none_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32, ref_n); +} + + +void test_aymo_convert_none_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_none_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], 
DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_none_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_none_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_none_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_none_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_none_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_u16_k) +}; + + +#include "aymo_testing_epilogue_inline.h" diff --git a/tests/test_convert_prologue_inline.h b/tests/test_convert_prologue_inline.h new 
file mode 100644 index 0000000..bdb3a0f --- /dev/null +++ b/tests/test_convert_prologue_inline.h @@ -0,0 +1,296 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ + +#include "aymo_cc.h" + +#include <math.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +AYMO_CXX_EXTERN_C_BEGIN + + +static int app_return; + + +#define ref_n 64u + +
+#undef DIRTY + +#undef x0xmm +#undef x0xMM + +#undef xxmmu +#undef xxMMu + +#undef K +#undef x0xff +#undef x0xFF +#undef x0xfi +#undef x0xFI +#undef x0xfu +#undef x0xFU + + +#define DIRTY (0xCCu) + +#define x0xmm (INT16_MIN) +#define x0xMM (INT16_MAX) + +#define xxmmu (0u) +#define xxMMu (UINT16_MAX) + +#define K (+32768.f) +#define x0xff (-1024.f * K) +#define x0xFF (+1024.f * K) +#define x0xfi ((float)x0xmm) +#define x0xFI ((float)x0xMM) +#define x0xfu ((float)xxmmu) +#define x0xFU ((float)xxMMu) + + +static float buf_f32[ref_n]; +static int16_t buf_i16[ref_n]; +static uint16_t buf_u16[ref_n]; + +
+const int16_t src_i16[ref_n] = { + x0xmm, -0x01, -0x02, +0x03, x0xMM, -0x05, -0x06, +0x07, + -0x10, x0xmm, +0x12, -0x13, -0x14, x0xMM, -0x16, -0x17, + +0x20, +0x21, x0xmm, -0x23, -0x24, -0x25, x0xMM, -0x27, + -0x30, -0x31, +0x32, x0xmm, +0x34, +0x35, +0x36, x0xMM, + x0xMM, -0x41, +0x42, +0x43, x0xmm, +0x45, -0x46, -0x47, + +0x50, x0xMM, -0x52, -0x53, +0x54, x0xmm, +0x56, +0x57, + +0x60, -0x61, x0xMM, +0x63, -0x64, +0x65, x0xmm, -0x67, + -0x70, +0x71, +0x72, x0xMM, +0x74, -0x75, -0x76, x0xmm +}; +
+const float ref_i16_f32[ref_n] = { + x0xfi, -0x01, -0x02, +0x03, x0xFI, -0x05, -0x06, +0x07, + -0x10, x0xfi, +0x12, -0x13, -0x14, x0xFI, -0x16, -0x17, + +0x20, +0x21, x0xfi, -0x23, -0x24, -0x25, x0xFI, -0x27, + -0x30, -0x31, +0x32, x0xfi, +0x34, +0x35, +0x36, x0xFI, + x0xFI, -0x41, +0x42, +0x43, x0xfi, +0x45, -0x46, -0x47, + +0x50, x0xFI, -0x52, -0x53, +0x54, x0xfi, +0x56, +0x57, + +0x60, -0x61, x0xFI, +0x63, -0x64, +0x65, x0xfi, -0x67, + -0x70, +0x71, +0x72, x0xFI, +0x74, -0x75, -0x76, x0xfi +}; +
+const float ref_i16_f32_1[ref_n] = { + x0xfi/K, -0x01/K, -0x02/K, +0x03/K, x0xFI/K, -0x05/K, -0x06/K, +0x07/K, + -0x10/K, x0xfi/K, +0x12/K, -0x13/K, -0x14/K, x0xFI/K, -0x16/K, -0x17/K, + +0x20/K, +0x21/K, x0xfi/K, -0x23/K, -0x24/K, -0x25/K, x0xFI/K, -0x27/K, + -0x30/K, -0x31/K, +0x32/K, x0xfi/K, +0x34/K, +0x35/K, +0x36/K, x0xFI/K, + x0xFI/K, -0x41/K, +0x42/K, +0x43/K, x0xfi/K, +0x45/K, -0x46/K, -0x47/K, + +0x50/K, x0xFI/K, -0x52/K, -0x53/K, +0x54/K, x0xfi/K, +0x56/K, +0x57/K, + +0x60/K, -0x61/K, x0xFI/K, +0x63/K, -0x64/K, +0x65/K, x0xfi/K, -0x67/K, + -0x70/K, +0x71/K, +0x72/K, x0xFI/K, +0x74/K, -0x75/K, -0x76/K, x0xfi/K +}; + +
+const uint16_t src_u16[ref_n] = { + xxmmu, 0x01u, 0x02u, 0x03u, xxMMu, 0x05u, 0x06u, 0x07u, + 0x10u, xxmmu, 0x12u, 0x13u, 0x14u, xxMMu, 0x16u, 0x17u, + 0x20u, 0x21u, xxmmu, 0x23u, 0x24u, 0x25u, xxMMu, 0x27u, + 0x30u, 0x31u, 0x32u, xxmmu, 0x34u, 0x35u, 0x36u, xxMMu, + xxMMu, 0x41u, 0x42u, 0x43u, xxmmu, 0x45u, 0x46u, 0x47u, +
0x50u, xxMMu, 0x52u, 0x53u, 0x54u, xxmmu, 0x56u, 0x57u, + 0x60u, 0x61u, xxMMu, 0x63u, 0x64u, 0x65u, xxmmu, 0x67u, + 0x70u, 0x71u, 0x72u, xxMMu, 0x74u, 0x75u, 0x76u, xxmmu +}; + +const float ref_u16_f32[ref_n] = { + x0xfu, +0x01, +0x02, +0x03, x0xFU, +0x05, +0x06, +0x07, + +0x10, x0xfu, +0x12, +0x13, +0x14, x0xFU, +0x16, +0x17, + +0x20, +0x21, x0xfu, +0x23, +0x24, +0x25, x0xFU, +0x27, + +0x30, +0x31, +0x32, x0xfu, +0x34, +0x35, +0x36, x0xFU, + x0xFU, +0x41, +0x42, +0x43, x0xfu, +0x45, +0x46, +0x47, + +0x50, x0xFU, +0x52, +0x53, +0x54, x0xfu, +0x56, +0x57, + +0x60, +0x61, x0xFU, +0x63, +0x64, +0x65, x0xfu, +0x67, + +0x70, +0x71, +0x72, x0xFU, +0x74, +0x75, +0x76, x0xfu +}; + +const float ref_u16_f32_1[ref_n] = { + x0xfu/K, +0x01/K, +0x02/K, +0x03/K, x0xFU/K, +0x05/K, +0x06/K, +0x07/K, + +0x10/K, x0xfu/K, +0x12/K, +0x13/K, +0x14/K, x0xFU/K, +0x16/K, +0x17/K, + +0x20/K, +0x21/K, x0xfu/K, +0x23/K, +0x24/K, +0x25/K, x0xFU/K, +0x27/K, + +0x30/K, +0x31/K, +0x32/K, x0xfu/K, +0x34/K, +0x35/K, +0x36/K, x0xFU/K, + x0xFU/K, +0x41/K, +0x42/K, +0x43/K, x0xfu/K, +0x45/K, +0x46/K, +0x47/K, + +0x50/K, x0xFU/K, +0x52/K, +0x53/K, +0x54/K, x0xfu/K, +0x56/K, +0x57/K, + +0x60/K, +0x61/K, x0xFU/K, +0x63/K, +0x64/K, +0x65/K, x0xfu/K, +0x67/K, + +0x70/K, +0x71/K, +0x72/K, x0xFU/K, +0x74/K, +0x75/K, +0x76/K, x0xfu/K +}; + + +const float src_f32[ref_n] = { + x0xff, -0x01, -0x02, +0x03, x0xFF, -0x05, -0x06, +0x07, + -0x10, x0xff, +0x12, -0x13, -0x14, x0xFF, -0x16, -0x17, + +0x20, +0x21, x0xff, -0x23, -0x24, -0x25, x0xFF, -0x27, + -0x30, -0x31, +0x32, x0xff, +0x34, +0x35, +0x36, x0xFF, + x0xFF, -0x41, +0x42, +0x43, x0xff, +0x45, -0x46, -0x47, + +0x50, x0xFF, -0x52, -0x53, +0x54, x0xff, +0x56, +0x57, + +0x60, -0x61, x0xFF, +0x63, -0x64, +0x65, x0xff, -0x67, + -0x70, +0x71, +0x72, x0xFF, +0x74, -0x75, -0x76, x0xff +}; + +const int16_t ref_f32_i16[ref_n] = { + x0xmm, -0x01, -0x02, +0x03, x0xMM, -0x05, -0x06, +0x07, + -0x10, x0xmm, +0x12, -0x13, -0x14, x0xMM, -0x16, -0x17, + +0x20, +0x21, x0xmm, -0x23, -0x24, -0x25, x0xMM, -0x27, + -0x30, -0x31, +0x32, x0xmm, +0x34, +0x35, +0x36, x0xMM, + x0xMM, -0x41, +0x42, +0x43, x0xmm, +0x45, -0x46, -0x47, + +0x50, x0xMM, -0x52, -0x53, +0x54, x0xmm, +0x56, +0x57, + +0x60, -0x61, x0xMM, +0x63, -0x64, +0x65, x0xmm, -0x67, + -0x70, +0x71, +0x72, x0xMM, +0x74, -0x75, -0x76, x0xmm +}; + +const uint16_t ref_f32_u16[ref_n] = { + xxmmu, xxmmu, xxmmu, 0x03u, xxMMu, xxmmu, xxmmu, 0x07u, + xxmmu, xxmmu, 0x12u, xxmmu, xxmmu, xxMMu, xxmmu, xxmmu, + 0x20u, 0x21u, xxmmu, xxmmu, xxmmu, xxmmu, xxMMu, xxmmu, + xxmmu, xxmmu, 0x32u, xxmmu, 0x34u, 0x35u, 0x36u, xxMMu, + xxMMu, xxmmu, 0x42u, 0x43u, xxmmu, 0x45u, xxmmu, xxmmu, + 0x50u, xxMMu, xxmmu, xxmmu, 0x54u, xxmmu, 0x56u, 0x57u, + 0x60u, xxmmu, xxMMu, 0x63u, xxmmu, 0x65u, xxmmu, xxmmu, + xxmmu, 0x71u, 0x72u, xxMMu, 0x74u, xxmmu, xxmmu, xxmmu +}; + + +const float src_f32_1[ref_n] = { + x0xff/K, -0x01/K, -0x02/K, +0x03/K, x0xFF/K, -0x05/K, -0x06/K, +0x07/K, + -0x10/K, x0xff/K, +0x12/K, -0x13/K, -0x14/K, x0xFF/K, -0x16/K, -0x17/K, + +0x20/K, +0x21/K, x0xff/K, -0x23/K, -0x24/K, -0x25/K, x0xFF/K, -0x27/K, + -0x30/K, -0x31/K, +0x32/K, x0xff/K, +0x34/K, +0x35/K, +0x36/K, x0xFF/K, + x0xFF/K, -0x41/K, +0x42/K, +0x43/K, x0xff/K, +0x45/K, -0x46/K, -0x47/K, + +0x50/K, x0xFF/K, -0x52/K, -0x53/K, +0x54/K, x0xff/K, +0x56/K, +0x57/K, + +0x60/K, -0x61/K, x0xFF/K, +0x63/K, -0x64/K, +0x65/K, x0xff/K, -0x67/K, + -0x70/K, +0x71/K, +0x72/K, x0xFF/K, +0x74/K, -0x75/K, -0x76/K, x0xff/K +}; + +const int16_t ref_f32_i16_1[ref_n] = { + x0xmm, -0x01, -0x02, +0x03, x0xMM, -0x05, 
-0x06, +0x07, + -0x10, x0xmm, +0x12, -0x13, -0x14, x0xMM, -0x16, -0x17, + +0x20, +0x21, x0xmm, -0x23, -0x24, -0x25, x0xMM, -0x27, + -0x30, -0x31, +0x32, x0xmm, +0x34, +0x35, +0x36, x0xMM, + x0xMM, -0x41, +0x42, +0x43, x0xmm, +0x45, -0x46, -0x47, + +0x50, x0xMM, -0x52, -0x53, +0x54, x0xmm, +0x56, +0x57, + +0x60, -0x61, x0xMM, +0x63, -0x64, +0x65, x0xmm, -0x67, + -0x70, +0x71, +0x72, x0xMM, +0x74, -0x75, -0x76, x0xmm +}; + +const uint16_t ref_f32_u16_1[ref_n] = { + xxmmu, xxmmu, xxmmu, 0x03u, xxMMu, xxmmu, xxmmu, 0x07u, + xxmmu, xxmmu, 0x12u, xxmmu, xxmmu, xxMMu, xxmmu, xxmmu, + 0x20u, 0x21u, xxmmu, xxmmu, xxmmu, xxmmu, xxMMu, xxmmu, + xxmmu, xxmmu, 0x32u, xxmmu, 0x34u, 0x35u, 0x36u, xxMMu, + xxMMu, xxmmu, 0x42u, 0x43u, xxmmu, 0x45u, xxmmu, xxmmu, + 0x50u, xxMMu, xxmmu, xxmmu, 0x54u, xxmmu, 0x56u, 0x57u, + 0x60u, xxmmu, xxMMu, 0x63u, xxmmu, 0x65u, xxmmu, xxmmu, + xxmmu, 0x71u, 0x72u, xxMMu, 0x74u, xxmmu, xxmmu, xxmmu +}; + + +void print_i16(FILE* fp, const int16_t* vp, size_t n) +{ + fprintf(fp, "{ "); + while (n--) { + int i = (int)*vp++; + char sc = ((i < 0) ? '-' : ((i > 0) ? '+' : ' ')); + if (i < 0) i = -i; + fprintf(fp, "%c%04Xh, ", sc, (unsigned)i); + } + fprintf(fp, "}\n"); +} + + +void print_u16(FILE* fp, const uint16_t* vp, size_t n) +{ + fprintf(fp, "{ "); + while (n--) { + fprintf(fp, "%04Xh, ", (unsigned)*vp++); + } + fprintf(fp, "}\n"); +} + + +void print_f32(FILE* fp, const float* vp, size_t n) +{ + fprintf(fp, "{ "); + while (n--) { + fprintf(fp, "%+6.2f, ", *vp++); + } + fprintf(fp, "}\n"); +} + + +const int16_t* compare_i16(const int16_t* bufp, const int16_t* refp, size_t len) +{ + while (len--) { + if (*bufp != *refp) { + return bufp; + } + ++bufp; + ++refp; + } + return NULL; +} + + +const uint16_t* compare_u16(const uint16_t* bufp, const uint16_t* refp, size_t len) +{ + while (len--) { + if (*bufp != *refp) { + return bufp; + } + ++bufp; + ++refp; + } + return NULL; +} + + +const float* compare_f32(const float* bufp, const float* refp, size_t len, float epsilon) +{ + while (len--) { + if (fabsf(*bufp - *refp) > epsilon) { + return bufp; + } + ++bufp; + ++refp; + } + return NULL; +} + + +const void* compare_dirty(const void* bufp, uint8_t refv, size_t size) +{ + const uint8_t* sp = (const uint8_t*)bufp; + const uint8_t* ep = (sp + size); + while (sp != ep) { + if (*sp != refv) { + return sp; + } + ++sp; + } + return NULL; +} diff --git a/tests/test_convert_x86_avx2.c b/tests/test_convert_x86_avx2.c new file mode 100644 index 0000000..666c700 --- /dev/null +++ b/tests/test_convert_x86_avx2.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_avx2.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_x86_avx2_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_x86_avx2_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { 
+ for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_x86_avx2_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { 
+ line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_u16_k) +}; + + 
+#include "aymo_testing_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/tests/test_convert_x86_sse41.c b/tests/test_convert_x86_sse41.c new file mode 100644 index 0000000..8782118 --- /dev/null +++ b/tests/test_convert_x86_sse41.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_sse41.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_x86_sse41_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_x86_sse41_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, 
ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], 
&buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_x86_sse41_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. 
/ K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_u16_k) +}; + + +#include "aymo_testing_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_tda8425_arm_neon_sweep.c b/tests/test_tda8425_arm_neon_sweep.c new file mode 100644 index 0000000..6e593e7 --- /dev/null +++ b/tests/test_tda8425_arm_neon_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_arm_neon.h" + + +#include "test_tda8425_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/tests/test_tda8425_none_sweep.c b/tests/test_tda8425_none_sweep.c new file mode 100644 index 0000000..d0aeaff --- /dev/null +++ b/tests/test_tda8425_none_sweep.c @@ -0,0 +1,27 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_none.h" + + +#include "test_tda8425_sweep_inline.h" diff --git a/tests/test_tda8425_sweep_inline.h b/tests/test_tda8425_sweep_inline.h new file mode 100644 index 0000000..717e9fc --- /dev/null +++ b/tests/test_tda8425_sweep_inline.h @@ -0,0 +1,330 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_cpu.h" +#include "aymo_tda8425.h" +#include "aymo_testing.h" +#include "TDA8425_emu.h" + +#ifdef TEST_FILES +#include "aymo_wave.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef M_PI +#define M_PI (3.14159265358979323846264338327950288) +#endif + +#ifndef INPUT_AMPLITUDE +#define INPUT_AMPLITUDE (.4) +#endif + +#ifndef STDEV_LIMIT +#define STDEV_LIMIT (.0002) +#endif + +#define EMU_AHEAD 4 + +static AYMO_TDA8425_DEFINE_MATH_DEFAULT(tda8425_math); + + +struct app_args { + int argc; + char** argv; + double fs; + double tt; + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; +}; + + +static int app_return; +static struct app_args app_args; + +static TDA8425_Chip emu; +static struct aymo_(chip) chip; + +#ifdef TEST_FILES +static char* in_name; +static char* emu_out_name; +static char* chip_out_name; +static FILE* in_file; +static FILE* emu_out_file; +static FILE* chip_out_file; +#endif // TEST_FILES + + +static int arg2reg(const char* arg, uint8_t* reg) { + errno = 0; + unsigned long x = strtoul(arg, NULL, 0); + if ((x != ULONG_MAX) || (errno != ERANGE)) { + if (x <= UINT8_MAX) { + *reg = (uint8_t)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static int arg2posf(const char* arg, double* val) { + errno = 0; + double x = strtod(arg, NULL); + if (((x != +HUGE_VAL) && (x != -HUGE_VAL)) || (errno != ERANGE)) { + if (x > 0.) { + *val = (float)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static void app_boot(void) +{ + aymo_cpu_boot(); + aymo_tda8425_boot(&tda8425_math); + + app_return = TEST_STATUS_PASS; + memset(&app_args, 0, sizeof(app_args)); + + TDA8425_Chip_Ctor(&emu); +} + + +static void app_setup(void) +{ + TDA8425_Chip_Setup( + &emu, + (TDA8425_Float)app_args.fs, + (TDA8425_Float)TDA8425_Pseudo_C1_Table[TDA8425_Pseudo_Preset_1], + (TDA8425_Float)TDA8425_Pseudo_C2_Table[TDA8425_Pseudo_Preset_1], + TDA8425_Tfilter_Mode_Disabled + ); + TDA8425_Chip_Reset(&emu); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_VL, app_args.reg_vl); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_VR, app_args.reg_vr); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_BA, app_args.reg_ba); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_TR, app_args.reg_tr); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_SF, app_args.reg_sf); + TDA8425_Chip_Start(&emu); + + aymo_(ctor)(&chip, (float)app_args.fs); + aymo_(write)(&chip, 0x00u, app_args.reg_vl); + aymo_(write)(&chip, 0x01u, app_args.reg_vr); + aymo_(write)(&chip, 0x02u, app_args.reg_ba); + aymo_(write)(&chip, 0x03u, app_args.reg_tr); + aymo_(write)(&chip, 0x07u, app_args.reg_pp); + aymo_(write)(&chip, 0x08u, app_args.reg_sf); + +#ifdef TEST_FILES + in_name = aymo_test_args_to_str(0, (app_args.argc - 1), app_args.argv, "", "_in.wav"); + emu_out_name = aymo_test_args_to_str(0, (app_args.argc - 1), app_args.argv, "", "_emu_out.wav"); + chip_out_name = aymo_test_args_to_str(0, (app_args.argc - 1), app_args.argv, "", "_chip_out.wav"); + assert(in_name); + assert(emu_out_name); + assert(chip_out_name); + fprintf(stderr, "in_name: \"%s\"\n", in_name); + fprintf(stderr, "emu_out_name: \"%s\"\n", emu_out_name); + fprintf(stderr, "chip_out_name: \"%s\"\n", chip_out_name); + + in_file = fopen(in_name, "wb"); + emu_out_file = fopen(emu_out_name, "wb"); + chip_out_file = fopen(chip_out_name, "wb"); + assert(in_file); + assert(emu_out_file); + 
assert(chip_out_file); + + double fs = app_args.fs; + double T = app_args.tt; + uint32_t N = (uint32_t)fmax(16., (fs * T)); + uint16_t fmt = AYMO_WAVE_FMT_TYPE_FLOAT; + struct aymo_wave_heading wavh; + aymo_wave_heading_setup(&wavh, fmt, 2u, 32u, (uint32_t)app_args.fs, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, in_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 32u, (uint32_t)app_args.fs, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, emu_out_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 32u, (uint32_t)app_args.fs, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, chip_out_file); +#endif // TEST_FILES +} + + +static void app_teardown(void) +{ + TDA8425_Chip_Stop(&emu); + TDA8425_Chip_Dtor(&emu); + + aymo_(dtor)(&chip); + +#ifdef TEST_FILES + fclose(in_file); + fclose(emu_out_file); + fclose(chip_out_file); +#endif // TEST_FILES +} + + +static void app_run(void) +{ + double fs = app_args.fs; + double T = app_args.tt; + long N = (long)fmax(16., (fs * T)); + double f0 = 10.; + double f1 = fmin((fs / 2.), 21000.); + + TDA8425_Chip_Process_Data emu_data; + memset(&emu_data, 0, sizeof(emu_data)); + float emu_y[EMU_AHEAD][2] = {{0}}; + float chip_x[2] = {0}; + float chip_y[2] = {0}; + double sum_el = 0.; + double sum_eel = 0.; + double sum_er = 0.; + double sum_eer = 0.; + long k; + + for (k = 0; k < N; ++k) { + double t = ((double)k / fs); + double th = ((2. * M_PI * f0 * T) * (pow((f1 / f0), (t / T)) - 1.) / log(f1 / f0)); + th = fmod(th, (2. * M_PI)); + float xl = (float)(INPUT_AMPLITUDE * sin(th)); + float xr = (float)(INPUT_AMPLITUDE * cos(th)); + + emu_data.inputs[0][0] = (TDA8425_Float)xl; + emu_data.inputs[0][1] = (TDA8425_Float)xr; + chip_x[0] = xl; + chip_x[1] = xr; + + TDA8425_Chip_Process(&emu, &emu_data); + for (int i = (EMU_AHEAD - 1); i > 0; --i) { + emu_y[i][0] = emu_y[i-1][0]; + emu_y[i][1] = emu_y[i-1][1]; + } + emu_y[0][0] = (float)emu_data.outputs[0]; + emu_y[0][1] = (float)emu_data.outputs[1]; + + aymo_(process_f32)(&chip, 1u, chip_x, chip_y); + + double el = (emu_y[EMU_AHEAD-1][0] - chip_y[0]); + double er = (emu_y[EMU_AHEAD-1][1] - chip_y[1]); + sum_el += el; + sum_er += er; + sum_eel += (el * el); + sum_eer += (er * er); + +#ifdef TEST_FILES + if (in_file) { + fwrite(chip_x, sizeof(float), 2, in_file); + } + if (emu_out_file) { + fwrite(emu_y[EMU_AHEAD-1], sizeof(float), 2, emu_out_file); + } + if (chip_out_file) { + fwrite(chip_y, sizeof(float), 2, chip_out_file); + } +#endif // TEST_FILES + } + + double avg_el = (sum_el / (double)k); + double avg_er = (sum_er / (double)k); + double avg_eel = (sum_eel / (double)k); + double avg_eer = (sum_eer / (double)k); + double var_el = fabs(avg_eel - (avg_el * avg_el)); + double var_er = fabs(avg_eer - (avg_er * avg_er)); + double stdev_el = sqrt(var_el); + double stdev_er = sqrt(var_er); + + fprintf(stderr, "L: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_el, N, k, sum_el, sum_eel); + fprintf(stderr, "R: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_er, N, k, sum_er, sum_eer); + + if ((stdev_el > STDEV_LIMIT) || (stdev_er > STDEV_LIMIT)) { + app_return = TEST_STATUS_FAIL; + } +} + + +int main(int argc, char** argv) +{ + app_boot(); + + app_args.argc = argc; + app_args.argv = argv; + + if (argc != 9) { + fprintf(stderr, "USAGE:\t%s VL VR BA TR PP SF fs tt\n", (argc ?
argv[0] : "test_exe")); + app_return = TEST_STATUS_HARD; + goto catch_; + } + + if (arg2reg(argv[1], &app_args.reg_vl)) { + perror("VL"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[2], &app_args.reg_vr)) { + perror("VR"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[3], &app_args.reg_ba)) { + perror("BA"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[4], &app_args.reg_tr)) { + perror("TR"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[5], &app_args.reg_pp)) { + perror("PP"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[6], &app_args.reg_sf)) { + perror("SF"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2posf(argv[7], &app_args.fs)) { + perror("fs"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2posf(argv[8], &app_args.tt)) { + perror("tt"); app_return = TEST_STATUS_HARD; goto catch_; + } + + app_setup(); + app_run(); + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/test_tda8425_x86_avx2_sweep.c b/tests/test_tda8425_x86_avx2_sweep.c new file mode 100644 index 0000000..ddaec79 --- /dev/null +++ b/tests/test_tda8425_x86_avx2_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_avx2.h" + + +#include "test_tda8425_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/tests/test_tda8425_x86_sse41_sweep.c b/tests/test_tda8425_x86_sse41_sweep.c new file mode 100644 index 0000000..fd33de9 --- /dev/null +++ b/tests/test_tda8425_x86_sse41_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_sse41.h" + + +#include "test_tda8425_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_ym7128_arm_neon_sweep.c b/tests/test_ym7128_arm_neon_sweep.c new file mode 100644 index 0000000..6c8b9b0 --- /dev/null +++ b/tests/test_ym7128_arm_neon_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_arm_neon.h" + + +#include "test_ym7128_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/tests/test_ym7128_none_sweep.c b/tests/test_ym7128_none_sweep.c new file mode 100644 index 0000000..da770c0 --- /dev/null +++ b/tests/test_ym7128_none_sweep.c @@ -0,0 +1,27 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_none.h" + + +#include "test_ym7128_sweep_inline.h" diff --git a/tests/test_ym7128_sweep_inline.h b/tests/test_ym7128_sweep_inline.h new file mode 100644 index 0000000..defa42d --- /dev/null +++ b/tests/test_ym7128_sweep_inline.h @@ -0,0 +1,316 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_cpu.h" +#include "aymo_testing.h" +#include "aymo_ym7128.h" +#include "YM7128B_emu.h" + +#ifdef TEST_FILES +#include "aymo_wave.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef M_PI +#define M_PI (3.14159265358979323846264338327950288) +#endif + +#ifndef DUTY_RATIO +#define DUTY_RATIO (.8) +#endif + +#ifndef INPUT_AMPLITUDE +#define INPUT_AMPLITUDE (.25) +#endif + +#ifndef STDEV_LIMIT +#define STDEV_LIMIT (.1) +#endif + + +struct test_args { + int argc; + char** argv; + double tt; + uint8_t regs[YM7128B_Reg_Count]; +}; + + +static int app_return; +static struct test_args test_args; + +static YM7128B_ChipFixed emu; +static struct aymo_(chip) chip; + +#ifdef TEST_FILES +static char* in_name; +static char* emu_out_name; +static char* chip_out_name; +static FILE* in_file; +static FILE* emu_out_file; +static FILE* chip_out_file; +#endif // TEST_FILES + + +static int arg2reg(const char* arg, uint8_t* reg) { + errno = 0; + unsigned long x = strtoul(arg, NULL, 0); + if ((x != ULONG_MAX) || (errno != ERANGE)) { + if (x <= UINT8_MAX) { + *reg = (uint8_t)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static int arg2posf(const char* arg, double* val) { + errno = 0; + double x = strtod(arg, NULL); + if (((x != +HUGE_VAL) && (x != -HUGE_VAL)) || (errno != ERANGE)) { + if (x > 0.) { + *val = (float)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static void app_boot(void) +{ + aymo_cpu_boot(); + aymo_ym7128_boot(); + + app_return = TEST_STATUS_PASS; + memset(&test_args, 0, sizeof(test_args)); + + YM7128B_ChipFixed_Ctor(&emu); +} + + +static void app_setup(void) +{ + YM7128B_ChipFixed_Reset(&emu); + for (int i = 0; i < YM7128B_Reg_Count; ++i) { + YM7128B_ChipFixed_Write(&emu, (YM7128B_Address)i, test_args.regs[i]); + } + + aymo_(ctor)(&chip); + for (int i = 0; i < AYMO_YM7128_REG_COUNT; ++i) { + aymo_(write)(&chip, (uint16_t)i, test_args.regs[i]); + } + +#ifdef TEST_FILES + in_name = aymo_test_args_to_str(0, (test_args.argc - 1), test_args.argv, "", "_in.wav"); + emu_out_name = aymo_test_args_to_str(0, (test_args.argc - 1), test_args.argv, "", "_emu_out.wav"); + chip_out_name = aymo_test_args_to_str(0, (test_args.argc - 1), test_args.argv, "", "_chip_out.wav"); + assert(in_name); + assert(emu_out_name); + assert(chip_out_name); + fprintf(stderr, "in_name: \"%s\"\n", in_name); + fprintf(stderr, "emu_out_name: \"%s\"\n", emu_out_name); + fprintf(stderr, "chip_out_name: \"%s\"\n", chip_out_name); + + in_file = fopen(in_name, "wb"); + emu_out_file = fopen(emu_out_name, "wb"); + chip_out_file = fopen(chip_out_name, "wb"); + assert(in_file); + assert(emu_out_file); + assert(chip_out_file); + + double fs = (double)YM7128B_Input_Rate; + double T = (test_args.tt * DUTY_RATIO); + uint32_t N = (uint32_t)fmax(16., (fs * (T / DUTY_RATIO))); + uint16_t fmt = AYMO_WAVE_FMT_TYPE_PCM; + struct aymo_wave_heading wavh; + aymo_wave_heading_setup(&wavh, fmt, 1u, 16u, (uint32_t)YM7128B_Input_Rate, (1u * N)); + fwrite(&wavh, sizeof(wavh), 1u, in_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 16u, (uint32_t)YM7128B_Output_Rate, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, emu_out_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 16u, (uint32_t)YM7128B_Output_Rate, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, chip_out_file); +#endif // TEST_FILES +} + + +static void app_teardown(void) +{ + aymo_(dtor)(&chip); + +#ifdef TEST_FILES + fclose(in_file); + fclose(emu_out_file); + 
fclose(chip_out_file); +#endif // TEST_FILES +} + + +static void app_run(void) +{ + double fs = (double)YM7128B_Input_Rate; + double T = (test_args.tt * DUTY_RATIO); + long N = (long)fmax(16., (fs * (T / DUTY_RATIO))); + double f0 = 10.; + double f1 = fmin((fs / 2.), 21000.); + + YM7128B_ChipFixed_Process_Data emu_data; + memset(&emu_data, 0, sizeof(emu_data)); + int16_t chip_x[1] = {0}; + int16_t chip_y[4] = {0}; + double sum_e0l = 0.; + double sum_ee0l = 0.; + double sum_e0r = 0.; + double sum_ee0r = 0.; + double sum_e1l = 0.; + double sum_ee1l = 0.; + double sum_e1r = 0.; + double sum_ee1r = 0.; + long k; + + for (k = 0; k < N; ++k) { + double xx = 0.; + if ((double)k < ((double)N * DUTY_RATIO)) { + double t = ((double)k / fs); + double th = ((2. * M_PI * f0 * T) * (pow((f1 / f0), (t / T)) - 1.) / log(f1 / f0)); + th = fmod(th, (2. * M_PI)); + xx = (INPUT_AMPLITUDE * cos(th)); + } + int16_t x = (int16_t)(xx * (double)YM7128B_Fixed_Max); + + emu_data.inputs[0] = x; + chip_x[0] = x; + + YM7128B_ChipFixed_Process(&emu, &emu_data); + + aymo_(process_i16)(&chip, 1u, chip_x, chip_y); + + double e0l = ((double)emu_data.outputs[0][0] - (double)chip_y[0]); + double e0r = ((double)emu_data.outputs[1][0] - (double)chip_y[1]); + double e1l = ((double)emu_data.outputs[0][1] - (double)chip_y[2]); + double e1r = ((double)emu_data.outputs[1][1] - (double)chip_y[3]); + sum_e0l += e0l; sum_ee0l += (e0l * e0l); + sum_e0r += e0r; sum_ee0r += (e0r * e0r); + sum_e1l += e1l; sum_ee1l += (e1l * e1l); + sum_e1r += e1r; sum_ee1r += (e1r * e1r); + +#ifdef TEST_FILES + if (in_file) { + fwrite(chip_x, sizeof(int16_t), 1, in_file); + } + if (emu_out_file) { + fwrite(&emu_data.outputs[0][0], sizeof(int16_t), 1, emu_out_file); + fwrite(&emu_data.outputs[1][0], sizeof(int16_t), 1, emu_out_file); + fwrite(&emu_data.outputs[0][1], sizeof(int16_t), 1, emu_out_file); + fwrite(&emu_data.outputs[1][1], sizeof(int16_t), 1, emu_out_file); + } + if (chip_out_file) { + fwrite(chip_y, sizeof(int16_t), 4, chip_out_file); + } +#endif // TEST_FILES + } + + double avg_e0l = (sum_e0l / (double)k); + double avg_ee0l = (sum_ee0l / (double)k); + double var_e0l = fabs(avg_ee0l - (avg_e0l * avg_e0l)); + double stdev_e0l = sqrt(var_e0l); + fprintf(stderr, "L0: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e0l, N, k, sum_e0l, sum_ee0l); + + double avg_e0r = (sum_e0r / (double)k); + double avg_ee0r = (sum_ee0r / (double)k); + double var_e0r = fabs(avg_ee0r - (avg_e0r * avg_e0r)); + double stdev_e0r = sqrt(var_e0r); + fprintf(stderr, "R0: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e0r, N, k, sum_e0r, sum_ee0r); + + double avg_e1l = (sum_e1l / (double)k); + double avg_ee1l = (sum_ee1l / (double)k); + double var_e1l = fabs(avg_ee1l - (avg_e1l * avg_e1l)); + double stdev_e1l = sqrt(var_e1l); + fprintf(stderr, "L1: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e1l, N, k, sum_e1l, sum_ee1l); + + double avg_e1r = (sum_e1r / (double)k); + double avg_ee1r = (sum_ee1r / (double)k); + double var_e1r = fabs(avg_ee1r - (avg_e1r * avg_e1r)); + double stdev_e1r = sqrt(var_e1r); + fprintf(stderr, "R1: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e1r, N, k, sum_e1r, sum_ee1r); + + double stdev_e = sqrt(var_e0l + var_e0r + var_e1l + var_e1r); + if (stdev_e > STDEV_LIMIT) { + app_return = TEST_STATUS_FAIL; + } +} + + +int main(int argc, char** argv) +{ + const int argo = 2; + + app_boot(); + + test_args.argc = argc; + test_args.argv = argv; + + if ((argc < argo) || (argc > (argo + AYMO_YM7128_REG_COUNT))) { + fprintf(stderr, "USAGE:\t%s 
tt REGn...\n", (argc ? argv[0] : "test_exe")); + app_return = TEST_STATUS_HARD; + goto catch_; + } + + if (arg2posf(argv[1], &test_args.tt)) { + perror("tt"); app_return = TEST_STATUS_HARD; goto catch_; + } + + int arge = (argo + AYMO_YM7128_REG_COUNT); + if (arge > argc) { + arge = argc; + } + for (int i = argo; i < arge; ++i) { + if (arg2reg(argv[i], &test_args.regs[i - argo])) { + char text[16]; sprintf(text, "%d", i); + perror(text); app_return = TEST_STATUS_HARD; goto catch_; + } + } + + app_setup(); + app_run(); + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/test_ym7128_x86_sse41_sweep.c b/tests/test_ym7128_x86_sse41_sweep.c new file mode 100644 index 0000000..e74a0df --- /dev/null +++ b/tests/test_ym7128_x86_sse41_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_x86_sse41.h" + + +#include "test_ym7128_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_ymf262_arm_neon_compare.c b/tests/test_ymf262_arm_neon_compare.c new file mode 100644 index 0000000..6b83ada --- /dev/null +++ b/tests/test_ymf262_arm_neon_compare.c @@ -0,0 +1,77 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_arm_neon.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + // TODO: + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + // TODO: + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + // TODO: + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" diff --git a/tests/test_ymf262_compare_epilogue_inline.h b/tests/test_ymf262_compare_epilogue_inline.h new file mode 100644 index 0000000..e67d121 --- /dev/null +++ b/tests/test_ymf262_compare_epilogue_inline.h @@ -0,0 +1,165 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" + +#include + + +static int app_boot(void) +{ + app_return = TEST_STATUS_HARD; + + aymo_cpu_boot(); + + score_data = NULL; + score_size = 0u; + memset(&score, 0, sizeof(score)); + + memset(&nuked_chip, 0, sizeof(nuked_chip)); + memset(&nuked_out, 0, sizeof(nuked_out)); + memset(&aymo_chip, 0, sizeof(aymo_chip)); + + return TEST_STATUS_PASS; +} + + +static int app_args_init(int argc, char** argv) +{ + memset(&app_args, 0, sizeof(app_args)); + + app_args.argc = argc; + app_args.argv = argv; + + app_args.score_type = aymo_score_type_unknown; + + return TEST_STATUS_PASS; +} + + +static int app_args_parse(void) +{ + if (app_args.argc != 3) { + fprintf(stderr, "USAGE:\t%s SCORETYPE SCOREPATH\n", + (app_args.argc ? 
app_args.argv[0] : "test_exe")); + return TEST_STATUS_HARD; + } + + app_args.score_type_cstr = app_args.argv[1]; + app_args.score_type = aymo_score_ext_to_type(app_args.score_type_cstr); + if (app_args.score_type == aymo_score_type_unknown) { + fprintf(stderr, "Unsupported score format: %s\n", app_args.score_type_cstr); + return TEST_STATUS_HARD; + } + + app_args.score_path_cstr = app_args.argv[2]; + + return TEST_STATUS_PASS; +} + + +static int app_setup(void) +{ + score.base.vt = aymo_score_type_to_vt(app_args.score_type); + aymo_score_ctor(&score.base); + + if (aymo_file_load(app_args.score_path_cstr, &score_data, &score_size)) { + perror("aymo_file_load()"); + return TEST_STATUS_HARD; + } + + assert(score_size <= UINT32_MAX); + if (aymo_score_load(&score.base, score_data, (uint32_t)score_size)) { + fprintf(stderr, "Cannot load score: %s\n", app_args.argv[2]); + return TEST_STATUS_HARD; + } + + OPL3_Reset(&nuked_chip, (uint32_t)AYMO_YMF262_SAMPLE_RATE); + aymo_(ctor)(&aymo_chip); + + return TEST_STATUS_PASS; +} + + +static void app_teardown(void) +{ + aymo_(dtor)(&aymo_chip); + + if (score.base.vt) { + aymo_score_unload(&score.base); + aymo_score_dtor(&score.base); + } + aymo_file_unload(score_data); + score_data = NULL; +} + + +static int app_run(void) +{ + struct aymo_score_status* status = aymo_score_get_status(&score.base); + + while (!(status->flags & AYMO_SCORE_FLAG_EOF)) { + if (compare_chips()) { + fprintf(stderr, "Chips do not match\n"); + return TEST_STATUS_FAIL; + } + + aymo_score_tick(&score.base, 1u); + + if (status->flags & AYMO_SCORE_FLAG_EVENT) { + OPL3_WriteReg(&nuked_chip, status->address, status->value); + aymo_(write)(&aymo_chip, status->address, status->value); + } + + OPL3_Generate4Ch(&nuked_chip, &nuked_out[0]); + aymo_(tick)(&aymo_chip, 1u); + } + return TEST_STATUS_PASS; +} + + +int main(int argc, char** argv) +{ + app_return = app_boot(); + if (app_return) goto catch_; + + app_return = app_args_init(argc, argv); + if (app_return) goto catch_; + + app_return = app_args_parse(); + if (app_return) goto catch_; + + app_return = app_setup(); + if (app_return) goto catch_; + + app_return = app_run(); + if (app_return) goto catch_; + + app_return = TEST_STATUS_PASS; + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/test_ymf262_compare_prologue_inline.h b/tests/test_ymf262_compare_prologue_inline.h new file mode 100644 index 0000000..6828031 --- /dev/null +++ b/tests/test_ymf262_compare_prologue_inline.h @@ -0,0 +1,77 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_file.h" +#include "aymo_score_dro.h" +#include "aymo_score_avd.h" +#include "aymo_score_imf.h" +#include "aymo_testing.h" +#include "aymo_ymf262.h" + +#include "opl3.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +struct app_args { + int argc; + char** argv; + + // Score parameters + const char* score_path_cstr; // NULL or "-" for stdin + const char* score_type_cstr; // NULL uses score file extension + enum aymo_score_type score_type; +}; + + +// copied from opl3.c +static const uint8_t mt[16] = { + 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 20, 24, 24, 30, 30 +}; + + +static int app_return; + +static struct app_args app_args; + +static void* score_data; +static size_t score_size; +static union app_scores { + struct aymo_score_instance base; + struct aymo_score_avd_instance avd; + struct aymo_score_dro_instance dro; + struct aymo_score_imf_instance imf; +} score; + +static struct aymo_(chip) aymo_chip; +static opl3_chip nuked_chip; +static int16_t nuked_out[4]; + + +#undef assert +#define assert(x) { \ + if (!(x)) { \ + fprintf(stderr, "@ %d: FAILED assert(%s)\n", \ + __LINE__, (#x)); goto catch_; \ + } \ +}// diff --git a/tests/test_ymf262_none_compare.c b/tests/test_ymf262_none_compare.c new file mode 100644 index 0000000..f2337fc --- /dev/null +++ b/tests/test_ymf262_none_compare.c @@ -0,0 +1,73 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_none.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + // TODO: + + return 0; +//catch_: +// return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + // TODO: + + return 0; +//catch_: +// return 1; +} + + +static int compare_chips(void) +{ + // TODO: + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + assert(!compare_ch2xs(ch2x)); + } + + for (int slot = 0; slot < 36; ++slot) { + assert(!compare_slots(slot)); + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" diff --git a/tests/test_ymf262_x86_avx2_compare.c b/tests/test_ymf262_x86_avx2_compare.c new file mode 100644 index 0000000..198042f --- /dev/null +++ b/tests/test_ymf262_x86_avx2_compare.c @@ -0,0 +1,170 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_cpu_x86_avx2_inline.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_avx2.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + int word = aymo_ymf262_slot_to_word[slot_]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(slot_group)* sg = &aymo_chip.sg[sgi]; + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_slot* slot = &nuked_chip.slot[slot_]; + (void)cg; + + // TODO: Commented stuff + assert((int16_t)vextractn(sg->wg_out, sgo) == slot->out); + assert((int16_t)vextractn(sg->wg_fb_mulhi, sgo) == (int16_t)(slot->channel->fb ? (0x40 << slot->channel->fb) : 0)); +// assert(vextractn(sg->wg_fbmod, sgo) == slot->fbmod); +// assert(vextractn(sg->wg_mod, sgo) == *slot->mod); + assert((int16_t)vextractn(sg->wg_prout, sgo) == slot->prout); + assert((uint16_t)vextractn(sg->eg_rout, sgo) == slot->eg_rout); + assert((uint16_t)vextractn(sg->eg_out, sgo) == slot->eg_out); +// assert(vextractn(sg->eg_inc, sgo) == slot->eg_inc); + assert((uint16_t)vextractn(sg->eg_gen, sgo) == slot->eg_gen); +// assert(vextractn(sg->eg_rate, sgo) == slot->eg_rate); +// assert(vextractn(sg->eg_ksl, sgo) == slot->eg_ksl); + assert((int16_t)vextractn(sg->eg_tremolo_am, sgo) == *slot->trem); + assert((uint16_t)-vextractn(sg->pg_vib, sgo) == slot->reg_vib); + //assert(vextractn(sg->eg_egt, sgo) == slot->reg_type); + //assert(vextractn(sg->eg_ksr, sgo) == slot->reg_ksr); + assert((uint16_t)vextractn(sg->pg_mult_x2, sgo) == mt[slot->reg_mult]); +//FIXME: assert((uint16_t)vextractn(sg->eg_tl_x4, sgo) == slot->reg_tl * 4U); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 12) & 15) == slot->reg_ar); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 8) & 15) == slot->reg_dr); + assert((uint16_t)vextractn(sg->eg_sl, sgo) == slot->reg_sl); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 0) & 15) == slot->reg_rr); + //assert(vextractn(sg->wg_wf, sgo) == slot->reg_wf); + assert((uint16_t)vextractn(sg->eg_key, sgo) == slot->key); + vi32_t pg_phase_vv = (aymo_(sgo_side)[sgo] ? 
sg->pg_phase_hi : sg->pg_phase_lo); + uint32_t pg_phase = vvextractn(pg_phase_vv, aymo_(sgo_cell)[sgo]); + assert(pg_phase == slot->pg_phase); + assert((uint16_t)vextractn(sg->pg_phase_out, sgo) == slot->pg_phase_out); + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + int word = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_channel* channel = &nuked_chip.channel[ch2x]; + + // TODO: Commented stuff + //int16_t* out[0]; + //int16_t* out[1]; + //int16_t* out[2]; + //int16_t* out[3]; + //int32_t leftpan; + //int32_t rightpan; + //uint8_t chtype; + assert((uint16_t)vextractn(cg->pg_fnum, sgo) == channel->f_num); + assert((uint16_t)vextractn(cg->pg_block, sgo) == channel->block); + //uint8_t fb; // compared at slot group level + //uint8_t con; + //uint8_t alg; + assert((uint16_t)vextractn(cg->eg_ksv, sgo) == channel->ksv); + assert((uint16_t)vextractn(cg->og_ch_gate_a, sgo) == channel->cha); + assert((uint16_t)vextractn(cg->og_ch_gate_b, sgo) == channel->chb); + assert((uint16_t)vextractn(cg->og_ch_gate_c, sgo) == channel->chc); + assert((uint16_t)vextractn(cg->og_ch_gate_d, sgo) == channel->chd); + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + _mm_sfence(); + + // TODO: Commented stuff + assert((uint16_t)aymo_chip.tm_timer == nuked_chip.timer); + assert(aymo_chip.eg_timer == nuked_chip.eg_timer); + assert(aymo_chip.eg_timerrem == nuked_chip.eg_timerrem); + assert(aymo_chip.eg_state == nuked_chip.eg_state); + assert((uint16_t)vextractn(aymo_chip.eg_add, 0) == nuked_chip.eg_add); + //uint8_t newm; + //uint8_t nts; + //uint8_t rhy; + assert(aymo_chip.pg_vibpos == nuked_chip.vibpos); + assert(aymo_chip.eg_vibshift == nuked_chip.vibshift); + //assert((uint16_t)vextractn(aymo_chip.eg_tremolo, 0) == nuked_chip.tremolo); + assert(aymo_chip.eg_tremolopos == nuked_chip.tremolopos); + assert(aymo_chip.eg_tremoloshift == nuked_chip.tremoloshift); + assert(aymo_chip.ng_noise == nuked_chip.noise); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 0) == nuked_out[0]); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 1) == nuked_out[1]); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 2) == nuked_out[2]); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 3) == nuked_out[3]); + assert(aymo_chip.rm_hh_bit2 == nuked_chip.rm_hh_bit2); + assert(aymo_chip.rm_hh_bit3 == nuked_chip.rm_hh_bit3); + assert(aymo_chip.rm_hh_bit7 == nuked_chip.rm_hh_bit7); + assert(aymo_chip.rm_hh_bit8 == nuked_chip.rm_hh_bit8); + assert(aymo_chip.rm_tc_bit3 == nuked_chip.rm_tc_bit3); + assert(aymo_chip.rm_tc_bit5 == nuked_chip.rm_tc_bit5); + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/tests/test_ymf262_x86_avx_compare.c b/tests/test_ymf262_x86_avx_compare.c new file mode 100644 index 0000000..bcc1d3a --- /dev/null +++ b/tests/test_ymf262_x86_avx_compare.c @@ -0,0 +1,170 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_cpu_x86_sse41_inline.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_sse41.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + int word = aymo_ymf262_slot_to_word[slot_]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(slot_group)* sg = &aymo_chip.sg[sgi]; + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_slot* slot = &nuked_chip.slot[slot_]; + (void)cg; + + // TODO: Commented stuff + assert((int16_t)vextractn(sg->wg_out, sgo) == slot->out); + assert((int16_t)vextractn(sg->wg_fb_mulhi, sgo) == (int16_t)(slot->channel->fb ? (0x40 << slot->channel->fb) : 0)); +// assert(vextractn(sg->wg_fbmod, sgo) == slot->fbmod); +// assert(vextractn(sg->wg_mod, sgo) == *slot->mod); + assert((int16_t)vextractn(sg->wg_prout, sgo) == slot->prout); + assert((uint16_t)vextractn(sg->eg_rout, sgo) == slot->eg_rout); + assert((uint16_t)vextractn(sg->eg_out, sgo) == slot->eg_out); +// assert(vextractn(sg->eg_inc, sgo) == slot->eg_inc); + assert((uint16_t)vextractn(sg->eg_gen, sgo) == slot->eg_gen); +// assert(vextractn(sg->eg_rate, sgo) == slot->eg_rate); +// assert(vextractn(sg->eg_ksl, sgo) == slot->eg_ksl); + assert((int16_t)vextractn(sg->eg_tremolo_am, sgo) == *slot->trem); + assert((uint16_t)-vextractn(sg->pg_vib, sgo) == slot->reg_vib); + //assert(vextractn(sg->eg_egt, sgo) == slot->reg_type); + //assert(vextractn(sg->eg_ksr, sgo) == slot->reg_ksr); + assert((uint16_t)vextractn(sg->pg_mult_x2, sgo) == mt[slot->reg_mult]); + assert((uint16_t)vextractn(sg->eg_tl_x4, sgo) == slot->reg_tl * 4U); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 12) & 15) == slot->reg_ar); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 8) & 15) == slot->reg_dr); + assert((uint16_t)vextractn(sg->eg_sl, sgo) == slot->reg_sl); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 0) & 15) == slot->reg_rr); + //assert(vextractn(sg->wg_wf, sgo) == slot->reg_wf); + assert((uint16_t)vextractn(sg->eg_key, sgo) == slot->key); + vi32_t pg_phase_vv = (aymo_(sgo_side)[sgo] ? 
sg->pg_phase_hi : sg->pg_phase_lo); + uint32_t pg_phase = vvextractn(pg_phase_vv, aymo_(sgo_cell)[sgo]); + assert(pg_phase == slot->pg_phase); + assert((uint16_t)vextractn(sg->pg_phase_out, sgo) == slot->pg_phase_out); + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + int word = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_channel* channel = &nuked_chip.channel[ch2x]; + + // TODO: Commented stuff + //int16_t* out[0]; + //int16_t* out[1]; + //int16_t* out[2]; + //int16_t* out[3]; + //int32_t leftpan; + //int32_t rightpan; + //uint8_t chtype; + assert((uint16_t)vextractn(cg->pg_fnum, sgo) == channel->f_num); + assert((uint16_t)vextractn(cg->pg_block, sgo) == channel->block); + //uint8_t fb; // compared at slot group level + //uint8_t con; + //uint8_t alg; + assert((uint16_t)vextractn(cg->eg_ksv, sgo) == channel->ksv); + assert((uint16_t)vextractn(cg->og_ch_gate_a, sgo) == channel->cha); + assert((uint16_t)vextractn(cg->og_ch_gate_b, sgo) == channel->chb); + assert((uint16_t)vextractn(cg->og_ch_gate_c, sgo) == channel->chc); + assert((uint16_t)vextractn(cg->og_ch_gate_d, sgo) == channel->chd); + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + _mm_sfence(); + + // TODO: Commented stuff + assert((uint16_t)aymo_chip.tm_timer == nuked_chip.timer); + assert(aymo_chip.eg_timer == nuked_chip.eg_timer); + assert(aymo_chip.eg_timerrem == nuked_chip.eg_timerrem); + assert(aymo_chip.eg_state == nuked_chip.eg_state); + assert((uint16_t)vextract(aymo_chip.eg_add, 0) == nuked_chip.eg_add); + //uint8_t newm; + //uint8_t nts; + //uint8_t rhy; + assert(aymo_chip.pg_vibpos == nuked_chip.vibpos); + assert(aymo_chip.eg_vibshift == nuked_chip.vibshift); + //assert((uint16_t)vextractn(aymo_chip.eg_tremolo, 0) == nuked_chip.tremolo); + assert(aymo_chip.eg_tremolopos == nuked_chip.tremolopos); + assert(aymo_chip.eg_tremoloshift == nuked_chip.tremoloshift); + assert(aymo_chip.ng_noise == nuked_chip.noise); + assert((int16_t)vextract(aymo_chip.og_out, 0) == nuked_out[0]); + assert((int16_t)vextract(aymo_chip.og_out, 1) == nuked_out[1]); + assert((int16_t)vextract(aymo_chip.og_out, 2) == nuked_out[2]); + assert((int16_t)vextract(aymo_chip.og_out, 3) == nuked_out[3]); + assert(aymo_chip.rm_hh_bit2 == nuked_chip.rm_hh_bit2); + assert(aymo_chip.rm_hh_bit3 == nuked_chip.rm_hh_bit3); + assert(aymo_chip.rm_hh_bit7 == nuked_chip.rm_hh_bit7); + assert(aymo_chip.rm_hh_bit8 == nuked_chip.rm_hh_bit8); + assert(aymo_chip.rm_tc_bit3 == nuked_chip.rm_tc_bit3); + assert(aymo_chip.rm_tc_bit5 == nuked_chip.rm_tc_bit5); + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_ymf262_x86_sse41_compare.c b/tests/test_ymf262_x86_sse41_compare.c new file mode 100644 index 0000000..2d6f4ec --- /dev/null +++ b/tests/test_ymf262_x86_sse41_compare.c @@ -0,0 +1,170 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_cpu_x86_sse41_inline.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_sse41.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + int word = aymo_ymf262_slot_to_word[slot_]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(slot_group)* sg = &aymo_chip.sg[sgi]; + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_slot* slot = &nuked_chip.slot[slot_]; + (void)cg; + + // TODO: Commented stuff + assert((int16_t)vextractn(sg->wg_out, sgo) == slot->out); + assert((int16_t)vextractn(sg->wg_fb_mulhi, sgo) == (int16_t)(slot->channel->fb ? (0x40 << slot->channel->fb) : 0)); +// assert(vextractn(sg->wg_fbmod, sgo) == slot->fbmod); +// assert(vextractn(sg->wg_mod, sgo) == *slot->mod); + assert((int16_t)vextractn(sg->wg_prout, sgo) == slot->prout); + assert((uint16_t)vextractn(sg->eg_rout, sgo) == slot->eg_rout); + assert((uint16_t)vextractn(sg->eg_out, sgo) == slot->eg_out); +// assert(vextractn(sg->eg_inc, sgo) == slot->eg_inc); + assert((uint16_t)vextractn(sg->eg_gen, sgo) == slot->eg_gen); +// assert(vextractn(sg->eg_rate, sgo) == slot->eg_rate); +// assert(vextractn(sg->eg_ksl, sgo) == slot->eg_ksl); + assert((int16_t)vextractn(sg->eg_tremolo_am, sgo) == *slot->trem); + assert((uint16_t)-vextractn(sg->pg_vib, sgo) == slot->reg_vib); + //assert(vextractn(sg->eg_egt, sgo) == slot->reg_type); + //assert(vextractn(sg->eg_ksr, sgo) == slot->reg_ksr); + assert((uint16_t)vextractn(sg->pg_mult_x2, sgo) == mt[slot->reg_mult]); +//FIXME: assert((uint16_t)vextractn(sg->eg_tl_x4, sgo) == slot->reg_tl * 4U); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 12) & 15) == slot->reg_ar); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 8) & 15) == slot->reg_dr); + assert((uint16_t)vextractn(sg->eg_sl, sgo) == slot->reg_sl); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 0) & 15) == slot->reg_rr); + //assert(vextractn(sg->wg_wf, sgo) == slot->reg_wf); + assert((uint16_t)vextractn(sg->eg_key, sgo) == slot->key); + vi32_t pg_phase_vv = (aymo_(sgo_side)[sgo] ? 
sg->pg_phase_hi : sg->pg_phase_lo); + uint32_t pg_phase = vvextractn(pg_phase_vv, aymo_(sgo_cell)[sgo]); + assert(pg_phase == slot->pg_phase); + assert((uint16_t)vextractn(sg->pg_phase_out, sgo) == slot->pg_phase_out); + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + int word = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_channel* channel = &nuked_chip.channel[ch2x]; + + // TODO: Commented stuff + //int16_t* out[0]; + //int16_t* out[1]; + //int16_t* out[2]; + //int16_t* out[3]; + //int32_t leftpan; + //int32_t rightpan; + //uint8_t chtype; + assert((uint16_t)vextractn(cg->pg_fnum, sgo) == channel->f_num); + assert((uint16_t)vextractn(cg->pg_block, sgo) == channel->block); + //uint8_t fb; // compared at slot group level + //uint8_t con; + //uint8_t alg; + assert((uint16_t)vextractn(cg->eg_ksv, sgo) == channel->ksv); + assert((uint16_t)vextractn(cg->og_ch_gate_a, sgo) == channel->cha); + assert((uint16_t)vextractn(cg->og_ch_gate_b, sgo) == channel->chb); + assert((uint16_t)vextractn(cg->og_ch_gate_c, sgo) == channel->chc); + assert((uint16_t)vextractn(cg->og_ch_gate_d, sgo) == channel->chd); + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + _mm_sfence(); + + // TODO: Commented stuff + assert((uint16_t)aymo_chip.tm_timer == nuked_chip.timer); + assert(aymo_chip.eg_timer == nuked_chip.eg_timer); + assert(aymo_chip.eg_timerrem == nuked_chip.eg_timerrem); + assert(aymo_chip.eg_state == nuked_chip.eg_state); + assert((uint16_t)vextract(aymo_chip.eg_add, 0) == nuked_chip.eg_add); + //uint8_t newm; + //uint8_t nts; + //uint8_t rhy; + assert(aymo_chip.pg_vibpos == nuked_chip.vibpos); + assert(aymo_chip.eg_vibshift == nuked_chip.vibshift); + //assert((uint16_t)vextractn(aymo_chip.eg_tremolo, 0) == nuked_chip.tremolo); + assert(aymo_chip.eg_tremolopos == nuked_chip.tremolopos); + assert(aymo_chip.eg_tremoloshift == nuked_chip.tremoloshift); + assert(aymo_chip.ng_noise == nuked_chip.noise); + assert((int16_t)vextract(aymo_chip.og_out, 0) == nuked_out[0]); + assert((int16_t)vextract(aymo_chip.og_out, 1) == nuked_out[1]); + assert((int16_t)vextract(aymo_chip.og_out, 2) == nuked_out[2]); + assert((int16_t)vextract(aymo_chip.og_out, 3) == nuked_out[3]); + assert(aymo_chip.rm_hh_bit2 == nuked_chip.rm_hh_bit2); + assert(aymo_chip.rm_hh_bit3 == nuked_chip.rm_hh_bit3); + assert(aymo_chip.rm_hh_bit7 == nuked_chip.rm_hh_bit7); + assert(aymo_chip.rm_hh_bit8 == nuked_chip.rm_hh_bit8); + assert(aymo_chip.rm_tc_bit3 == nuked_chip.rm_tc_bit3); + assert(aymo_chip.rm_tc_bit5 == nuked_chip.rm_tc_bit5); + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41
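
For reference, the SSE4.1 convert tests above all follow the same sub-window pattern: fill the destination with a DIRTY byte, convert only the [si, ei) range, then check that the converted samples match the reference while every byte before si and after ei still holds the canary. Below is a minimal, self-contained sketch of that pattern; the names convert_i16_f32, guard_clobbered and REF_N are illustrative stand-ins, not part of the AYMO API, and the scalar loop stands in for the SIMD aymo_(...) kernels under test.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define DIRTY 0xAAu
    #define REF_N 32u

    /* Scalar stand-in for the SIMD conversion kernel under test. */
    static void convert_i16_f32(unsigned n, const int16_t* src, float* dst)
    {
        for (unsigned i = 0u; i < n; ++i) {
            dst[i] = ((float)src[i] * (float)(1. / 32768.));
        }
    }

    /* Returns nonzero if any byte of a guard region lost its DIRTY canary. */
    static int guard_clobbered(const void* ptr, size_t size)
    {
        const uint8_t* bytes = (const uint8_t*)ptr;
        for (size_t i = 0u; i < size; ++i) {
            if (bytes[i] != (uint8_t)DIRTY) {
                return 1;
            }
        }
        return 0;
    }

    int main(void)
    {
        int16_t src[REF_N];
        float buf[REF_N];
        for (unsigned i = 0u; i < REF_N; ++i) {
            src[i] = (int16_t)(((int)i * 1000) - 16000);
        }

        /* Exercise every [si, ei) sub-window, like the unit tests above. */
        for (unsigned si = 0u; si < REF_N; ++si) {
            for (unsigned ei = si; ei < REF_N; ++ei) {
                memset(buf, (int)DIRTY, sizeof(buf));
                convert_i16_f32((ei - si), &src[si], &buf[si]);

                /* Anything outside the window must keep the canary. */
                if (guard_clobbered(&buf[0], (si * sizeof(buf[0]))) ||
                    guard_clobbered(&buf[ei], ((REF_N - ei) * sizeof(buf[0])))) {
                    fprintf(stderr, "overrun: si=%u ei=%u\n", si, ei);
                    return 1;
                }
            }
        }
        puts("all windows clean");
        return 0;
    }

Sweeping every start/end pair is what catches the SIMD-specific failure modes (head/tail handling and partial vectors), which a single full-buffer conversion would miss.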
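
The TDA8425 and YM7128 sweep harnesses drive the reference emulator and the AYMO chip with the same exponential (logarithmic) sine sweep from f0 to f1 over T seconds, accumulate the per-sample error and squared error, and fail when the resulting standard deviation exceeds STDEV_LIMIT. The following compact sketch shows only that bookkeeping; the two identical sine signals and the limit value are placeholders for the emulator/chip output pair, while the chirp phase formula and the mean / mean-square statistics mirror app_run() above.

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI (3.14159265358979323846264338327950288)
    #endif

    int main(void)
    {
        const double fs = 48000.;       /* sample rate [Hz] */
        const double T = 1.;            /* sweep duration [s] */
        const double f0 = 10.;          /* start frequency [Hz] */
        const double f1 = 21000.;       /* stop frequency [Hz] */
        const long N = (long)(fs * T);
        const double limit = .0002;     /* placeholder for STDEV_LIMIT */

        double sum_e = 0.;              /* running sum of errors */
        double sum_ee = 0.;             /* running sum of squared errors */
        long k;

        for (k = 0; k < N; ++k) {
            double t = ((double)k / fs);
            /* Exponential chirp phase, as in the sweep harnesses above. */
            double th = ((2. * M_PI * f0 * T) * (pow((f1 / f0), (t / T)) - 1.) / log(f1 / f0));
            th = fmod(th, (2. * M_PI));

            double reference = sin(th);   /* stand-in for the scalar emulator output */
            double under_test = sin(th);  /* stand-in for the AYMO chip output */

            double e = (reference - under_test);
            sum_e += e;
            sum_ee += (e * e);  /* must accumulate, or the variance below is meaningless */
        }

        double avg_e = (sum_e / (double)k);
        double avg_ee = (sum_ee / (double)k);
        double stdev_e = sqrt(fabs(avg_ee - (avg_e * avg_e)));
        fprintf(stderr, "stdev_e=%g\n", stdev_e);
        return (stdev_e > limit);
    }

Comparing a standard deviation instead of exact samples tolerates the small rounding differences between the floating-point emulator and the vectorized implementation, while still flagging any systematic divergence across the whole frequency range.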
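
The YMF262 compare tests validate the vectorized chip state against Nuked OPL3 one lane at a time, pulling individual elements out of the SIMD registers with vextractn() / _mm_extract_epi16() and asserting them against the scalar emulator's fields. A tiny illustration of that extraction step follows, using plain SSE2 intrinsics and a local reference array as a hypothetical stand-in for the opl3_slot fields; it is not AYMO code, only the lane-versus-scalar comparison idea.

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Scalar reference values, standing in for the Nuked OPL3 fields. */
        int16_t reference[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };

        /* Vectorized state, standing in for one AYMO slot-group register. */
        __m128i lanes = _mm_loadu_si128((const __m128i*)reference);

        /* The lane index must be a compile-time constant for _mm_extract_epi16(). */
        int16_t lane3 = (int16_t)_mm_extract_epi16(lanes, 3);
        if (lane3 != reference[3]) {
            fprintf(stderr, "lane 3 mismatch: %d != %d\n", lane3, reference[3]);
            return 1;
        }
        puts("lane 3 matches");
        return 0;
    }

In the real tests the lane index comes from the slot-to-word mapping tables, so a variable-index wrapper such as vextractn() is used instead of a bare intrinsic call.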