From c08117828fcd446b303f69ac7ba094160ec6d4d6 Mon Sep 17 00:00:00 2001 From: Andrea Zoppi Date: Mon, 11 Mar 2024 23:13:30 +0100 Subject: [PATCH] Added initial source code snapshot --- apps/aymo_ymf262_play.c | 575 +++++++ apps/meson.build | 22 + aymo.pc.in | 15 + contrib/meson.build | 26 + doc/.gitkeep | 0 include/aymo.h | 33 + include/aymo_cc.h | 189 +++ include/aymo_convert.h | 54 + include/aymo_convert_arm_neon.h | 66 + include/aymo_convert_none.h | 64 + include/aymo_convert_x86_avx2.h | 66 + include/aymo_convert_x86_sse41.h | 66 + include/aymo_cpu.h | 55 + include/aymo_cpu_arm.h | 43 + include/aymo_cpu_arm_neon.h | 53 + include/aymo_cpu_arm_neon_inline.h | 382 +++++ include/aymo_cpu_x86.h | 47 + include/aymo_cpu_x86_avx2.h | 46 + include/aymo_cpu_x86_avx2_inline.h | 428 +++++ include/aymo_cpu_x86_sse41.h | 46 + include/aymo_cpu_x86_sse41_inline.h | 426 +++++ include/aymo_file.h | 42 + include/aymo_score.h | 145 ++ include/aymo_score_avd.h | 89 + include/aymo_score_dro.h | 166 ++ include/aymo_score_imf.h | 131 ++ include/aymo_score_raw.h | 99 ++ include/aymo_sys_linux.h | 25 + include/aymo_sys_windows.h | 25 + include/aymo_tda8425.h | 45 + include/aymo_tda8425_arm_neon.h | 107 ++ include/aymo_tda8425_common.h | 84 + include/aymo_tda8425_none.h | 61 + include/aymo_tda8425_x86_avx2.h | 100 ++ include/aymo_tda8425_x86_sse41.h | 106 ++ include/aymo_wave.h | 85 + include/aymo_ym7128.h | 46 + include/aymo_ym7128_arm_neon.h | 93 + include/aymo_ym7128_common.h | 118 ++ include/aymo_ym7128_none.h | 61 + include/aymo_ym7128_x86_sse41.h | 93 + include/aymo_ymf262.h | 56 + include/aymo_ymf262_arm_neon.h | 333 ++++ include/aymo_ymf262_common.h | 230 +++ include/aymo_ymf262_none.h | 79 + include/aymo_ymf262_x86_avx.h | 333 ++++ include/aymo_ymf262_x86_avx2.h | 332 ++++ include/aymo_ymf262_x86_sse41.h | 332 ++++ include/meson.build | 11 + meson.build | 688 ++++++++ meson_options.txt | 8 + msvc-arm.txt | 14 + msvc-arm_env.bat | 4 + src/aymo.c | 35 + src/aymo_convert.c | 206 +++ src/aymo_convert_arm_neon.c | 821 +++++++++ src/aymo_convert_none.c | 177 ++ src/aymo_convert_x86_avx2.c | 335 ++++ src/aymo_convert_x86_sse41.c | 796 +++++++++ src/aymo_cpu.c | 38 + src/aymo_cpu_arm.c | 61 + src/aymo_cpu_x86.c | 119 ++ src/aymo_empty.c | 1 + src/aymo_file.c | 133 ++ src/aymo_score.c | 153 ++ src/aymo_score_avd.c | 174 ++ src/aymo_score_dro.c | 376 +++++ src/aymo_score_imf.c | 266 +++ src/aymo_score_raw.c | 231 +++ src/aymo_sys_linux.c | 19 + src/aymo_sys_windows.c | 71 + src/aymo_tda8425.c | 172 ++ src/aymo_tda8425_arm_neon.c | 504 ++++++ src/aymo_tda8425_common.c | 150 ++ src/aymo_tda8425_none.c | 148 ++ src/aymo_tda8425_x86_avx2.c | 499 ++++++ src/aymo_tda8425_x86_sse41.c | 512 ++++++ src/aymo_wave.c | 79 + src/aymo_ym7128.c | 148 ++ src/aymo_ym7128_arm_neon.c | 270 +++ src/aymo_ym7128_common.c | 192 +++ src/aymo_ym7128_none.c | 130 ++ src/aymo_ym7128_x86_sse41.c | 270 +++ src/aymo_ymf262.c | 250 +++ src/aymo_ymf262_arm_neon.c | 1688 ++++++++++++++++++ src/aymo_ymf262_common.c | 263 +++ src/aymo_ymf262_none.c | 200 +++ src/aymo_ymf262_x86_avx.c | 1691 +++++++++++++++++++ src/aymo_ymf262_x86_avx2.c | 1683 ++++++++++++++++++ src/aymo_ymf262_x86_sse41.c | 1691 +++++++++++++++++++ tests/aymo_testing.c | 110 ++ tests/aymo_testing.h | 54 + tests/aymo_testing_epilogue_inline.h | 41 + tests/meson.build | 392 +++++ tests/test_convert_arm_neon.c | 376 +++++ tests/test_convert_none.c | 371 ++++ tests/test_convert_prologue_inline.h | 296 ++++ tests/test_convert_x86_avx2.c | 376 +++++ tests/test_convert_x86_sse41.c | 376 
+++++ tests/test_tda8425_arm_neon_sweep.c | 31 + tests/test_tda8425_none_sweep.c | 27 + tests/test_tda8425_sweep_inline.h | 330 ++++ tests/test_tda8425_x86_avx2_sweep.c | 31 + tests/test_tda8425_x86_sse41_sweep.c | 31 + tests/test_ym7128_arm_neon_sweep.c | 31 + tests/test_ym7128_none_sweep.c | 27 + tests/test_ym7128_sweep_inline.h | 316 ++++ tests/test_ym7128_x86_sse41_sweep.c | 31 + tests/test_ymf262_arm_neon_compare.c | 77 + tests/test_ymf262_compare_epilogue_inline.h | 165 ++ tests/test_ymf262_compare_prologue_inline.h | 77 + tests/test_ymf262_none_compare.c | 73 + tests/test_ymf262_x86_avx2_compare.c | 170 ++ tests/test_ymf262_x86_avx_compare.c | 170 ++ tests/test_ymf262_x86_sse41_compare.c | 170 ++ 115 files changed, 25544 insertions(+) create mode 100644 apps/aymo_ymf262_play.c create mode 100644 apps/meson.build create mode 100644 aymo.pc.in create mode 100644 contrib/meson.build create mode 100644 doc/.gitkeep create mode 100644 include/aymo.h create mode 100644 include/aymo_cc.h create mode 100644 include/aymo_convert.h create mode 100644 include/aymo_convert_arm_neon.h create mode 100644 include/aymo_convert_none.h create mode 100644 include/aymo_convert_x86_avx2.h create mode 100644 include/aymo_convert_x86_sse41.h create mode 100644 include/aymo_cpu.h create mode 100644 include/aymo_cpu_arm.h create mode 100644 include/aymo_cpu_arm_neon.h create mode 100644 include/aymo_cpu_arm_neon_inline.h create mode 100644 include/aymo_cpu_x86.h create mode 100644 include/aymo_cpu_x86_avx2.h create mode 100644 include/aymo_cpu_x86_avx2_inline.h create mode 100644 include/aymo_cpu_x86_sse41.h create mode 100644 include/aymo_cpu_x86_sse41_inline.h create mode 100644 include/aymo_file.h create mode 100644 include/aymo_score.h create mode 100644 include/aymo_score_avd.h create mode 100644 include/aymo_score_dro.h create mode 100644 include/aymo_score_imf.h create mode 100644 include/aymo_score_raw.h create mode 100644 include/aymo_sys_linux.h create mode 100644 include/aymo_sys_windows.h create mode 100644 include/aymo_tda8425.h create mode 100644 include/aymo_tda8425_arm_neon.h create mode 100644 include/aymo_tda8425_common.h create mode 100644 include/aymo_tda8425_none.h create mode 100644 include/aymo_tda8425_x86_avx2.h create mode 100644 include/aymo_tda8425_x86_sse41.h create mode 100644 include/aymo_wave.h create mode 100644 include/aymo_ym7128.h create mode 100644 include/aymo_ym7128_arm_neon.h create mode 100644 include/aymo_ym7128_common.h create mode 100644 include/aymo_ym7128_none.h create mode 100644 include/aymo_ym7128_x86_sse41.h create mode 100644 include/aymo_ymf262.h create mode 100644 include/aymo_ymf262_arm_neon.h create mode 100644 include/aymo_ymf262_common.h create mode 100644 include/aymo_ymf262_none.h create mode 100644 include/aymo_ymf262_x86_avx.h create mode 100644 include/aymo_ymf262_x86_avx2.h create mode 100644 include/aymo_ymf262_x86_sse41.h create mode 100644 include/meson.build create mode 100644 meson.build create mode 100644 meson_options.txt create mode 100644 msvc-arm.txt create mode 100644 msvc-arm_env.bat create mode 100644 src/aymo.c create mode 100644 src/aymo_convert.c create mode 100644 src/aymo_convert_arm_neon.c create mode 100644 src/aymo_convert_none.c create mode 100644 src/aymo_convert_x86_avx2.c create mode 100644 src/aymo_convert_x86_sse41.c create mode 100644 src/aymo_cpu.c create mode 100644 src/aymo_cpu_arm.c create mode 100644 src/aymo_cpu_x86.c create mode 100644 src/aymo_empty.c create mode 100644 src/aymo_file.c create mode 100644 
src/aymo_score.c create mode 100644 src/aymo_score_avd.c create mode 100644 src/aymo_score_dro.c create mode 100644 src/aymo_score_imf.c create mode 100644 src/aymo_score_raw.c create mode 100644 src/aymo_sys_linux.c create mode 100644 src/aymo_sys_windows.c create mode 100644 src/aymo_tda8425.c create mode 100644 src/aymo_tda8425_arm_neon.c create mode 100644 src/aymo_tda8425_common.c create mode 100644 src/aymo_tda8425_none.c create mode 100644 src/aymo_tda8425_x86_avx2.c create mode 100644 src/aymo_tda8425_x86_sse41.c create mode 100644 src/aymo_wave.c create mode 100644 src/aymo_ym7128.c create mode 100644 src/aymo_ym7128_arm_neon.c create mode 100644 src/aymo_ym7128_common.c create mode 100644 src/aymo_ym7128_none.c create mode 100644 src/aymo_ym7128_x86_sse41.c create mode 100644 src/aymo_ymf262.c create mode 100644 src/aymo_ymf262_arm_neon.c create mode 100644 src/aymo_ymf262_common.c create mode 100644 src/aymo_ymf262_none.c create mode 100644 src/aymo_ymf262_x86_avx.c create mode 100644 src/aymo_ymf262_x86_avx2.c create mode 100644 src/aymo_ymf262_x86_sse41.c create mode 100644 tests/aymo_testing.c create mode 100644 tests/aymo_testing.h create mode 100644 tests/aymo_testing_epilogue_inline.h create mode 100644 tests/meson.build create mode 100644 tests/test_convert_arm_neon.c create mode 100644 tests/test_convert_none.c create mode 100644 tests/test_convert_prologue_inline.h create mode 100644 tests/test_convert_x86_avx2.c create mode 100644 tests/test_convert_x86_sse41.c create mode 100644 tests/test_tda8425_arm_neon_sweep.c create mode 100644 tests/test_tda8425_none_sweep.c create mode 100644 tests/test_tda8425_sweep_inline.h create mode 100644 tests/test_tda8425_x86_avx2_sweep.c create mode 100644 tests/test_tda8425_x86_sse41_sweep.c create mode 100644 tests/test_ym7128_arm_neon_sweep.c create mode 100644 tests/test_ym7128_none_sweep.c create mode 100644 tests/test_ym7128_sweep_inline.h create mode 100644 tests/test_ym7128_x86_sse41_sweep.c create mode 100644 tests/test_ymf262_arm_neon_compare.c create mode 100644 tests/test_ymf262_compare_epilogue_inline.h create mode 100644 tests/test_ymf262_compare_prologue_inline.h create mode 100644 tests/test_ymf262_none_compare.c create mode 100644 tests/test_ymf262_x86_avx2_compare.c create mode 100644 tests/test_ymf262_x86_avx_compare.c create mode 100644 tests/test_ymf262_x86_sse41_compare.c diff --git a/apps/aymo_ymf262_play.c b/apps/aymo_ymf262_play.c new file mode 100644 index 0000000..1f231f3 --- /dev/null +++ b/apps/aymo_ymf262_play.c @@ -0,0 +1,575 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
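+
+Renders an OPL3 (YMF262) score through the AYMO emulator and writes the output
+as 16-bit PCM (stereo or 4-channel) to stdout or to a WAVE file.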
+ +--- + +To play via shell pipe, run: + + - ALSA Play: + aymo_ymf262_play SCORE | aplay -c 2 -r 47916 -f S16_LE + + - VLC: + aymo_ymf262_play SCORE | vlc --demux=rawaud --rawaud-channels 2 --rawaud-samplerate 47916 - +*/ + +#include "aymo.h" +#include "aymo_cpu.h" +#include "aymo_file.h" +#include "aymo_score.h" +#include "aymo_score_avd.h" +#include "aymo_score_dro.h" +#include "aymo_score_imf.h" +#include "aymo_wave.h" +#include "aymo_ymf262.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(__WINDOWS__) || defined(__CYGWIN__)) + #include + #include + #ifndef _MSC_VER + #define _fileno(f) ((f)->_file) + #endif +#endif + +AYMO_CXX_EXTERN_C_BEGIN + + +struct app_args { + int argc; + char** argv; + + // App parameters + unsigned loops; + bool benchmark; + + // Score parameters + const char* score_path_cstr; // NULL or "-" for stdin + const char* score_type_cstr; // NULL uses score file extension + enum aymo_score_type score_type; + unsigned score_after; + + // Output parameters + const char* out_path_cstr; // NULL or "-" for stdout + uint32_t out_frame_length; + bool out_quad; + + // YMF262 parameters + const struct aymo_ymf262_vt* ymf262_vt; + bool ymf262_extensions; +}; + + +static int app_return; + +static struct app_args app_args; +static clock_t clock_start; +static clock_t clock_end; + +static void* score_data; +static size_t score_size; +static union app_scores { + struct aymo_score_instance base; + struct aymo_score_avd_instance avd; + struct aymo_score_dro_instance dro; + struct aymo_score_imf_instance imf; +} score; + +static struct aymo_ymf262_chip* chip; + +static bool out_stdout; +static FILE* out_file; +static int16_t out_buffer_default[4]; +static int16_t* out_buffer_ptr; +static uint32_t out_frame_length; +static struct aymo_wave_heading wave_head; + +static void* aymo_aligned_alloc(size_t size, size_t align) +{ + assert(align); + assert(size < (SIZE_MAX - align - align)); + + void* allocptr = calloc((size + align + align), 1u); + if (allocptr) { + uintptr_t alignaddr = ((uintptr_t)(void*)allocptr + align); + uintptr_t offset = (alignaddr % align); + alignaddr += ((align - offset) % align); + void* alignptr = (void*)alignaddr; + uintptr_t refaddr = (alignaddr - sizeof(void*)); + void** refptr = (void**)(void*)refaddr; + *refptr = allocptr; + return alignptr; + } + return NULL; +} + + +static void aymo_aligned_free(void* alignptr) +{ + if (alignptr) { + uintptr_t alignaddr = (uintptr_t)alignptr; + uintptr_t refaddr = (alignaddr - sizeof(void*)); + void** refptr = (void**)(void*)refaddr; + void* allocptr = *refptr; + free(allocptr); + } +} + + +static int app_boot(void) +{ + app_return = 2; + + aymo_cpu_boot(); + aymo_ymf262_boot(); + + score_data = NULL; + score_size = 0u; + memset(&score, 0, sizeof(score)); + + chip = NULL; + + out_file = NULL; + out_buffer_ptr = out_buffer_default; + out_frame_length = 1u; + + return 0; +} + + +static int app_args_init(int argc, char** argv) +{ + memset(&app_args, 0, sizeof(app_args)); + + app_args.argc = argc; + app_args.argv = argv; + + app_args.loops = 1u; + + app_args.score_type = aymo_score_type_unknown; + + app_args.out_frame_length = 1u; + + app_args.ymf262_vt = aymo_ymf262_get_best_vt(); + + return 0; +} + + +static int app_usage(void) +{ + printf("TODO: USAGE\n"); + + return -1; // help +} + + +static int app_args_parse(void) +{ + int argi; + + for (argi = 1; argi < app_args.argc; ++argi) { + const char* name = app_args.argv[argi]; + + if (!strcmp(name, "--")) { + ++argi; 
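+            // "--" ends option parsing; any remaining arguments are taken as
+            // positional (score path, then output path).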
+ break; + } + + // Unary options + if (!strcmp(name, "--help") || !strcmp(name, "-h")) { + return app_usage(); + } + if (!strcmp(name, "--benchmark")) { + app_args.benchmark = true; + continue; + } + if (!strcmp(name, "--out-quad")) { + app_args.out_quad = true; + continue; + } + if (!strcmp(name, "--ymf62-extensions")) { + app_args.ymf262_extensions = true; + continue; + } + + // Binary options + if (argi >= (app_args.argc - 1)) { + break; + } + if (!strcmp(name, "--loops")) { + const char* text = app_args.argv[++argi]; + errno = 0; + app_args.loops = strtoul(text, NULL, 0); + if (errno) { + perror(name); + return 1; + } + continue; + } + if (!strcmp(name, "--score-after")) { + const char* text = app_args.argv[++argi]; + errno = 0; + app_args.score_after = strtoul(text, NULL, 0); + if (errno) { + perror(name); + return 1; + } + continue; + } + if (!strcmp(name, "--score-type")) { + const char* value = app_args.argv[++argi]; + app_args.score_type = aymo_score_ext_to_type(value); + if (app_args.score_type == aymo_score_type_unknown) { + fprintf(stderr, "ERROR: Unknown score type \"%s\"\n", value); + return 1; + } + continue; + } + if (!strcmp(name, "--cpu-ext")) { + const char* text = app_args.argv[++argi]; + app_args.ymf262_vt = aymo_ymf262_get_vt(text); + if (!app_args.ymf262_vt) { + fprintf(stderr, "ERROR: Unsupported CPU extensions tag: \"%s\"\n", text); + return 1; + } + continue; + } + if (!strcmp(name, "--buffer-size")) { + const char* text = app_args.argv[++argi]; + errno = 0; + app_args.out_frame_length = strtoul(text, NULL, 0); + if (errno) { + perror(name); + return 1; + } + continue; + } + break; + } + + if (argi == (app_args.argc - 2)) { + const char* text = app_args.argv[argi++]; + if (!strcmp(text, "-")) { + text = NULL; + } + app_args.score_path_cstr = text; + } + + if (argi == (app_args.argc - 1)) { + const char* text = app_args.argv[argi++]; + if (!strcmp(text, "-")) { + text = NULL; + } + + if (app_args.score_path_cstr) { + app_args.out_path_cstr = text; + } + else { + app_args.score_path_cstr = text; + } + } + + + if (app_args.score_type == aymo_score_type_unknown) { + const char* text = app_args.score_path_cstr; + if (text) { + const char* ext = strrchr(text, '.'); + if (ext) { + app_args.score_type = aymo_score_ext_to_type(ext + 1); + } + } + if (app_args.score_type == aymo_score_type_unknown) { + fprintf(stderr, "ERROR: Unsupported score type of \"%s\"\n", (text ? text : "")); + return 1; + } + } + + if (argi < app_args.argc) { + fprintf(stderr, "ERROR: Unknown options after #%d = \"%s\"\n", argi, app_args.argv[argi]); + return 1; + } + + return 0; +} + + +static int app_setup(void) +{ + if (aymo_file_load(app_args.score_path_cstr, &score_data, &score_size)) { + return 1; + } + score.base.vt = aymo_score_type_to_vt(app_args.score_type); + if (!score.base.vt) { + fprintf(stderr, "ERROR: Unsupported score type ID: %d\n", (int)app_args.score_type); + return 1; + } + aymo_score_ctor(&score.base); + if (aymo_score_load(&score.base, score_data, (uint32_t)score_size)) { + fprintf(stderr, "ERROR: Cannot load score \"%s\"\n", app_args.score_path_cstr); + return 1; + } + + size_t chip_size = app_args.ymf262_vt->get_sizeof(); + void* chip_alignptr = aymo_aligned_alloc(chip_size, 32u); + if (!chip_alignptr) { + perror("aymo_aligned_alloc(chip_size)"); + return 2; + } + chip = (struct aymo_ymf262_chip*)chip_alignptr; + chip->vt = app_args.ymf262_vt; + aymo_ymf262_ctor(chip); + + uint32_t out_channels = (app_args.out_quad ? 
4u : 2u); + out_frame_length = app_args.out_frame_length; + if (out_frame_length < 1u) { + out_frame_length = 1u; + } + if (out_frame_length > (UINT32_MAX / (sizeof(int16_t) * out_channels))) { + out_frame_length = (UINT32_MAX / (sizeof(int16_t) * out_channels)); + } + uint32_t out_buffer_size = (out_frame_length * sizeof(int16_t) * out_channels); + out_buffer_ptr = (int16_t*)malloc(out_buffer_size); + if (!out_buffer_ptr) { + perror("malloc(out_buffer_size)"); + return 2; + } + + if (app_args.benchmark) { + out_stdout = false; + out_file = NULL; + } + else if (!app_args.out_path_cstr || !strcmp(app_args.out_path_cstr, "") || !strcmp(app_args.out_path_cstr, "-")) { + out_stdout = true; + out_file = stdout; + + #if (defined(__WINDOWS__) || defined(__CYGWIN__)) + errno = 0; + _setmode(_fileno(stdout), O_BINARY); + if (errno) { + perror("_setmode(stdout)"); + return 2; + } + #endif + } + else { + out_stdout = false; + out_file = fopen(app_args.out_path_cstr, "wb"); + if (!out_file) { + perror(app_args.out_path_cstr); + return 1; + } + + aymo_wave_heading_setup( + &wave_head, + AYMO_WAVE_FMT_TYPE_PCM, + (uint16_t)out_channels, + 16u, + AYMO_YMF262_SAMPLE_RATE, + 0u + ); + if (fwrite(&wave_head, sizeof(wave_head), 1u, out_file) != 1u) { + perror("fwrite(wave_head)"); + return 2; + } + } + + return 0; +} + + +static void app_teardown(void) +{ + if (chip) { + aymo_ymf262_dtor(chip); + aymo_aligned_free(chip); + } + chip = NULL; + + if (score.base.vt) { + aymo_score_unload(&score.base); + aymo_score_dtor(&score.base); + } + aymo_file_unload(score_data); + score_data = NULL; + + if (!out_stdout && out_file) { + fclose(out_file); + } + out_file = NULL; + + if (out_buffer_ptr && (out_buffer_ptr != out_buffer_default)) { + free(out_buffer_ptr); + } + out_buffer_ptr = NULL; + out_frame_length = 0u; +} + + +static int app_run(void) +{ + size_t out_channels = (app_args.out_quad ? 
4u : 2u); + size_t out_sample_length = ((size_t)out_frame_length * out_channels); + uint32_t frame_total = 0u; + unsigned pending_loops = (app_args.loops - 1u); + unsigned score_after = app_args.score_after; + + aymo_ymf262_generate_i16x2_f aymo_ymf262_generate_i16; + if (app_args.out_quad) { + aymo_ymf262_generate_i16 = aymo_ymf262_generate_i16x4; + } + else { + aymo_ymf262_generate_i16 = aymo_ymf262_generate_i16x2; + } + + struct aymo_score_status* status = aymo_score_get_status(&score.base); + bool playing = !(status->flags & AYMO_SCORE_FLAG_EOF); + + clock_start = clock(); + + while (playing) { + int16_t* buffer_ptr = out_buffer_ptr; + uint32_t avail_length = out_frame_length; + uint32_t delay_length = status->delay; + + while (avail_length) { + if (delay_length > avail_length) { + delay_length = avail_length; + } + + aymo_ymf262_generate_i16(chip, delay_length, buffer_ptr); + buffer_ptr += (delay_length * out_channels); + frame_total++; + + aymo_score_tick(&score.base, delay_length); + avail_length -= delay_length; + + if (status->flags & AYMO_SCORE_FLAG_EVENT) { + aymo_ymf262_enqueue_write(chip, status->address, status->value); + } + + while (!(status->flags & (AYMO_SCORE_FLAG_DELAY | AYMO_SCORE_FLAG_EOF))) { + aymo_score_tick(&score.base, 0u); + + if (status->flags & AYMO_SCORE_FLAG_EVENT) { + aymo_ymf262_enqueue_write(chip, status->address, status->value); + } + } + + if (!(status->flags & AYMO_SCORE_FLAG_EOF)) { + delay_length = status->delay; + } + else if (app_args.loops) { + if (pending_loops) { + pending_loops--; + aymo_score_restart(&score.base); + } + else if (score_after) { + status->flags |= AYMO_SCORE_FLAG_DELAY; + status->delay = score_after; + delay_length = status->delay; + score_after = 0u; + } + else { + playing = false; + } + } + else { + aymo_score_restart(&score.base); + } + } + + if (out_file) { + if (fwrite(out_buffer_ptr, sizeof(int16_t), out_sample_length, out_file) != out_sample_length) { + perror("fwrite(out_buffer)"); + return 2; + } + } + } + + if (out_file && !out_stdout) { + if (fseek(out_file, 0, SEEK_SET)) { + perror("fseek(out_file)"); + return 2; + } + aymo_wave_heading_setup( + &wave_head, + AYMO_WAVE_FMT_TYPE_PCM, + (uint16_t)out_channels, + 16u, + AYMO_YMF262_SAMPLE_RATE, + frame_total + ); + if (fwrite(&wave_head, sizeof(wave_head), 1u, out_file) != 1u) { + perror("fwrite(wave_head)"); + return 2; + } + } + + clock_end = clock(); + + if (app_args.benchmark) { + clock_t clock_duration = (clock_end - clock_start); + double seconds = ((double)clock_duration * (1. 
/ (double)CLOCKS_PER_SEC)); + printf("Render time: %.3f seconds\n", seconds); + } + + return 0; +} + + +int main(int argc, char** argv) +{ + app_return = app_boot(); + if (app_return) goto catch_; + + app_return = app_args_init(argc, argv); + if (app_return) goto catch_; + + app_return = app_args_parse(); + if (app_return == -1) { // help + app_return = 0; + goto finally_; + } + if (app_return) goto catch_; + + app_return = app_setup(); + if (app_return) goto catch_; + + app_return = app_run(); + if (app_return) goto catch_; + + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/apps/meson.build b/apps/meson.build new file mode 100644 index 0000000..83e404e --- /dev/null +++ b/apps/meson.build @@ -0,0 +1,22 @@ +apps_includes = include_directories( + '.', +) + +apps_sources = files( +) + +if not opt_apps.disabled() + app_names = [ + 'aymo_ymf262_play', + ] + + foreach app_name : app_names + executable( + app_name, + apps_sources + files('@0@.c'.format(app_name)), + include_directories: [apps_includes, aymo_includes], + link_with: [aymo_static_lib, aymo_libc_lib], + install: false, + ) + endforeach +endif diff --git a/aymo.pc.in b/aymo.pc.in new file mode 100644 index 0000000..53955a3 --- /dev/null +++ b/aymo.pc.in @@ -0,0 +1,15 @@ +# AYMO pkg-config file + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: AYMO +Description: Accelerated YaMaha Operator +URL: @URL@ +Version: @VERSION@ +Requires: +Conflicts: +Libs: -L${libdir} -laymo +Cflags: -I${includedir}/aymo diff --git a/contrib/meson.build b/contrib/meson.build new file mode 100644 index 0000000..c11bdaa --- /dev/null +++ b/contrib/meson.build @@ -0,0 +1,26 @@ +#add_project_arguments('-DTDA8425_FLOAT=float') + +aymo_contrib_includes = include_directories( + 'Nuked-OPL3', + 'TDA8425_emu/src', + 'YM7128B_emu/src', +) + +aymo_contrib_sources = files( + 'Nuked-OPL3/opl3.c', + 'TDA8425_emu/src/TDA8425_emu.c', + 'YM7128B_emu/src/YM7128B_emu.c', +) + +aymo_contrib_lib = static_library( + 'aymo-contrib', + aymo_contrib_sources, + include_directories: aymo_contrib_includes, + dependencies: libm, + install: false, +) + +aymo_contrib_dep = declare_dependency( + include_directories: aymo_contrib_includes, + link_whole: aymo_contrib_lib, +) diff --git a/doc/.gitkeep b/doc/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/include/aymo.h b/include/aymo.h new file mode 100644 index 0000000..179c0bd --- /dev/null +++ b/include/aymo.h @@ -0,0 +1,33 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_h +#define _include_aymo_h + +#include "aymo_cc.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_boot(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_h diff --git a/include/aymo_cc.h b/include/aymo_cc.h new file mode 100644 index 0000000..bca79a0 --- /dev/null +++ b/include/aymo_cc.h @@ -0,0 +1,189 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cc_h +#define _include_aymo_cc_h + +// Use "aymo_cc.h" as the common file including "aymo_config.h" (if required) +#ifdef AYMO_HAVE_CONFIG_H // command line / build system + #include "aymo_config.h" +#endif + + +// Usual macros to generate strings +#ifndef AYMO_STRINGIFY + #define AYMO_STRINGIFY(token) #token + #define AYMO_STRINGIFY2(token) AYMO_STRINGIFY(token) +#endif + + +// Unused variable +#define AYMO_UNUSED_VAR(x) ((void)(x)) + + +// Common C++ name mangling wrappers. +#ifndef AYMO_CXX_EXTERN_C_BEGIN + #ifdef __cplusplus + #define AYMO_CXX_EXTERN_C_BEGIN extern "C" { + #define AYMO_CXX_EXTERN_C_END } // extern "C" + #else + #define AYMO_CXX_EXTERN_C_BEGIN // ignore + #define AYMO_CXX_EXTERN_C_END // ignore + #endif +#endif + + +// #pragma pack(push/pop) equivalents. +// Originally by MSVC, also supported by modern GCC/Clang. +#ifndef AYMO_PRAGMA_POP + #if (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)) + #define AYMO_PRAGMA_PACK_PUSH_N(n) _Pragma(AYMO_STRINGIFY(pack(push, n))) + #define AYMO_PRAGMA_PACK_PUSH_1 AYMO_PRAGMA_PACK_PUSH_N(1) + #define AYMO_PRAGMA_PACK_PUSH_DEFAULT _Pragma("pack(push)") _Pragma("pack()") + #define AYMO_PRAGMA_PACK_POP _Pragma("pack(pop)") + #else + // Use default packing (i.e. compiler forced to 1-byte packing) + #ifndef _include_aymo_cc_h_AYMO_PRAGMA_POP + #define _include_aymo_cc_h_AYMO_PRAGMA_POP + #warning "Unsupported packing directives. Please set 1-byte packing to your compiler." + #endif + #define AYMO_PRAGMA_PACK_PUSH_N(n) // keep default + #define AYMO_PRAGMA_PACK_PUSH_1 // keep default + #define AYMO_PRAGMA_PACK_PUSH_DEFAULT // keep default + #define AYMO_PRAGMA_PACK_POP // keep default + #endif +#endif + + +// Aligns to some bytes. +// To be put AFTER the typename. +#ifndef AYMO_ALIGN + #if defined(_MSC_VER) + #define AYMO_ALIGN(n) __declspec(align(n)) + #elif (defined(__GNUC__) || defined(__clang__)) + #define AYMO_ALIGN(n) __attribute__((aligned(n))) + #else + #define AYMO_ALIGN(n) // default + #endif +#endif + + +// Sets bit-fields order as little-endian. 
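+// (Relies on the GCC-specific "scalar_storage_order" pragma; on other compilers
+// these macros expand to nothing and the platform default order applies.)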
+#ifndef AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + #if defined(__GNUC__) && defined(AYMO_CC_ID_GCC) + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN _Pragma("scalar_storage_order little-endian") + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT _Pragma("scalar_storage_order default") + #else + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN // default + #define AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT // default + #endif +#endif + + +// Mark the symbol as public for shared objects (aka DLL). +#ifndef AYMO_PUBLIC + #if (defined(AYMO_CC_HOST_WINDOWS) )//FIXME: || defined(AYMO_CC_HOST_CYGWIN)) + // Using MSVC attribute, also supported by modern GCC/Clang. + #ifdef AYMO_BUILD + #define AYMO_PUBLIC extern __declspec(dllexport) + #else + #define AYMO_PUBLIC extern __declspec(dllimport) + #endif + #define AYMO_PRIVATE // ignore + + #elif (defined(__GNUC__) || defined(__clang__)) + // Using GCC-specific attribute + #ifdef AYMO_BUILD + #define AYMO_PUBLIC extern __attribute__((visibility("default"))) + #else + #define AYMO_PUBLIC // ignore + #endif + #define AYMO_PRIVATE extern __attribute__((visibility("hidden"))) + + #else + #ifndef _include_aymo_cc_h_AYMO_PUBLIC + #define _include_aymo_cc_h_AYMO_PUBLIC + #warning "Cannot assume a proper way to declare shared object functions." + #endif + #define AYMO_PUBLIC extern // ignore + #define AYMO_PRIVATE extern // ignore + #endif +#endif + + +// Wrap the condition expression with this +#ifndef AYMO_LIKELY + #if (defined(__GNUC__) || defined(__clang__)) + #define AYMO_LIKELY(x) (__builtin_expect(!!(x), 1)) + #define AYMO_UNLIKELY(x) (__builtin_expect(!!(x), 0)) + #else + #define AYMO_LIKELY(x) (x) // ignore + #define AYMO_UNLIKELY(x) (x) // ignore + #endif +#endif + + +// Usual macro to get 1D array size +#ifndef AYMO_VECTOR_LENGTH + #define AYMO_VECTOR_LENGTH(name) (sizeof(name) / sizeof((name)[0])) +#endif + + +// Cheap alternative to memset() +// No care for performance; made just to avoid a library call +static inline void aymo_memset(void* data, int value, unsigned long size) +{ + char* ptr = (char*)data; + const char* end = (char*)data + size; + while (ptr != end) { + *(ptr++) = value; + } +} + + +// Cheap alternative to memcpy() +// No care for performance; made just to avoid a library call +static inline void aymo_memcpy(void* dst, void* src, unsigned long size) +{ + char* dstp = (char*)dst; + const char* srcp = (const char*)src; + const char* end = (const char*)src + size; + while (srcp != end) { + *(dstp++) = *(srcp++); + } +} + + +// Cheap alternative to strcmp() +// No care for performance; made just to avoid a library call +static inline int aymo_strcmp(const char* a, const char* b) +{ + if (a && b) { + do { + if (*a != *b) { + return (*a - *b); + } + } while (*(a++) && *(b++)); + return 0; + } + return -0x8000; +} + + +#endif // _include_aymo_cc_h diff --git a/include/aymo_convert.h b/include/aymo_convert.h new file mode 100644 index 0000000..82ae358 --- /dev/null +++ b/include/aymo_convert.h @@ -0,0 +1,54 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. 
+ +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_convert_h +#define _include_aymo_convert_h + +#include "aymo_cc.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_convert_boot(void); + +AYMO_PUBLIC void aymo_convert_i16_f32(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_i16(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_convert_i16_f32_1(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_i16_1(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_convert_i16_f32_k(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_convert_f32_i16_k(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_convert_u16_f32(size_t n, const uint16_t u16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_u16(size_t n, const float f32v[], uint16_t u16v[]); + +AYMO_PUBLIC void aymo_convert_u16_f32_1(size_t n, const uint16_t u16v[], float f32v[]); +AYMO_PUBLIC void aymo_convert_f32_u16_1(size_t n, const float f32v[], uint16_t u16v[]); + +AYMO_PUBLIC void aymo_convert_u16_f32_k(size_t n, const uint16_t u16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_convert_f32_u16_k(size_t n, const float f32v[], uint16_t u16v[], float scale); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_convert_h diff --git a/include/aymo_convert_arm_neon.h b/include/aymo_convert_arm_neon.h new file mode 100644 index 0000000..cfe093b --- /dev/null +++ b/include/aymo_convert_arm_neon.h @@ -0,0 +1,66 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_arm_neon_h +#define _include_aymo_convert_arm_neon_h + +#include "aymo_cc.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_convert_arm_neon_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON +#endif // _include_aymo_convert_arm_neon_h diff --git a/include/aymo_convert_none.h b/include/aymo_convert_none.h new file mode 100644 index 0000000..f665f4c --- /dev/null +++ b/include/aymo_convert_none.h @@ -0,0 +1,64 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_none_h +#define _include_aymo_convert_none_h + +#include "aymo_cc.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_NONE_##_token_ +#define aymo_(_token_) aymo_convert_none_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_convert_none_h diff --git a/include/aymo_convert_x86_avx2.h b/include/aymo_convert_x86_avx2.h new file mode 100644 index 0000000..76651b1 --- /dev/null +++ b/include/aymo_convert_x86_avx2.h @@ -0,0 +1,66 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_x86_avx2_h +#define _include_aymo_convert_x86_avx2_h + +#include "aymo_cc.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_X86_AVX2_##_token_ +#define aymo_(_token_) aymo_convert_x86_avx2_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 +#endif // _include_aymo_convert_x86_avx2_h diff --git a/include/aymo_convert_x86_sse41.h b/include/aymo_convert_x86_sse41.h new file mode 100644 index 0000000..c83f7a7 --- /dev/null +++ b/include/aymo_convert_x86_sse41.h @@ -0,0 +1,66 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_convert_x86_sse41_h +#define _include_aymo_convert_x86_sse41_h + +#include "aymo_cc.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_CONVERT_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_convert_x86_sse41_##_token_ + + +AYMO_PUBLIC void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]); + +AYMO_PUBLIC void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale); + +AYMO_PUBLIC void aymo_(u16_f32)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_1)(size_t n, const uint16_t i16v[], float f32v[]); +AYMO_PUBLIC void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t i16v[]); + +AYMO_PUBLIC void aymo_(u16_f32_k)(size_t n, const uint16_t i16v[], float f32v[], float scale); +AYMO_PUBLIC void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t i16v[], float scale); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 +#endif // _include_aymo_convert_x86_sse41_h diff --git a/include/aymo_cpu.h b/include/aymo_cpu.h new file mode 100644 index 0000000..4697836 --- /dev/null +++ b/include/aymo_cpu.h @@ -0,0 +1,55 @@ +// Main CPU header file. +// Always include this one, not the CPU-specific ones. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_h +#define _include_aymo_cpu_h + +#include "aymo_cc.h" + +#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + #include "aymo_cpu_x86.h" + + #if defined(AYMO_CPU_SUPPORT_X86_AVX2) + #include "aymo_cpu_x86_avx2.h" + #endif + + #if defined(AYMO_CPU_SUPPORT_X86_SSE41) + #include "aymo_cpu_x86_sse41.h" + #endif +#endif + +#if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + #include "aymo_cpu_arm.h" + + #if defined(AYMO_CPU_SUPPORT_ARM_NEON) + #include "aymo_cpu_arm_neon.h" + #endif +#endif + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_cpu_boot(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_h diff --git a/include/aymo_cpu_arm.h b/include/aymo_cpu_arm.h new file mode 100644 index 0000000..2ee7aa0 --- /dev/null +++ b/include/aymo_cpu_arm.h @@ -0,0 +1,43 @@ +// CPU-specific header file for ARM. +// DO NOT include this file; #include "aymo_cpu.h" instead. 
+/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_arm_h +#define _include_aymo_cpu_arm_h +#if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + +AYMO_CXX_EXTERN_C_BEGIN + + +#define AYMO_CPU_ARM_EXT_ARMV7 (1u << 0u) +#define AYMO_CPU_ARM_EXT_NEON (1u << 1u) +#define AYMO_CPU_ARM_EXT_AARCH32 (1u << 2u) +#define AYMO_CPU_ARM_EXT_AARCH64 (1u << 3u) +#define AYMO_CPU_ARM_EXT_NEON64 (1u << 4u) + + +AYMO_PUBLIC void aymo_cpu_arm_boot(void); +AYMO_PUBLIC unsigned aymo_cpu_arm_get_extensions(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) +#endif // _include_aymo_cpu_arm_h diff --git a/include/aymo_cpu_arm_neon.h b/include/aymo_cpu_arm_neon.h new file mode 100644 index 0000000..b391840 --- /dev/null +++ b/include/aymo_cpu_arm_neon.h @@ -0,0 +1,53 @@ +// CPU-specific header file for ARM NEON. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_arm_neon_h +#define _include_aymo_cpu_arm_neon_h + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +typedef int16x4_t vi16x4_t; +typedef uint16x4_t vu16x4_t; + +typedef int32x2_t vi32x2_t; +typedef uint32x2_t vu32x2_t; + +typedef int16x8_t vi16x8_t; +typedef uint16x8_t vu16x8_t; + +typedef int32x4_t vi32x4_t; +typedef uint32x4_t vu32x4_t; + +typedef float32x4_t vf32x4_t; +typedef float32x2_t vf32x2_t; + + +#ifndef AYMO_ALIGN_V128 + #define AYMO_ALIGN_V128 AYMO_ALIGN(16) +#endif + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_arm_neon_h diff --git a/include/aymo_cpu_arm_neon_inline.h b/include/aymo_cpu_arm_neon_inline.h new file mode 100644 index 0000000..bfcf1a1 --- /dev/null +++ b/include/aymo_cpu_arm_neon_inline.h @@ -0,0 +1,382 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_arm_neon_inline_h +#define _include_aymo_cpu_arm_neon_inline_h + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include +#ifdef _MSC_VER + #include +#endif + +AYMO_CXX_EXTERN_C_BEGIN + + +// Generic CPU shorthands + +#if defined(_MSC_VER) + #define AYMO_ARM_DSB(n) (__dsb(n)) + #define AYMO_ARM_DSB_ST() (AYMO_ARM_DSB((unsigned)_ARM_BARRIER_ST)) +#elif (defined(__GNUC__) || defined(__clang__)) + #define AYMO_ARM_DSB_ST() {asm volatile ("dsb st");} +#endif + +#define vsfence AYMO_ARM_DSB_ST + + +// SIMD type shorthands +typedef vi16x8_t vi16_t; +typedef vu16x8_t vu16_t; +typedef vi32x4_t vi32_t; + + +// v*() methods are for vi16_t = int16_t[8] + +#define vi2u vreinterpretq_u16_s16 +#define vu2i vreinterpretq_s16_u16 + +#define vcreate vcreate_s16 +#define vload vld1q_s16 +#define vstore vst1q_s16 + +#define vsetx vsetz +#define vset1 vdupq_n_s16 +#define vseta vseta_s16 +#define vsetr vsetr_s16 +#define vsetz() (vset1(0)) +#define vsetf() (vset1(-1)) +#define vsetm vsetm_s16 + +#define vnot vmvnq_s16 +#define vand vandq_s16 +#define vor vorrq_s16 +#define vxor veorq_s16 +#define vandnot(a,b) (vbicq_s16((b), (a))) // ~A & B +#define vblendv(a,b,m) (vbslq_s16(vi2u(m), (b), (a))) // B if M else A + +#define vcmpeq(a, b) (vu2i(vceqq_s16((a), (b)))) +#define vcmpgt(a, b) (vu2i(vcgtq_s16((a), (b)))) +#define vcmpz(x) (vcmpeq((x), vsetz())) +#define vcmpp(x) (vcmpgt((x), vsetz())) +#define vcmpn(x) (vcmpgt(vsetz(), (x))) + +#define vadd vaddq_s16 +#define vaddsi vqaddq_s16 +#define vaddsu vqaddq_u16 + +#define vsub vsubq_s16 +#define vsubsi vqsubq_s16 +#define vsubsu vqsubq_u16 +#define vneg vqnegq_s16 + +#define vslli vshlq_n_s16 +#define vsrli(x,n) (vu2i(vshrq_n_u16(vi2u(x), (n)))) +#define vsrai vshrq_n_s16 +#define vsllv vshlq_s16 +#define vsrlv(a,b) (vu2i(vshlq_u16(vi2u(a), vnegq_s16(b)))) +#define vsrav(a,b) (vshlq_s16((a), vnegq_s16(b))) + +#define vmulhrs vqrdmulhq_s16 + +#define vmullo vmulq_s16 + +#define vmini vminq_s16 +#define vminu vminq_u16 + +#define vmaxi vmaxq_s16 +#define vmaxu vmaxq_u16 + +#define vextract vgetq_lane_s16 +#define vextractn vextractn_s16 +#define vextractv(x,i) (((const int16_t*)(const void*)&(x))[(i)]) + +#define vinsert(x,n,i) (vsetq_lane_s16((n), (x), (i))) +#define vinsertn vinsertn_s16 +#define vinsertv(x,n,i) {((int16_t*)(void*)&(x))[(i)] = (n);} + +#define vgather vgather_s16 + +#define vhsum vhsum_s16 +#define vhsums vhsum + +#define vpow2m1lt4 vpow2m1lt4_s16 +#define vpow2lt4 vpow2lt4_s16 + +#define vgetlo vget_low_s16 +#define vgethi vget_high_s16 +#define vswap(x) (vcombine(vgethi(x), vgetlo(x))) + +#define vrev vrev64q_s16 +#define vrevv(x) (vvcastv(vrev64q_s32(vcastvv(x)))) +#define vext vextq_s16 + +#define vcombine vcombine_s16 + +#define vunpack vmovl_s16 +#define vunpacklo(x) (vunpack(vgetlo(x))) +#define vunpackhi(x) (vunpack(vgethi(x))) + +#define v2vv vunpack +#define vlo2vv vunpacklo +#define vhi2vv vunpackhi + 
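+// Reinterprets the 16-bit lanes as 32-bit lanes (bit-level cast, no widening).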
+#define vcastvv vreinterpretq_s32_s16 + + +// w*() methods are for widening/narrowing vi16_t = int16_t[8] <--> vi32_t = int32_t[4] + +#define wmullo vmull_s16 + +#define wcombine vcombine_s16 + + +// vv*() methods are for vi32_t = int32_t[4] + +#define vvi2u vreinterpretq_u32_s32 +#define vvu2i vreinterpretq_s32_u32 + +#define vvsetx vvsetz +#define vvset1 vdupq_n_s32 +#define vvsetz() (vvset1(0)) +#define vvsetf() (vvset1(-1)) + +#define vvand vandq_s32 +#define vvor vorrq_s32 +#define vvxor veorq_s32 +#define vvandnot(a,b) (vbicq_s32((b), (a))) // ~A & B + +#define vvadd vaddq_s32 +#define vwadd vaddw_s32 + +#define vvsrli(x,n) (vvu2i(vshrq_n_u32(vvi2u(x), (n)))) + +#define vvsllv vshlq_s32 + +#define vvmullo vmulq_s32 + +#define vvextract vgetq_lane_s32 +#define vvextractn vvextractn_s32 + +#define vvinsert(x,n,i) (vsetq_lane_s32((n), (x), (i))) +#define vvinsertn vvinsertn_s32 + +#define vvgetlo vget_low_s32 +#define vvgethi vget_high_s32 +#define vvswap(x) (vvcombine(vvgethi(x), vvgetlo(x))) + +#define vvrev vrev64q_s32 +#define vvext vextq_s32 + +#define vvcombine vcombine_s32 + +#define vvpack(a,b) (vcombine_s16(vmovn_s32(a), vmovn_s32(b))) +#define vvpacks(a,b) (vcombine_s16(vqmovn_s32(a), vqmovn_s32(b))) + +#define vvcastv vreinterpretq_s16_s32 + + +static inline +int16x8_t vseta_s16( + int16_t i7, + int16_t i6, + int16_t i5, + int16_t i4, + int16_t i3, + int16_t i2, + int16_t i1, + int16_t i0 +) +{ + int16_t r_n128_i16[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; + return vld1q_s16(r_n128_i16); +} + + +static inline +int16x8_t vsetr_s16( + int16_t i7, + int16_t i6, + int16_t i5, + int16_t i4, + int16_t i3, + int16_t i2, + int16_t i1, + int16_t i0 +) +{ + int16_t r_n128_i16[8] = { i7, i6, i5, i4, i3, i2, i1, i0 }; + return vld1q_s16(r_n128_i16); +} + + +static inline +int16x8_t vsetm_s16(uint8_t m) +{ + static const int16_t kk[8] = { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 }; + int16x8_t k = vld1q_s16(kk); + return vcmpeq(vand(vset1((int16_t)(uint16_t)m), k), k); +} + + +static inline +short vextractn_s16(int16x8_t x, const int i) +{ + int16_t x_n128_i16[8]; + vst1q_s16(x_n128_i16, x); + return x_n128_i16[i]; +} + + +static inline +int16x8_t vinsertn_s16(int16x8_t x, short n, const int i) +{ + int16_t x_n128_i16[8]; + vst1q_s16(x_n128_i16, x); + x_n128_i16[i] = n; + return vld1q_s16(x_n128_i16); +} + + +// Gathers 16x 16-bit words via 16x 8-bit (low) indexes +static inline +int16x8_t vgather_s16(const int16_t* v, int16x8_t i) +{ + // Plain C lookup, smallest cache footprint + uint8_t i_n128_u8[16]; + vst1q_s16((void*)i_n128_u8, i); + int16_t* r_n128_i16 = (int16_t*)(void*)i_n128_u8; + r_n128_i16[0] = v[i_n128_u8[0x0]]; + r_n128_i16[1] = v[i_n128_u8[0x2]]; + r_n128_i16[2] = v[i_n128_u8[0x4]]; + r_n128_i16[3] = v[i_n128_u8[0x6]]; + r_n128_i16[4] = v[i_n128_u8[0x8]]; + r_n128_i16[5] = v[i_n128_u8[0xA]]; + r_n128_i16[6] = v[i_n128_u8[0xC]]; + r_n128_i16[7] = v[i_n128_u8[0xE]]; + return vld1q_s16(r_n128_i16); +} + + +static inline +int32_t vhsum_s16(int16x8_t x) +{ + int32x4_t sum16 = vpaddlq_s16(x); + int64x2_t sum32 = vpaddlq_s32(sum16); + int32x2_t lo32 = vreinterpret_s32_s64(vget_low_s64(sum32)); + int32x2_t hi32 = vreinterpret_s32_s64(vget_high_s64(sum32)); + return (vget_lane_s32(lo32, 0) + vget_lane_s32(hi32, 0)); +} + + +// 0 <= x < 4 --> (1 << (x - 1)) --> 0, 1, 2, 4 +static inline +int16x8_t vpow2m1lt4_s16(int16x8_t x) +{ + return vsub(x, vcmpgt(x, vset1(2))); +} + + +// 0 <= x < 4 --> (1 << x) +static inline +int16x8_t vpow2lt4_s16(int16x8_t x) +{ + return vsllv(vset1(1), 
x); +} + + +static inline +int32_t vvextractn_s32(int32x4_t x, const int i) +{ + int32_t x_n128_i32[4]; + vst1q_s32(x_n128_i32, x); + return x_n128_i32[i]; +} + + +static inline +int32x4_t vvinsertn_s32(int32x4_t x, int32_t n, const int i) +{ + int32_t x_n128_i32[4]; + vst1q_s32(x_n128_i32, x); + x_n128_i32[i] = n; + return vld1q_s32(x_n128_i32); +} + + +static inline +int16_t clamp16(int x) +{ + if (x < INT16_MIN) { + return (int16_t)INT16_MIN; + } + if (x >= INT16_MAX) { + return (int16_t)INT16_MAX; + } + return (int16_t)x; +} + + +// Finds first set bit = Counts trailing zeros +// Emulates the BSD function +static inline +int uffsll(unsigned long long x) +{ +#if defined(_MSC_VER) + unsigned long i = 0; +#if defined(_WIN32) + if (_BitScanForward(&i, (uint32_t)x)) { + return (int)(i + 1); + } + if (_BitScanForward(&i, (uint32_t)(x >> 32))) { + return (int)(i + 33); + } +#else + if (_BitScanForward64(&i, (unsigned long long)x)) { + return (int)(i + 1); + } +#endif + return 0; + +#elif (defined(__GNUC__) || defined(__clang__)) + return __builtin_ffsll((long long)x); + +#else + if (x) { + int i = 0; + do { + ++i; + x <<= 1; + } while(x); + return (64 - i); + } + return 0; +#endif +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_cpu_arm_neon_inline_h diff --git a/include/aymo_cpu_x86.h b/include/aymo_cpu_x86.h new file mode 100644 index 0000000..a808cc2 --- /dev/null +++ b/include/aymo_cpu_x86.h @@ -0,0 +1,47 @@ +// CPU-specific header file for x86. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_x86_h +#define _include_aymo_cpu_x86_h +#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + +AYMO_CXX_EXTERN_C_BEGIN + + +#define AYMO_CPU_X86_EXT_SSE (1u << 0u) +#define AYMO_CPU_X86_EXT_SSE2 (1u << 1u) +#define AYMO_CPU_X86_EXT_SSE3 (1u << 2u) +#define AYMO_CPU_X86_EXT_SSSE3 (1u << 3u) +#define AYMO_CPU_X86_EXT_SSE41 (1u << 4u) +#define AYMO_CPU_X86_EXT_SSE42 (1u << 5u) +#define AYMO_CPU_X86_EXT_AVX (1u << 6u) +#define AYMO_CPU_X86_EXT_AVX2 (1u << 7u) +#define AYMO_CPU_X86_EXT_FMA3 (1u << 8u) + + +AYMO_PUBLIC void aymo_cpu_x86_boot(void); +AYMO_PUBLIC unsigned aymo_cpu_x86_get_extensions(void); + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86)) +#endif // _include_aymo_cpu_x86_h diff --git a/include/aymo_cpu_x86_avx2.h b/include/aymo_cpu_x86_avx2.h new file mode 100644 index 0000000..b8deeb4 --- /dev/null +++ b/include/aymo_cpu_x86_avx2.h @@ -0,0 +1,46 @@ +// CPU-specific header file for x86 AVX2. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_x86_avx2_h +#define _include_aymo_cpu_x86_avx2_h + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +typedef __m256i vi16x16_t; +typedef __m256i vu16x16_t; + +typedef __m256i vi32x8_t; +typedef __m256i vu32x8_t; + +typedef __m256 vf32x8_t; + + +#ifndef AYMO_ALIGN_V256 + #define AYMO_ALIGN_V256 AYMO_ALIGN(32) +#endif + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_x86_avx2_h diff --git a/include/aymo_cpu_x86_avx2_inline.h b/include/aymo_cpu_x86_avx2_inline.h new file mode 100644 index 0000000..34062c5 --- /dev/null +++ b/include/aymo_cpu_x86_avx2_inline.h @@ -0,0 +1,428 @@ +// CPU-specific inline methods for x86 AVX2. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_cpu_x86_avx2_inline_h +#define _include_aymo_cpu_x86_avx2_inline_h + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef AYMO_CPU_X86_AVX2_GATHER16_STRATEGY + #define AYMO_CPU_X86_AVX2_GATHER16_STRATEGY 2 +#endif + + +// Generic CPU shorthands + +#define vsfence _mm_sfence + + +// SIMD type shorthands +typedef vi16x16_t vi16_t; +typedef vu16x16_t vu16_t; +typedef vi32x8_t vi32_t; + + +// v*() methods are for vi16_t = int16_t[16] + +#define vi2u(x) x +#define vu2i(x) x + +#define vsetx _mm256_undefined_si256 +#define vset1 _mm256_set1_epi16 +#define vseta _mm256_set_epi16 +#define vsetr _mm256_setr_epi16 +#define vsetz _mm256_setzero_si256 +#define vsetf() (vset1(-1)) +#define vsetm mm256_setm_epi16 + +#define vnot(x) (vxor((x), vsetf())) +#define vand _mm256_and_si256 +#define vor _mm256_or_si256 +#define vxor _mm256_xor_si256 +#define vandnot _mm256_andnot_si256 // ~A & B +#define vblendi _mm256_blend_epi16 +#define vblendv _mm256_blendv_epi8 + +#define vcmpeq _mm256_cmpeq_epi16 +#define vcmpgt _mm256_cmpgt_epi16 +#define vcmpz(x) (vcmpeq((x), vsetz())) +#define vcmpp(x) (vcmpgt((x), vsetz())) +#define vcmpn(x) (vcmpgt(vsetz(), (x))) + +#define vadd _mm256_add_epi16 +#define vaddsi _mm256_adds_epi16 +#define vaddsu _mm256_adds_epu16 + +#define vsub _mm256_sub_epi16 +#define vsubsi _mm256_subs_epi16 +#define vsubsu _mm256_subs_epu16 +#define vneg(x) (vsub(vsetz(), (x))) + +#define vslli _mm256_slli_epi16 +#define vsrli _mm256_srli_epi16 +#define vsrai _mm256_srai_epi16 +#define vsllv mm256_sllv_epi16 +#define vsrlv mm256_srlv_epi16 +#define vsrav mm256_srav_epi16 + +#define vmulihi _mm256_mulhi_epi16 +#define vmuluhi _mm256_mulhi_epu16 + +#define vmulilo _mm256_mullo_epi16 +#define vmululo _mm256_mullo_epi16 + +#define vmini _mm256_min_epi16 +#define vminu _mm256_min_epu16 + +#define vmaxi _mm256_max_epi16 +#define vmaxu _mm256_max_epu16 + +#define vextract _mm256_extract_epi16 +#define vextractn mm256_extractn_epi16 +#define vextractv(x,i) (((const int16_t*)(const void*)&(x))[(i)]) + +#define vinsert _mm256_insert_epi16 +#define vinsertn mm256_insertn_epi16 +#define vinsertv(x,n,i) {((int16_t*)(void*)&(x))[(i)] = (n);} + +#define vgather mm256_i16gather_epi16lo + +#define vhsum mm256_hsum_epi16 +#define vhsums mm256_hsums_epi16 + +#define vpow2m1lt4 mm256_pow2m1lt4_epi16 +#define vpow2lt4 mm256_pow2lt4_epi16 + +#define vunpacklo _mm256_unpacklo_epi16 +#define vunpackhi _mm256_unpackhi_epi16 + + +// vv*() methods are for vi32_t = int32_t[8] + +#define vvi2u(x) x +#define vvu2i(x) x + +#define vvsetx _mm256_undefined_si256 +#define vvset1 _mm256_set1_epi32 +#define vvseta _mm256_set_epi32 +#define vvsetr _mm256_setr_epi32 +#define vvsetz _mm256_setzero_si256 +#define vvsetf() (vvset1(-1)) + +#define vvand vand +#define vvor vor +#define vvxor vxor +#define vvandnot vandnot + +#define vvadd _mm256_add_epi32 + +#define vvsrli _mm256_srli_epi32 + +#define vvsllv _mm256_sllv_epi32 + +#define vvextract _mm256_extract_epi32 +#define vvextractn mm256_extractn_epi32 + +#define vvinsert _mm256_insert_epi32 +#define vvinsertn mm256_insertn_epi32 + +#define vvmullo _mm256_mullo_epi32 + +#define vvpackus _mm256_packus_epi32 + + +static inline +__m256i mm256_setm_epi16(uint16_t m) +{ + const __m256i k = vsetr( + 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, -0x8000 + ); + return vcmpeq(vand(vset1((int16_t)m), k), k); +} + + +// see: 
https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-avx2/51805592#51805592 +static inline +__m256i mm256_sllv_epi16(__m256i x, __m256i n) +{ + const __m256i m = _mm256_set1_epi32(0xFFFF0000); + __m256i lo = _mm256_sllv_epi32(x, _mm256_andnot_si256(m, n)); + __m256i hi = _mm256_sllv_epi32( + _mm256_and_si256(m, x), + _mm256_srli_epi32(n, 16) + ); + return _mm256_blend_epi16(lo, hi, 0xAA); +} + + +// see: https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-avx2/51805592#51805592 +static inline +__m256i mm256_srlv_epi16(__m256i x, __m256i n) +{ + const __m256i m = _mm256_set1_epi32(0x0000FFFF); + __m256i lo = _mm256_srlv_epi32( + _mm256_and_si256(m, x), + _mm256_and_si256(m, n) + ); + __m256i hi = _mm256_srlv_epi32(x, _mm256_srli_epi32(n, 16)); + return _mm256_blend_epi16(lo, hi, 0xAA); +} + + +// see: https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-avx2/51805592#51805592 +static inline +__m256i mm256_srav_epi16(__m256i x, __m256i n) +{ + const __m256i m = _mm256_set1_epi32(0x0000FFFF); + __m256i lo = _mm256_srav_epi32( + _mm256_and_si256(m, x), + _mm256_and_si256(m, n) + ); + __m256i hi = _mm256_srav_epi32(x, _mm256_srli_epi32(n, 16)); + return _mm256_blend_epi16(lo, hi, 0xAA); +} + + +static inline +short mm256_extractn_epi16(__m256i x, const int i) +{ + int16_t AYMO_ALIGN_V256 x_m256i_i16[16]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i16, x); + return x_m256i_i16[i]; +} + + +static inline +__m256i mm256_insertn_epi16(__m256i x, short n, const int i) +{ + int16_t AYMO_ALIGN_V256 x_m256i_i16[16]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i16, x); + x_m256i_i16[i] = n; + return _mm256_load_si256((__m256i*)(void*)x_m256i_i16); +} + + +// Gathers 16x 16-bit words via 16x 8-bit (low) indexes +static inline +__m256i mm256_i16gather_epi16lo(const int16_t* v, __m256i i) +{ +#if (AYMO_CPU_X86_AVX2_GATHER16_STRATEGY == 2) + // 2x 32-bit gatherings, 16-bit words, smallest cache footprint + const __m256i sl = _mm256_set_epi8( + -1, -1, -1, 12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0, + -1, -1, -1, 12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0 + ); + const __m256i sh = _mm256_set_epi8( + -1, -1, -1, 14, -1, -1, -1, 10, -1, -1, -1, 6, -1, -1, -1, 2, + -1, -1, -1, 14, -1, -1, -1, 10, -1, -1, -1, 6, -1, -1, -1, 2 + ); + __m256i jh = _mm256_shuffle_epi8(i, sh); + __m256i rh = _mm256_i32gather_epi32((const int32_t*)(const void*)v, jh, 2); + rh = _mm256_slli_epi32(rh, 16); + __m256i jl = _mm256_shuffle_epi8(i, sl); + __m256i rl = _mm256_i32gather_epi32((const int32_t*)(const void*)v, jl, 2); + return _mm256_blend_epi16(rl, rh, 0xAA); + +#elif (CONFIG_AYMO_X86_AVX2_GATHER16_STRATEGY == 1) + // 1x 32-bit gathering, joint 16-bit words, squared cache footprint + const __m256i s = _mm256_set_epi8( + -1, -1, 14, 12, -1, -1, 10, 8, -1, -1, 6, 4, -1, -1, 2, 0, + -1, -1, 14, 12, -1, -1, 10, 8, -1, -1, 6, 4, -1, -1, 2, 0 + ); + __m256i j = _mm256_shuffle_epi8(i, s); + return _mm256_i32gather_epi32((const int32_t*)(const void *)v, j, 4); + +#else // CONFIG_AYMO_X86_AVX2_GATHER16_STRATEGY + // Plain C lookup, smallest cache footprint + return vsetr( + v[vextract(i, 0x0)], + v[vextract(i, 0x1)], + v[vextract(i, 0x2)], + v[vextract(i, 0x3)], + v[vextract(i, 0x4)], + v[vextract(i, 0x5)], + v[vextract(i, 0x6)], + v[vextract(i, 0x7)], + v[vextract(i, 0x8)], + v[vextract(i, 0x9)], + v[vextract(i, 0xA)], + v[vextract(i, 0xB)], + v[vextract(i, 0xC)], + v[vextract(i, 0xD)], + 
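/* _mm256_extract_epi16 requires an immediate lane index, hence the fully unrolled fallback lookup */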
v[vextract(i, 0xE)], + v[vextract(i, 0xF)] + ); +#endif // CONFIG_AYMO_X86_AVX2_GATHER16_STRATEGY +} + + +// see: https://stackoverflow.com/questions/60108658/fastest-method-to-calculate-sum-of-all-packed-32-bit-integers-using-avx512-or-av/ +static inline +int mm_hsum_epi32(__m128i x) +{ + __m128i hi64 = _mm_unpackhi_epi64(x, x); + __m128i sum64 = _mm_add_epi32(hi64, x); + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i sum32 = _mm_add_epi32(sum64, hi32); + return _mm_cvtsi128_si32(sum32); +} + + +static inline +int mm256_hsum_epi32(__m256i x) +{ + __m128i lo128 = _mm256_castsi256_si128(x); + __m128i hi128 = _mm256_extracti128_si256(x, 1); + __m128i sum32 = _mm_add_epi32(lo128, hi128); + return mm_hsum_epi32(sum32); +} + + +// see: https://stackoverflow.com/questions/55057933/simd-accumulate-adjacent-pairs +static inline +int mm256_hsum_epi16(__m256i x) +{ + __m256i sum16 = _mm256_madd_epi16(x, vset1(1)); + return mm256_hsum_epi32(sum16); +} + + +static inline +int mm256_hsums_epi16(__m256i x) +{ + __m256i hs16 = _mm256_hadds_epi16(x, vsetz()); + __m256i sum16 = _mm256_unpacklo_epi16(hs16, vsetz()); + return mm256_hsum_epi32(sum16); +} + + +// 0 <= x < 4 --> (1 << (x - 1)) --> 0, 1, 2, 4 +static inline +__m256i mm256_pow2m1lt4_epi16(__m256i x) +{ + return vsub(x, vcmpgt(x, vset1(2))); +} + + +// 0 <= x < 4 --> (1 << x) +static inline +__m256i mm256_pow2lt4_epi16(__m256i x) +{ + __m256i a = vadd(x, vset1(1)); + __m256i b = vu2i(vsubsu(vi2u(x), vi2u(vset1(2)))); + __m256i c = vmululo(b, b); + return vadd(a, c); +} + + +static inline +long mm256_extractn_epi32(__m256i x, const int i) +{ + int32_t AYMO_ALIGN_V256 x_m256i_i32[8]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i32, x); + return x_m256i_i32[i]; +} + + +static inline +__m256i mm256_insertn_epi32(__m256i x, long n, const int i) +{ + int32_t AYMO_ALIGN_V256 x_m256i_i32[8]; + _mm256_store_si256((__m256i*)(void*)x_m256i_i32, x); + x_m256i_i32[i] = n; + return _mm256_load_si256((__m256i*)(void*)x_m256i_i32); +} + + +static inline +float mm256_extractn_ps(__m256 x, const int i) +{ + float AYMO_ALIGN_V256 x_m256_f32[8]; + _mm256_store_ps(x_m256_f32, x); + return x_m256_f32[i]; +} + + +static inline +__m256 mm256_insertn_ps(__m256 x, float f, const int i) +{ + float AYMO_ALIGN_V256 x_m256_f32[8]; + _mm256_store_ps(x_m256_f32, x); + x_m256_f32[i] = f; + return _mm256_load_ps(x_m256_f32); +} + + +static inline +int16_t clamp16(int x) +{ + if (x < INT16_MIN) { + return (int16_t)INT16_MIN; + } + if (x >= INT16_MAX) { + return (int16_t)INT16_MAX; + } + return (int16_t)x; +} + + +// Finds first set bit = Counts trailing zeros +// Emulates the BSD function +static inline +int uffsll(unsigned long long x) +{ +#if defined(_MSC_VER) + unsigned long i = 0; + if (_BitScanForward64(&i, x)) { + return (int)(i + 1); + } + return 0; + +#elif (defined(__GNUC__) || defined(__clang__)) + return __builtin_ffsll((long long)x); + +#else + if (x) { + int i = 0; + do { + ++i; + x <<= 1; + } while(x); + return (64 - i); + } + return 0; +#endif +} + + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#endif // _include_aymo_cpu_x86_avx2_inline_h diff --git a/include/aymo_cpu_x86_sse41.h b/include/aymo_cpu_x86_sse41.h new file mode 100644 index 0000000..7e12d01 --- /dev/null +++ b/include/aymo_cpu_x86_sse41.h @@ -0,0 +1,46 @@ +// CPU-specific header file for x86 SSE4.1. +// DO NOT include this file; #include "aymo_cpu.h" instead. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. 
+ +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_cpu_x86_sse41_h +#define _include_aymo_cpu_x86_sse41_h + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +typedef __m128i vi16x8_t; +typedef __m128i vu16x8_t; + +typedef __m128i vi32x4_t; +typedef __m128i vu32x4_t; + +typedef __m128 vf32x4_t; + + +#ifndef AYMO_ALIGN_V128 + #define AYMO_ALIGN_V128 AYMO_ALIGN(16) +#endif + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_cpu_x86_sse41_h diff --git a/include/aymo_cpu_x86_sse41_inline.h b/include/aymo_cpu_x86_sse41_inline.h new file mode 100644 index 0000000..13f2a13 --- /dev/null +++ b/include/aymo_cpu_x86_sse41_inline.h @@ -0,0 +1,426 @@ +// CPU-specific inline methods for x86 SSE4.1. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_cpu_x86_sse41_inline_h +#define _include_aymo_cpu_x86_sse41_inline_h + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef AYMO_CPU_X86_SSE41_GATHER16_STRATEGY + #define AYMO_CPU_X86_SSE41_GATHER16_STRATEGY 1 +#endif + + +// Generic CPU shorthands + +#define vsfence _mm_sfence + + +// SIMD type shorthands +typedef vi16x8_t vi16_t; +typedef vu16x8_t vu16_t; +typedef vi32x4_t vi32_t; + + +// v*() methods are for vi16_t = int16_t[8] + +#define vi2u(x) x +#define vu2i(x) x + +#define vload _mm_loadu_si128 +#define vstore _mm_storeu_si128 +#define vstorelo _mm_storel_epi64 + +#define vsetx _mm_undefined_si128 +#define vset1 _mm_set1_epi16 +#define vseta _mm_set_epi16 +#define vsetr _mm_setr_epi16 +#define vsetz _mm_setzero_si128 +#define vsetf() (vset1(-1)) +#define vsetm mm_setm_epi16 + +#define vnot(x) (vxor((x), vsetf())) +#define vand _mm_and_si128 +#define vor _mm_or_si128 +#define vxor _mm_xor_si128 +#define vandnot _mm_andnot_si128 // ~A & B +#define vblendi _mm_blend_epi16 +#define vblendv _mm_blendv_epi8 + +#define vcmpeq _mm_cmpeq_epi16 +#define vcmpgt _mm_cmpgt_epi16 +#define vcmpz(x) (vcmpeq((x), vsetz())) +#define vcmpp(x) (vcmpgt((x), vsetz())) +#define vcmpn(x) (vcmpgt(vsetz(), (x))) + +#define vadd _mm_add_epi16 +#define vaddsi _mm_adds_epi16 +#define vaddsu _mm_adds_epu16 + +#define vsub _mm_sub_epi16 +#define vsubsi _mm_subs_epi16 +#define vsubsu _mm_subs_epu16 +#define vneg(x) (vsub(vsetz(), (x))) + +#define vslli _mm_slli_epi16 +#define vsrli _mm_srli_epi16 +#define vsrai _mm_srai_epi16 +#define vsllv mm_sllv_epi16 +#define vsrlv mm_srlv_epi16 +#define vsrav mm_srav_epi16 + +#define vmulihi _mm_mulhi_epi16 +#define vmuluhi _mm_mulhi_epu16 +#define vmulhrs _mm_mulhrs_epi16 + +#define vmulilo _mm_mullo_epi16 +#define vmululo _mm_mullo_epi16 + +#define vmadd _mm_madd_epi16 + +#define vmini _mm_min_epi16 +#define vminu _mm_min_epu16 + +#define vmaxi _mm_max_epi16 +#define vmaxu _mm_max_epu16 + +#define vextract _mm_extract_epi16 +#define vextractn mm_extractn_epi16 +#define vextractv(x,i) (((const int16_t*)(const void*)&(x))[(i)]) + +#define vinsert _mm_insert_epi16 +#define vinsertn mm_insertn_epi16 +#define vinsertv(x,n,i) {((int16_t*)(void*)&(x))[(i)] = (n);} + +#define vgather mm_i16gather_epi16lo + +#define vhsum mm_hsum_epi16 +#define vhsums mm_hsums_epi16 + +#define vpow2m1lt4 mm_pow2m1lt4_epi16 +#define vpow2lt4 mm_pow2lt4_epi16 + +#define vshufflelo _mm_shufflelo_epi16 +#define vshufflehi _mm_shufflehi_epi16 +#define valignr _mm_alignr_epi8 + +#define vunpacklo _mm_unpacklo_epi16 +#define vunpackhi _mm_unpackhi_epi16 + +#define v2vv _mm_cvtepi16_epi32 +#define vlo2vv(x) (v2vv(x)) +#define vhi2vv(x) (v2vv(vvshuffle((x), KSHUFFLE(3, 2, 3, 2)))) + + +// vv*() methods are for vi32_t = int32_t[4] + +#define vvi2u(x) x +#define vvu2i(x) x + +#define vvsetx _mm_undefined_si128 +#define vvset1 _mm_set1_epi32 +#define vvseta _mm_set_epi32 +#define vvsetr _mm_setr_epi32 +#define vvsetz _mm_setzero_si128 +#define vvsetf() (vvset1(-1)) + +#define vvand vand +#define vvor vor +#define vvxor vxor +#define vvandnot vandnot + +#define vvadd _mm_add_epi32 + +#define vvsrli _mm_srli_epi32 + +#define vvsllv mm_sllv_epi32 + +#define vvextract _mm_extract_epi32 +#define vvextractn mm_extractn_epi32 + +#define vvinsert _mm_insert_epi32 +#define vvinsertn mm_insertn_epi32 + +#define vvmullo _mm_mullo_epi32 + +#define vvshuffle _mm_shuffle_epi32 +#define KSHUFFLE _MM_SHUFFLE +#define vvswap(x) 
vvshuffle((x), KSHUFFLE(1, 0, 3, 2)) + +#define vvpacks _mm_packs_epi32 +#define vvpackus _mm_packus_epi32 + + +static inline +__m128i mm_setm_epi16(uint8_t m) +{ + const __m128i k = vsetr(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + return vcmpeq(vand(vset1((int16_t)(uint16_t)m), k), k); +} + + +static inline +__m128i mm_sllv_epi16(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + uint16_t AYMO_ALIGN_V128 n_m128i_u16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u16, n); + x_m128i_i16[0] <<= n_m128i_u16[0]; + x_m128i_i16[1] <<= n_m128i_u16[1]; + x_m128i_i16[2] <<= n_m128i_u16[2]; + x_m128i_i16[3] <<= n_m128i_u16[3]; + x_m128i_i16[4] <<= n_m128i_u16[4]; + x_m128i_i16[5] <<= n_m128i_u16[5]; + x_m128i_i16[6] <<= n_m128i_u16[6]; + x_m128i_i16[7] <<= n_m128i_u16[7]; + return _mm_load_si128((__m128i*)(void*)x_m128i_i16); +} + + +// see: https://stackoverflow.com/questions/51789685/reproduce-mm256-sllv-epi16-and-mm256-sllv-epi8-in-sse41/51805592#51805592 +static inline +__m128i mm_srlv_epi16(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + uint16_t AYMO_ALIGN_V128 x_m128i_u16[8]; + uint16_t AYMO_ALIGN_V128 n_m128i_u16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_u16, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u16, n); + x_m128i_u16[0] >>= n_m128i_u16[0]; + x_m128i_u16[1] >>= n_m128i_u16[1]; + x_m128i_u16[2] >>= n_m128i_u16[2]; + x_m128i_u16[3] >>= n_m128i_u16[3]; + x_m128i_u16[4] >>= n_m128i_u16[4]; + x_m128i_u16[5] >>= n_m128i_u16[5]; + x_m128i_u16[6] >>= n_m128i_u16[6]; + x_m128i_u16[7] >>= n_m128i_u16[7]; + return _mm_load_si128((__m128i*)(void*)x_m128i_u16); +} + + +static inline +__m128i mm_srav_epi16(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + uint16_t AYMO_ALIGN_V128 n_m128i_u16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u16, n); + x_m128i_i16[0] >>= n_m128i_u16[0]; + x_m128i_i16[1] >>= n_m128i_u16[1]; + x_m128i_i16[2] >>= n_m128i_u16[2]; + x_m128i_i16[3] >>= n_m128i_u16[3]; + x_m128i_i16[4] >>= n_m128i_u16[4]; + x_m128i_i16[5] >>= n_m128i_u16[5]; + x_m128i_i16[6] >>= n_m128i_u16[6]; + x_m128i_i16[7] >>= n_m128i_u16[7]; + return _mm_load_si128((__m128i*)(void*)x_m128i_i16); +} + + +static inline +short mm_extractn_epi16(__m128i x, const int i) +{ + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + return x_m128i_i16[i]; +} + + +static inline +__m128i mm_insertn_epi16(__m128i x, short n, const int i) +{ + int16_t AYMO_ALIGN_V128 x_m128i_i16[8]; + _mm_store_si128((__m128i*)(void*)x_m128i_i16, x); + x_m128i_i16[i] = n; + return _mm_load_si128((__m128i*)(void*)x_m128i_i16); +} + + +// Gathers 16x 16-bit words via 16x 8-bit (low) indexes +static inline +__m128i mm_i16gather_epi16lo(const int16_t* v, __m128i i) +{ + // Plain C lookup, smallest cache footprint + uint8_t AYMO_ALIGN_V128 i_m128i_u8[16]; + _mm_store_si128((__m128i*)(void*)i_m128i_u8, i); + int16_t* r_m128i_i16 = (int16_t*)(void*)i_m128i_u8; + r_m128i_i16[0] = v[i_m128i_u8[0x0]]; + r_m128i_i16[1] = v[i_m128i_u8[0x2]]; + r_m128i_i16[2] = v[i_m128i_u8[0x4]]; + r_m128i_i16[3] = v[i_m128i_u8[0x6]]; + r_m128i_i16[4] = v[i_m128i_u8[0x8]]; + r_m128i_i16[5] = v[i_m128i_u8[0xA]]; + r_m128i_i16[6] = v[i_m128i_u8[0xC]]; + 
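/* even byte offsets pick the low byte of each 16-bit index lane (x86 is little-endian) */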
r_m128i_i16[7] = v[i_m128i_u8[0xE]]; + return _mm_load_si128((__m128i*)(void*)r_m128i_i16); +} + + +// see: https://stackoverflow.com/questions/60108658/fastest-method-to-calculate-sum-of-all-packed-32-bit-integers-using-avx512-or-av/ +static inline +int mm_hsum_epi32(__m128i x) +{ + __m128i hi64 = _mm_unpackhi_epi64(x, x); + __m128i sum64 = _mm_add_epi32(hi64, x); + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i sum32 = _mm_add_epi32(sum64, hi32); + return _mm_cvtsi128_si32(sum32); +} + + +// see: https://stackoverflow.com/questions/55057933/simd-accumulate-adjacent-pairs +static inline +int mm_hsum_epi16(__m128i x) +{ + __m128i sum16 = _mm_madd_epi16(x, vset1(1)); + return mm_hsum_epi32(sum16); +} + + +static inline +int mm_hsums_epi16(__m128i x) +{ + __m128i hs16 = _mm_hadds_epi16(x, vsetz()); + __m128i sum16 = _mm_unpacklo_epi16(hs16, vsetz()); + return mm_hsum_epi32(sum16); +} + + +// 0 <= x < 4 --> (1 << (x - 1)) --> 0, 1, 2, 4 +static inline +__m128i mm_pow2m1lt4_epi16(__m128i x) +{ + return vsub(x, vcmpgt(x, vset1(2))); +} + + +// 0 <= x < 4 --> (1 << x) +static inline +__m128i mm_pow2lt4_epi16(__m128i x) +{ + __m128i a = vadd(x, vset1(1)); + __m128i b = vu2i(vsubsu(vi2u(x), vi2u(vset1(2)))); + __m128i c = vmulilo(b, b); + return vadd(a, c); +} + + +static inline +__m128i mm_sllv_epi32(__m128i x, __m128i n) +{ + // There's no quick way to perform variable bit shifts; resort to basic x86 + int32_t AYMO_ALIGN_V128 x_m128i_i32[4]; + uint32_t AYMO_ALIGN_V128 n_m128i_u32[4]; + _mm_store_si128((__m128i*)(void*)x_m128i_i32, x); + _mm_store_si128((__m128i*)(void*)n_m128i_u32, n); + x_m128i_i32[0] <<= n_m128i_u32[0]; + x_m128i_i32[1] <<= n_m128i_u32[1]; + x_m128i_i32[2] <<= n_m128i_u32[2]; + x_m128i_i32[3] <<= n_m128i_u32[3]; + return _mm_load_si128((__m128i*)(void*)x_m128i_i32); +} + + +static inline +long mm_extractn_epi32(__m128i x, const int i) +{ + int32_t AYMO_ALIGN_V128 x_m128i_i32[4]; + _mm_store_si128((__m128i*)(void*)x_m128i_i32, x); + return x_m128i_i32[i]; +} + + +static inline +__m128i mm_insertn_epi32(__m128i x, long n, const int i) +{ + int32_t AYMO_ALIGN_V128 x_m128i_i32[4]; + _mm_store_si128((__m128i*)(void*)x_m128i_i32, x); + x_m128i_i32[i] = n; + return _mm_load_si128((__m128i*)(void*)x_m128i_i32); +} + + +static inline +int16_t clamp16(int x) +{ + if (x < INT16_MIN) { + return (int16_t)INT16_MIN; + } + if (x >= INT16_MAX) { + return (int16_t)INT16_MAX; + } + return (int16_t)x; +} + + +// Finds first set bit = Counts trailing zeros +// Emulates the BSD function +static inline +int uffsll(unsigned long long x) +{ +#if defined(_MSC_VER) + unsigned long i = 0; +#if defined(_WIN32) + if (_BitScanForward(&i, (uint32_t)x)) { + return (int)(i + 1); + } + if (_BitScanForward(&i, (uint32_t)(x >> 32))) { + return (int)(i + 33); + } +#else + if (_BitScanForward64(&i, (unsigned long long)x)) { + return (int)(i + 1); + } +#endif + return 0; + +#elif (defined(__GNUC__) || defined(__clang__)) + return __builtin_ffsll((long long)x); + +#else + if (x) { + int i = 0; + do { + ++i; + x <<= 1; + } while(x); + return (64 - i); + } + return 0; +#endif +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_cpu_x86_sse41_inline_h diff --git a/include/aymo_file.h b/include/aymo_file.h new file mode 100644 index 0000000..985fa0c --- /dev/null +++ b/include/aymo_file.h @@ -0,0 +1,42 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_utils_h +#define _include_aymo_utils_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef AYMO_FILE_CHUNK_SIZE +#define AYMO_FILE_CHUNK_SIZE (1000000uL) // 1 MB +#endif + + +AYMO_PUBLIC int aymo_file_save(const char* pathp, const void* datap, size_t size); +AYMO_PUBLIC int aymo_file_load(const char* pathp, void** datapp, size_t* sizep); +AYMO_PUBLIC void aymo_file_unload(void* datap); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_utils_h diff --git a/include/aymo_score.h b/include/aymo_score.h new file mode 100644 index 0000000..f51567e --- /dev/null +++ b/include/aymo_score.h @@ -0,0 +1,145 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_score_h +#define _include_aymo_score_h + +#include "aymo_cc.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +enum aymo_score_type { + aymo_score_type_avd, + aymo_score_type_dro, + aymo_score_type_imf, + aymo_score_type_raw, + aymo_score_type_unknown +}; + + +#define AYMO_SCORE_OPL_RATE_DEFAULT 49716u + +#define AYMO_SCORE_FLAG_EVENT 1u +#define AYMO_SCORE_FLAG_DELAY 2u +#define AYMO_SCORE_FLAG_EOF 4u + +struct aymo_score_status { + uint32_t delay; // after + uint16_t address; + uint8_t value; + uint8_t flags; +}; + +struct aymo_score_instance; // forward + +typedef int (*aymo_score_ctor_f)( + struct aymo_score_instance* score +); + +typedef void (*aymo_score_dtor_f)( + struct aymo_score_instance* score +); + +typedef int (*aymo_score_load_f)( + struct aymo_score_instance* score, + const void* data, + uint32_t size +); + +typedef void (*aymo_score_unload_f)( + struct aymo_score_instance* score +); + +typedef struct aymo_score_status* (*aymo_score_get_status_f)( + struct aymo_score_instance* score +); + +typedef void (*aymo_score_restart_f)( + struct aymo_score_instance* score +); + +typedef uint32_t (*aymo_score_tick_f)( + struct aymo_score_instance* score, + uint32_t count +); + +struct aymo_score_vt { + const char* class_name; + aymo_score_ctor_f ctor; + aymo_score_dtor_f dtor; + aymo_score_load_f load; + aymo_score_unload_f unload; + aymo_score_get_status_f get_status; + aymo_score_restart_f restart; + aymo_score_tick_f tick; +}; + +struct aymo_score_instance { + const struct aymo_score_vt* vt; +}; + + +AYMO_PUBLIC int aymo_score_ctor( + struct aymo_score_instance* score +); + +AYMO_PUBLIC void aymo_score_dtor( + struct aymo_score_instance* score +); + +AYMO_PUBLIC int aymo_score_load( + struct aymo_score_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_unload( + struct aymo_score_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_get_status( + struct aymo_score_instance* score +); + +AYMO_PUBLIC void aymo_score_restart( + struct aymo_score_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_tick( + struct aymo_score_instance* score, + uint32_t count +); + + +AYMO_PUBLIC enum aymo_score_type aymo_score_ext_to_type( + const char *tag +); + +AYMO_PUBLIC const struct aymo_score_vt* aymo_score_type_to_vt( + enum aymo_score_type score_type +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_h diff --git a/include/aymo_score_avd.h b/include/aymo_score_avd.h new file mode 100644 index 0000000..dfe28b7 --- /dev/null +++ b/include/aymo_score_avd.h @@ -0,0 +1,89 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_score_avd_h +#define _include_aymo_score_avd_h + +#include "aymo_score.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +struct aymo_score_avd_event { + uint8_t address_hi; + uint8_t address_lo; + uint8_t value; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +struct aymo_score_avd_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_avd_event* events; + uint32_t length; + uint32_t index; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_avd_vt; + + +AYMO_PUBLIC int aymo_score_avd_ctor( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC void aymo_score_avd_dtor( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC int aymo_score_avd_load( + struct aymo_score_avd_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_avd_unload( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_avd_get_status( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC void aymo_score_avd_restart( + struct aymo_score_avd_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_avd_tick( + struct aymo_score_avd_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_avd_h diff --git a/include/aymo_score_dro.h b/include/aymo_score_dro.h new file mode 100644 index 0000000..8793508 --- /dev/null +++ b/include/aymo_score_dro.h @@ -0,0 +1,166 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_score_dro_h +#define _include_aymo_score_dro_h + +#include "aymo_score.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// See: https://moddingwiki.shikadi.net/wiki/DRO_Format + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +#define AYMO_DRO_SIGNATURE "DBRAWOPL" + +// Common DRO header, at the very beginning of the file +// NOTE: v0.1 == v1.0 +struct aymo_score_dro_header { + char signature[8u]; + uint16_t version_major; + uint16_t version_minor; +// struct aymo_score_dro_v?_header versioned_header; +}; + + +// DRO v1.0 hardware type +enum aymo_score_dro_v1_hardware_type { + aymo_score_dro_v1_hardware_type_opl2 = 0, + aymo_score_dro_v1_hardware_type_opl2x2, + aymo_score_dro_v1_hardware_type_opl3, +}; + +// DRO v1.0 sub-header +struct aymo_score_dro_v1_header { + uint32_t length_ms; + uint32_t length_bytes; + uint8_t hardware_type; + uint8_t hardware_extra[3]; +}; + +// DRO v1.0 special codes +enum aymo_score_dro_v1_code { + aymo_score_dro_v1_code_delay_byte = 0, + aymo_score_dro_v1_code_delay_word, + aymo_score_dro_v1_code_switch_low, + aymo_score_dro_v1_code_switch_high, + aymo_score_dro_v1_code_escape, + aymo_score_dro_v1_code_invalid = 0xFF +}; + + +// DRO v2.0 hardware type +enum aymo_score_dro_v2_hardware_type { + aymo_score_dro_v2_hardware_type_opl2 = 0, + aymo_score_dro_v2_hardware_type_opl2x2, + aymo_score_dro_v2_hardware_type_opl3, +}; + +// DRO v2.0 format +enum aymo_score_dro_v2_format { + aymo_score_dro_v2_format_interleaved = 0 +}; + +// DRO v2.0 sub-header +struct aymo_score_dro_v2_header { + uint32_t length_pairs; + uint32_t length_ms; + uint8_t hardware_type; + uint8_t format; + uint8_t compression; + uint8_t short_delay_code; + uint8_t long_delay_code; + uint8_t codemap_length; +// uint8_t codemap_table[codemap_length]; +}; + + +// score event pair +struct aymo_score_dro_pair { + uint8_t code; + uint8_t value; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +// Score player score +struct aymo_score_dro_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_dro_header *header; + const struct aymo_score_dro_v1_header *v1_header; + const struct aymo_score_dro_v2_header *v2_header; + const uint8_t* codemap; + const uint8_t* events; + uint32_t opl_rate; + uint32_t division; + uint32_t length; + uint32_t offset; + uint8_t address_hi; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_dro_vt; + + +AYMO_PUBLIC int aymo_score_dro_init_specific( + struct aymo_score_dro_instance* score, + uint32_t opl_rate +); + +AYMO_PUBLIC int aymo_score_dro_ctor( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC void aymo_score_dro_dtor( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC int aymo_score_dro_load( + struct aymo_score_dro_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_dro_unload( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_dro_get_status( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC void aymo_score_dro_restart( + struct aymo_score_dro_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_dro_tick( + struct aymo_score_dro_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_dro_h diff --git a/include/aymo_score_imf.h b/include/aymo_score_imf.h new file mode 100644 index 0000000..5699f00 --- /dev/null +++ b/include/aymo_score_imf.h @@ -0,0 +1,131 @@ +/* +AYMO - Accelerated YaMaha Operator 
+Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_score_imf_h +#define _include_aymo_score_imf_h + +#include "aymo_score.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// Common IMF score event rates +#define aymo_score_imf_rate_280Hz 280u +#define aymo_score_imf_rate_duke_nukem_ii aymo_score_imf_rate_280Hz + +#define aymo_score_imf_rate_560Hz 560u +#define aymo_score_imf_rate_bio_menace aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_commander_keen aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_cosmos_cosmic_adventures aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_monster_bash aymo_score_imf_rate_560Hz +#define aymo_score_imf_rate_major_stryker aymo_score_imf_rate_560Hz + +#define aymo_score_imf_rate_700Hz 700u +#define aymo_score_imf_rate_blake_stone aymo_score_imf_rate_700Hz +#define aymo_score_imf_rate_operation_body_count aymo_score_imf_rate_700Hz +#define aymo_score_imf_rate_wolfenstein_3d aymo_score_imf_rate_700Hz +#define aymo_score_imf_rate_corridor_7 aymo_score_imf_rate_700Hz + +#define aymo_score_imf_rate_default aymo_score_imf_rate_560Hz + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +struct aymo_score_imf_event { + uint8_t address_lo; + uint8_t value; + uint8_t delay_lo; + uint8_t delay_hi; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +struct aymo_score_imf_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_imf_event* events; + uint32_t imf_rate; + uint32_t opl_rate; + uint32_t division; + uint32_t length; + uint32_t index; + uint8_t type; + uint8_t address_hi; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_imf_vt; + + +AYMO_PUBLIC uint8_t aymo_score_imf_guess_type( + const void* data, + uint32_t size +); + +AYMO_PUBLIC int aymo_score_imf_ctor_specific( + struct aymo_score_imf_instance* score, + uint32_t imf_rate, + uint32_t opl_rate +); + +AYMO_PUBLIC int aymo_score_imf_ctor( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC void aymo_score_imf_dtor( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC int aymo_score_imf_load_specific( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size, + uint8_t type +); + +AYMO_PUBLIC int aymo_score_imf_load( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_imf_unload( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_imf_get_status( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC void aymo_score_imf_restart( + struct aymo_score_imf_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_imf_tick( + struct aymo_score_imf_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_imf_h diff --git a/include/aymo_score_raw.h b/include/aymo_score_raw.h new file mode 100644 index 
0000000..56b72e7 --- /dev/null +++ b/include/aymo_score_raw.h @@ -0,0 +1,99 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_score_raw_h +#define _include_aymo_score_raw_h + +#include "aymo_score.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +#define AYMO_SCORE_RAW_RAWADATA "RAWADATA" +#define AYMO_SCORE_RAW_REFCLK 1193180L + +struct aymo_score_raw_header { + uint8_t rawadata[8]; + uint16_t clock; +}; + +struct aymo_score_raw_event { + uint8_t data; + uint8_t ctrl; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +struct aymo_score_raw_instance { + const struct aymo_score_vt* vt; + struct aymo_score_status status; + const struct aymo_score_raw_event* events; + uint32_t raw_rate; + uint32_t division; + uint32_t length; + uint32_t index; + uint16_t clock; + uint16_t clock_initial; + uint8_t address_hi; +}; + + +AYMO_PUBLIC const struct aymo_score_vt aymo_score_raw_vt; + + +AYMO_PUBLIC int aymo_score_raw_ctor( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC void aymo_score_raw_dtor( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC int aymo_score_raw_load( + struct aymo_score_raw_instance* score, + const void* data, + uint32_t size +); + +AYMO_PUBLIC void aymo_score_raw_unload( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC struct aymo_score_status* aymo_score_raw_get_status( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC void aymo_score_raw_restart( + struct aymo_score_raw_instance* score +); + +AYMO_PUBLIC uint32_t aymo_score_raw_tick( + struct aymo_score_raw_instance* score, + uint32_t count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_score_raw_h diff --git a/include/aymo_sys_linux.h b/include/aymo_sys_linux.h new file mode 100644 index 0000000..0673d7d --- /dev/null +++ b/include/aymo_sys_linux.h @@ -0,0 +1,25 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_sys_linux_h +#define _include_aymo_sys_linux_h + +// TODO: + +#endif // _include_aymo_sys_linux_h diff --git a/include/aymo_sys_windows.h b/include/aymo_sys_windows.h new file mode 100644 index 0000000..da75692 --- /dev/null +++ b/include/aymo_sys_windows.h @@ -0,0 +1,25 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_sys_windows_h +#define _include_aymo_sys_windows_h + +// TODO: + +#endif // _include_aymo_sys_windows_h diff --git a/include/aymo_tda8425.h b/include/aymo_tda8425.h new file mode 100644 index 0000000..e372208 --- /dev/null +++ b/include/aymo_tda8425.h @@ -0,0 +1,45 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_tda8425_h +#define _include_aymo_tda8425_h + +#include "aymo_tda8425_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC const struct aymo_tda8425_math* aymo_tda8425_math; + + +AYMO_PUBLIC void aymo_tda8425_boot(const struct aymo_tda8425_math* math); +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_tda8425_get_vt(const char* cpu_ext); +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_tda8425_get_best_vt(void); + +AYMO_PUBLIC uint32_t aymo_tda8425_get_sizeof(struct aymo_tda8425_chip* chip); +AYMO_PUBLIC void aymo_tda8425_ctor(struct aymo_tda8425_chip* chip, float sample_rate); +AYMO_PUBLIC void aymo_tda8425_dtor(struct aymo_tda8425_chip* chip); +AYMO_PUBLIC uint8_t aymo_tda8425_read(struct aymo_tda8425_chip* chip, uint16_t address); +AYMO_PUBLIC void aymo_tda8425_write(struct aymo_tda8425_chip* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_tda8425_process_f32(struct aymo_tda8425_chip* chip, uint32_t count, const float x[], float y[]); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_tda8425_h diff --git a/include/aymo_tda8425_arm_neon.h b/include/aymo_tda8425_arm_neon.h new file mode 100644 index 0000000..4cf36fb --- /dev/null +++ b/include/aymo_tda8425_arm_neon.h @@ -0,0 +1,107 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_tda8425_arm_neon_h +#define _include_aymo_tda8425_arm_neon_h + +#include "aymo_cpu.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_tda8425_arm_neon_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // 128-bit data + vf32x4_t hb1l; + vf32x4_t hb1r; + vf32x4_t hb2l; + vf32x4_t hb2r; + vf32x4_t kb2; + + vf32x4_t ha1l; + vf32x4_t ha1r; + vf32x4_t ha2l; + vf32x4_t ha2r; + vf32x4_t ka2; + + vf32x4_t hb0l; + vf32x4_t hb0r; + vf32x4_t kb1; + + vf32x4_t ha0l; + vf32x4_t ha0r; + vf32x4_t ka1; + + vf32x4_t kb0; + + // 64-bit data + vf32x2_t krl; + vf32x2_t klr; + + vf32x2_t kv; + + // 32-bit data + float sample_rate; // [Hz] + float pseudo_c1; // [F] + float pseudo_c2; // [F] + + // 8-bit data + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; + uint8_t pad32_[2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_tda8425_arm_neon_h diff --git a/include/aymo_tda8425_common.h b/include/aymo_tda8425_common.h new file mode 100644 index 0000000..4803f76 --- /dev/null +++ b/include/aymo_tda8425_common.h @@ -0,0 +1,84 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_common_h +#define _include_aymo_tda8425_common_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +// Object-oriented API + +struct aymo_tda8425_chip; // forward +typedef uint32_t (*aymo_tda8425_get_sizeof_f)(void); +typedef void (*aymo_tda8425_ctor_f)(struct aymo_tda8425_chip* chip, float sample_rate); +typedef void (*aymo_tda8425_dtor_f)(struct aymo_tda8425_chip* chip); +typedef uint8_t (*aymo_tda8425_read_f)(struct aymo_tda8425_chip* chip, uint16_t address); +typedef void (*aymo_tda8425_write_f)(struct aymo_tda8425_chip* chip, uint16_t address, uint8_t value); +typedef void (*aymo_tda8425_process_f32_f)(struct aymo_tda8425_chip* chip, uint32_t count, const float x[], float y[]); + +struct aymo_tda8425_vt { + const char* class_name; + aymo_tda8425_get_sizeof_f get_sizeof; + aymo_tda8425_ctor_f ctor; + aymo_tda8425_dtor_f dtor; + aymo_tda8425_read_f read; + aymo_tda8425_write_f write; + aymo_tda8425_process_f32_f process_f32; +}; + +struct aymo_tda8425_chip { + const struct aymo_tda8425_vt* vt; +}; + + +// Math API + +typedef double (*aymo_tda8425_math1_f)(double a); +typedef double (*aymo_tda8425_math2_f)(double a, double b); + +struct aymo_tda8425_math { + aymo_tda8425_math1_f cos; + aymo_tda8425_math1_f fabs; + aymo_tda8425_math1_f log10; + aymo_tda8425_math2_f pow; + aymo_tda8425_math1_f sqrt; + aymo_tda8425_math1_f tan; +}; + +// Defines the default math functions, after #include +#define AYMO_TDA8425_DEFINE_MATH_DEFAULT(name__) \ + const struct aymo_tda8425_math name__ = { (cos), (fabs), (log10), (pow), (sqrt), (tan) } + + +AYMO_PUBLIC const int8_t aymo_tda8425_reg_v_to_db[64]; +AYMO_PUBLIC const int8_t aymo_tda8425_reg_ba_to_db[16]; +AYMO_PUBLIC const int8_t aymo_tda8425_reg_tr_to_db[16]; + +AYMO_PUBLIC const float aymo_tda8425_pseudo_preset_c1[3]; +AYMO_PUBLIC const float aymo_tda8425_pseudo_preset_c2[3]; + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_tda8425_common_h diff --git a/include/aymo_tda8425_none.h b/include/aymo_tda8425_none.h new file mode 100644 index 0000000..ada3d05 --- /dev/null +++ b/include/aymo_tda8425_none.h @@ -0,0 +1,61 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_none_h +#define _include_aymo_tda8425_none_h + +#include "aymo_cpu.h" + +#include "TDA8425_emu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_NONE_##_token_ +#define aymo_(_token_) aymo_tda8425_none_##_token_ + +#define AYMO_TDA8425_NONE_DELAY 4 + + +struct aymo_(chip) { + TDA8425_Chip emu; + float yh[AYMO_TDA8425_NONE_DELAY][2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_tda8425_none_h diff --git a/include/aymo_tda8425_x86_avx2.h b/include/aymo_tda8425_x86_avx2.h new file mode 100644 index 0000000..618feda --- /dev/null +++ b/include/aymo_tda8425_x86_avx2.h @@ -0,0 +1,100 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_x86_avx2_h +#define _include_aymo_tda8425_x86_avx2_h + +#include "aymo_cpu.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_X86_AVX2_##_token_ +#define aymo_(_token_) aymo_tda8425_x86_avx2_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V256 +struct aymo_(chip) { + // Vector data + vf32x8_t hb1; + vf32x8_t hb2; + vf32x8_t kb2; + + vf32x8_t ha1; + vf32x8_t ha2; + vf32x8_t ka2; + + vf32x8_t hb0; + vf32x8_t kb1; + + vf32x8_t ha0; + vf32x8_t ka1; + + vf32x8_t krl; + vf32x8_t klr; + + vf32x8_t kb0; + + vf32x8_t kv; + + // 32-bit data + float sample_rate; // [Hz] + float pseudo_c1; // [F] + float pseudo_c2; // [F] + + // 8-bit data + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; + uint8_t pad32_[2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#endif // _include_aymo_tda8425_x86_avx2_h diff --git a/include/aymo_tda8425_x86_sse41.h b/include/aymo_tda8425_x86_sse41.h new file mode 100644 index 0000000..8626811 --- /dev/null +++ b/include/aymo_tda8425_x86_sse41.h @@ -0,0 +1,106 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_tda8425_x86_sse41_h +#define _include_aymo_tda8425_x86_sse41_h + +#include "aymo_cpu.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_TDA8425_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_tda8425_x86_sse41_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // Vector data + vf32x4_t hb1l; + vf32x4_t hb1r; + vf32x4_t hb2l; + vf32x4_t hb2r; + vf32x4_t kb2; + + vf32x4_t ha1l; + vf32x4_t ha1r; + vf32x4_t ha2l; + vf32x4_t ha2r; + vf32x4_t ka2; + + vf32x4_t hb0l; + vf32x4_t hb0r; + vf32x4_t kb1; + + vf32x4_t ha0l; + vf32x4_t ha0r; + vf32x4_t ka1; + + vf32x4_t krl; + vf32x4_t klr; + + vf32x4_t kb0; + + vf32x4_t kv; + + // 32-bit data + float sample_rate; // [Hz] + float pseudo_c1; // [F] + float pseudo_c2; // [F] + + // 8-bit data + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; + uint8_t pad32_[2]; +}; + + +AYMO_PUBLIC const struct aymo_tda8425_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_tda8425_x86_sse41_h diff --git a/include/aymo_wave.h b/include/aymo_wave.h new file mode 100644 index 0000000..bd87fa7 --- /dev/null +++ b/include/aymo_wave.h @@ -0,0 +1,85 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_wave_h +#define _include_aymo_wave_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN +AYMO_PRAGMA_PACK_PUSH_1 + +#define AYMO_WAVE_FMT_TYPE_PCM 1u +#define AYMO_WAVE_FMT_TYPE_FLOAT 3u + +/* Basic WAVE heading part. 
*/ +struct aymo_wave_heading { + char riff_fourcc[4]; + uint32_t riff_size; + + char wave_fourcc[4]; + + char wave_fmt_fourcc[4]; + uint32_t wave_fmt_size; + uint16_t wave_fmt_type; + uint16_t wave_fmt_channel_count; + uint32_t wave_fmt_sample_rate; + uint32_t wave_fmt_byte_rate; + uint16_t wave_fmt_block_align; + uint16_t wave_fmt_sample_bits; + + char wave_data_fourcc[4]; + uint32_t wave_data_size; + +// sample_t wave_data_samples[...]; +}; + +AYMO_PRAGMA_PACK_POP +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +/* Basic setup of a WAVE heading part. + * + * Made for the common audio formats used with AYMO: + * - around 50 kHz sample rate + * - little-endian + * - 8/16/32-bit signed integers + * - 1/2/4 channel_count + * - up to a few minutes + * + * NOTE: Function arguments are not checked in depth! + * Please make sure they are valid! + */ +AYMO_PUBLIC void aymo_wave_heading_setup( + struct aymo_wave_heading* heading, + uint16_t wave_fmt_type, + uint16_t channel_count, + uint16_t sample_bits, + uint32_t sample_rate, + uint32_t sample_count +); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_wave_h diff --git a/include/aymo_ym7128.h b/include/aymo_ym7128.h new file mode 100644 index 0000000..c885ab1 --- /dev/null +++ b/include/aymo_ym7128.h @@ -0,0 +1,46 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_ym7128_h +#define _include_aymo_ym7128_h + +#include "aymo_ym7128_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +#define AYMO_YM7128_SAMPLE_RATE_IN 23550 // [Hz] +#define AYMO_YM7128_SAMPLE_RATE_OUT 47100 // [Hz] + + +AYMO_PUBLIC void aymo_ym7128_boot(void); +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_ym7128_get_vt(const char* cpu_ext); +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_ym7128_get_best_vt(void); + +AYMO_PUBLIC uint32_t aymo_ym7128_get_sizeof(struct aymo_ym7128_chip* chip); +AYMO_PUBLIC void aymo_ym7128_ctor(struct aymo_ym7128_chip* chip); +AYMO_PUBLIC void aymo_ym7128_dtor(struct aymo_ym7128_chip* chip); +AYMO_PUBLIC uint8_t aymo_ym7128_read(struct aymo_ym7128_chip* chip, uint16_t address); +AYMO_PUBLIC void aymo_ym7128_write(struct aymo_ym7128_chip* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_ym7128_process_i16(struct aymo_ym7128_chip* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ym7128_h diff --git a/include/aymo_ym7128_arm_neon.h b/include/aymo_ym7128_arm_neon.h new file mode 100644 index 0000000..70bc002 --- /dev/null +++ b/include/aymo_ym7128_arm_neon.h @@ -0,0 +1,93 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
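A short sketch of how the heading above is typically emitted ahead of the sample data. The file name is a placeholder, and sample_count is assumed to be the total number of int16_t samples (frames times channels); the exact convention should be checked against aymo_wave.c.

#include "aymo_wave.h"

#include <stdint.h>
#include <stdio.h>

static int example_write_wave(const int16_t samples[], uint32_t sample_count)
{
    struct aymo_wave_heading heading;
    aymo_wave_heading_setup(&heading, AYMO_WAVE_FMT_TYPE_PCM,
                            2u /* channels */, 16u /* bits */,
                            44100u /* Hz */, sample_count);

    FILE* file = fopen("example.wav", "wb");  // placeholder path
    if (file == NULL) {
        return -1;
    }
    fwrite(&heading, sizeof(heading), 1u, file);
    fwrite(samples, sizeof(int16_t), sample_count, file);
    fclose(file);
    return 0;
}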
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ +#ifndef _include_aymo_ym7128_arm_neon_h +#define _include_aymo_ym7128_arm_neon_h + +#include "aymo_cpu.h" +#include "aymo_ym7128_common.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YM7128_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_ym7128_arm_neon_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // 128-bit data + int16_t xxv[8]; + vi16x8_t kk1; + vi16x8_t kk2; + vi16x8_t kkm; + int16_t tiv[8]; + vi16x8_t kgl; + vi16x8_t kgr; + vi16x8_t kv; + + vi16x8_t zc; + vi16x8_t zb; + vi16x8_t kf; + vi16x8_t ke; + vi16x8_t za; + vi16x8_t kd; + vi16x8_t kc; + vi16x8_t kb; + vi16x8_t ka; + + // 16-bit data + int16_t uh[AYMO_YM7128_DELAY_LENGTH]; + + // 8-bit data + uint8_t regs[AYMO_YM7128_REG_COUNT]; + + uint8_t pad32_[3]; +}; + + +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_ym7128_arm_neon_h diff --git a/include/aymo_ym7128_common.h b/include/aymo_ym7128_common.h new file mode 100644 index 0000000..9a288b3 --- /dev/null +++ b/include/aymo_ym7128_common.h @@ -0,0 +1,118 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
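As a sizing cross-check on the uh[AYMO_YM7128_DELAY_LENGTH] buffer above: 2356 mono samples at the 23550 Hz input rate is roughly 100 ms of echo history (4712 bytes of int16_t storage per chip instance). A purely illustrative compile-time check, assuming the constants from aymo_ym7128.h and aymo_ym7128_common.h:

#include "aymo_ym7128.h"

/* 2356 samples / 23550 Hz ~= 0.100 s; the integer expression below evaluates to 100. */
typedef char aymo_example_delay_is_about_100ms
    [((AYMO_YM7128_DELAY_LENGTH * 1000) / AYMO_YM7128_SAMPLE_RATE_IN) == 100 ? 1 : -1];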
+*/
+#ifndef _include_aymo_ym7128_common_h
+#define _include_aymo_ym7128_common_h
+
+#include "aymo_cc.h"
+
+#include
+#include
+
+AYMO_CXX_EXTERN_C_BEGIN
+
+
+// Object-oriented API
+
+struct aymo_ym7128_chip;  // forward
+typedef uint32_t (*aymo_ym7128_get_sizeof_f)(void);
+typedef void (*aymo_ym7128_ctor_f)(struct aymo_ym7128_chip* chip);
+typedef void (*aymo_ym7128_dtor_f)(struct aymo_ym7128_chip* chip);
+typedef uint8_t (*aymo_ym7128_read_f)(struct aymo_ym7128_chip* chip, uint16_t address);
+typedef void (*aymo_ym7128_write_f)(struct aymo_ym7128_chip* chip, uint16_t address, uint8_t value);
+typedef void (*aymo_ym7128_process_i16_f)(struct aymo_ym7128_chip* chip, uint32_t count, const int16_t x[], int16_t y[]);
+
+struct aymo_ym7128_vt {
+    const char* class_name;
+    aymo_ym7128_get_sizeof_f get_sizeof;
+    aymo_ym7128_ctor_f ctor;
+    aymo_ym7128_dtor_f dtor;
+    aymo_ym7128_read_f read;
+    aymo_ym7128_write_f write;
+    aymo_ym7128_process_i16_f process_i16;
+};
+
+struct aymo_ym7128_chip {
+    const struct aymo_ym7128_vt* vt;
+};
+
+
+#define AYMO_YM7128_REG_COUNT      31
+#define AYMO_YM7128_GAIN_BITS      6
+#define AYMO_YM7128_GAIN_COUNT     64
+#define AYMO_YM7128_TAP_BITS       5
+#define AYMO_YM7128_TAP_COUNT      32
+#define AYMO_YM7128_COEFF_BITS     6
+#define AYMO_YM7128_KERNEL_LENGTH  19
+#define AYMO_YM7128_DELAY_LENGTH   2356
+#define AYMO_YM7128_GAIN_UNIT      0x7FFF
+#define AYMO_YM7128_GAIN_MASK      0xFFF0
+#define AYMO_YM7128_SIGNAL_BITS    14
+#define AYMO_YM7128_SIGNAL_MASK    0xFFFC
+
+
+enum aymo_ym7128_reg {
+    aymo_ym7128_reg_gl1 = 0,
+    aymo_ym7128_reg_gl2,
+    aymo_ym7128_reg_gl3,
+    aymo_ym7128_reg_gl4,
+    aymo_ym7128_reg_gl5,
+    aymo_ym7128_reg_gl6,
+    aymo_ym7128_reg_gl7,
+    aymo_ym7128_reg_gl8,
+
+    aymo_ym7128_reg_gr1,
+    aymo_ym7128_reg_gr2,
+    aymo_ym7128_reg_gr3,
+    aymo_ym7128_reg_gr4,
+    aymo_ym7128_reg_gr5,
+    aymo_ym7128_reg_gr6,
+    aymo_ym7128_reg_gr7,
+    aymo_ym7128_reg_gr8,
+
+    aymo_ym7128_reg_vm,
+    aymo_ym7128_reg_vc,
+
+    aymo_ym7128_reg_vl,
+    aymo_ym7128_reg_vr,
+
+    aymo_ym7128_reg_c0,
+    aymo_ym7128_reg_c1,
+
+    aymo_ym7128_reg_t0,
+    aymo_ym7128_reg_t1,
+    aymo_ym7128_reg_t2,
+    aymo_ym7128_reg_t3,
+    aymo_ym7128_reg_t4,
+    aymo_ym7128_reg_t5,
+    aymo_ym7128_reg_t6,
+    aymo_ym7128_reg_t7,
+    aymo_ym7128_reg_t8
+};
+
+
+AYMO_PUBLIC const int16_t aymo_ym7128_gain[AYMO_YM7128_GAIN_COUNT];
+AYMO_PUBLIC const int16_t aymo_ym7128_tap[AYMO_YM7128_TAP_COUNT];
+AYMO_PUBLIC const int16_t aymo_ym7128_kernel_linear[AYMO_YM7128_KERNEL_LENGTH];
+AYMO_PUBLIC const int16_t aymo_ym7128_kernel_minphase[AYMO_YM7128_KERNEL_LENGTH];
+
+
+AYMO_CXX_EXTERN_C_END
+
+#endif // _include_aymo_ym7128_common_h
diff --git a/include/aymo_ym7128_none.h b/include/aymo_ym7128_none.h
new file mode 100644
index 0000000..a9212ce
--- /dev/null
+++ b/include/aymo_ym7128_none.h
@@ -0,0 +1,61 @@
+/*
+AYMO - Accelerated YaMaha Operator
+Copyright (c) 2023-2024 Andrea Zoppi.
+
+This file is part of AYMO.
+
+AYMO is free software: you can redistribute it and/or modify it under the
+terms of the GNU Lesser General Public License as published by the Free
+Software Foundation, either version 2.1 of the License, or (at your option)
+any later version.
+
+AYMO is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with AYMO. If not, see .
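The vt structure above is what the dispatcher in aymo_ym7128.h returns from aymo_ym7128_get_vt() / aymo_ym7128_get_best_vt(). A plausible caller-side sketch follows; it assumes the constructor reached through the vt installs chip->vt itself, which is not visible in this header and should be treated as an assumption.

#include "aymo_ym7128.h"

#include <stdlib.h>

static struct aymo_ym7128_chip* example_ym7128_new(void)
{
    aymo_ym7128_boot();  // assumed one-time library / CPU-feature initialization
    const struct aymo_ym7128_vt* vt = aymo_ym7128_get_best_vt();
    if (vt == NULL) {
        return NULL;
    }
    struct aymo_ym7128_chip* chip = malloc(vt->get_sizeof());
    if (chip != NULL) {
        vt->ctor(chip);  // assumed to set chip->vt to the selected backend
    }
    return chip;
}

After that, the generic wrappers (aymo_ym7128_write(), aymo_ym7128_process_i16(), and so on) presumably forward through chip->vt.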
+*/ +#ifndef _include_aymo_ym7128_none_h +#define _include_aymo_ym7128_none_h + +#include "aymo_cpu.h" +#include "aymo_ym7128_common.h" + +#include "YM7128B_emu.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YM7128_NONE_##_token_ +#define aymo_(_token_) aymo_ym7128_none_##_token_ + + +struct aymo_(chip) { + YM7128B_ChipFixed emu; +}; + + +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ym7128_none_h diff --git a/include/aymo_ym7128_x86_sse41.h b/include/aymo_ym7128_x86_sse41.h new file mode 100644 index 0000000..236deba --- /dev/null +++ b/include/aymo_ym7128_x86_sse41.h @@ -0,0 +1,93 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
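The scalar backend above can also be driven directly, since its chip structure is a complete type. Register addresses come from enum aymo_ym7128_reg in aymo_ym7128_common.h; the value written below is an arbitrary placeholder, as the gain/tap encodings are not restated in this header.

#include "aymo_ym7128_none.h"
#include <stdint.h>

static void example_ym7128_none(const int16_t x[], int16_t y[], uint32_t count)
{
    struct aymo_ym7128_none_chip chip;
    aymo_ym7128_none_ctor(&chip);
    aymo_ym7128_none_write(&chip, (uint16_t)aymo_ym7128_reg_vm, 0x3Fu);  // placeholder value
    aymo_ym7128_none_process_i16(&chip, count, x, y);
    aymo_ym7128_none_dtor(&chip);
}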
+*/ +#ifndef _include_aymo_ym7128_x86_sse41_h +#define _include_aymo_ym7128_x86_sse41_h + +#include "aymo_cpu.h" +#include "aymo_ym7128_common.h" + +#include +#include + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YM7128_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_ym7128_x86_sse41_##_token_ + + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + // Vector data + int16_t xxv[8]; + vi16x8_t kk1; + vi16x8_t kk2; + vi16x8_t kkm; + int16_t tiv[8]; + vi16x8_t kgl; + vi16x8_t kgr; + vi16x8_t kv; + + vi16x8_t zc; + vi16x8_t zb; + vi16x8_t kf; + vi16x8_t ke; + vi16x8_t za; + vi16x8_t kd; + vi16x8_t kc; + vi16x8_t kb; + vi16x8_t ka; + + // 16-bit data + int16_t uh[AYMO_YM7128_DELAY_LENGTH]; + + // 8-bit data + uint8_t regs[AYMO_YM7128_REG_COUNT]; + + uint8_t pad32_[3]; +}; + + +AYMO_PUBLIC const struct aymo_ym7128_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_ym7128_x86_sse41_h diff --git a/include/aymo_ymf262.h b/include/aymo_ymf262.h new file mode 100644 index 0000000..1d46004 --- /dev/null +++ b/include/aymo_ymf262.h @@ -0,0 +1,56 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/
+#ifndef _include_aymo_ymf262_h
+#define _include_aymo_ymf262_h
+
+#include "aymo_ymf262_common.h"
+
+AYMO_CXX_EXTERN_C_BEGIN
+
+
+#define AYMO_YMF262_SLOT_NUM      36
+#define AYMO_YMF262_CHANNEL_NUM   18
+#define AYMO_YMF262_CONN_NUM_MAX  6
+
+#define AYMO_YMF262_SAMPLE_RATE   49716  // [Hz]
+
+
+AYMO_PUBLIC void aymo_ymf262_boot(void);
+AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_ymf262_get_vt(const char* cpu_ext);
+AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_ymf262_get_best_vt(void);
+
+AYMO_PUBLIC uint32_t aymo_ymf262_get_sizeof(struct aymo_ymf262_chip* chip);
+AYMO_PUBLIC void aymo_ymf262_ctor(struct aymo_ymf262_chip* chip);
+AYMO_PUBLIC void aymo_ymf262_dtor(struct aymo_ymf262_chip* chip);
+AYMO_PUBLIC uint8_t aymo_ymf262_read(struct aymo_ymf262_chip* chip, uint16_t address);
+AYMO_PUBLIC void aymo_ymf262_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value);
+AYMO_PUBLIC int aymo_ymf262_enqueue_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value);
+AYMO_PUBLIC int aymo_ymf262_enqueue_delay(struct aymo_ymf262_chip* chip, uint32_t count);
+AYMO_PUBLIC int16_t aymo_ymf262_get_output(struct aymo_ymf262_chip* chip, uint8_t channel);
+AYMO_PUBLIC void aymo_ymf262_tick(struct aymo_ymf262_chip* chip, uint32_t count);
+AYMO_PUBLIC void aymo_ymf262_generate_i16x2(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]);
+AYMO_PUBLIC void aymo_ymf262_generate_i16x4(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]);
+AYMO_PUBLIC void aymo_ymf262_generate_f32x2(struct aymo_ymf262_chip* chip, uint32_t count, float y[]);
+AYMO_PUBLIC void aymo_ymf262_generate_f32x4(struct aymo_ymf262_chip* chip, uint32_t count, float y[]);
+
+
+AYMO_CXX_EXTERN_C_END
+
+#endif // _include_aymo_ymf262_h
diff --git a/include/aymo_ymf262_arm_neon.h b/include/aymo_ymf262_arm_neon.h
new file mode 100644
index 0000000..b7a6116
--- /dev/null
+++ b/include/aymo_ymf262_arm_neon.h
@@ -0,0 +1,333 @@
+/*
+AYMO - Accelerated YaMaha Operator
+Copyright (c) 2023-2024 Andrea Zoppi.
+
+This file is part of AYMO.
+
+AYMO is free software: you can redistribute it and/or modify it under the
+terms of the GNU Lesser General Public License as published by the Free
+Software Foundation, either version 2.1 of the License, or (at your option)
+any later version.
+
+AYMO is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with AYMO. If not, see .
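A usage sketch for the front-end API above. The buffer sizing and the register write reflect common OPL3 conventions rather than anything stated in this header: generate_i16x2 is read as emitting two interleaved int16_t samples per frame at AYMO_YMF262_SAMPLE_RATE (about 49.7 kHz), and address 0x105 / value 0x01 sets the NEW bit (OPL3 mode) per the reg_105h layout in aymo_ymf262_common.h. The assumption that vt->ctor() installs chip->vt for the generic wrappers is likewise not visible here.

#include "aymo_ymf262.h"

#include <stdlib.h>

static void example_ymf262_render(int16_t stereo[], uint32_t frames)
{
    aymo_ymf262_boot();
    const struct aymo_ymf262_vt* vt = aymo_ymf262_get_best_vt();
    struct aymo_ymf262_chip* chip = (vt != NULL) ? malloc(vt->get_sizeof()) : NULL;
    if (chip == NULL) {
        return;
    }
    vt->ctor(chip);  // assumed to install chip->vt for the generic wrappers

    aymo_ymf262_enqueue_write(chip, 0x0105u, 0x01u);   // NEW = 1: enable OPL3 mode
    aymo_ymf262_generate_i16x2(chip, frames, stereo);  // needs 2 * frames int16_t samples

    aymo_ymf262_dtor(chip);
    free(chip);
}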
+*/ +#ifndef _include_aymo_ymf262_arm_neon_h +#define _include_aymo_ymf262_arm_neon_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_ARM_NEON_##_token_ +#define aymo_(_token_) aymo_ymf262_arm_neon_##_token_ + + +#define AYMO_YMF262_ARM_NEON_SLOT_NUM_MAX 64 +#define AYMO_YMF262_ARM_NEON_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_ARM_NEON_SLOT_GROUP_NUM 8 +#define AYMO_YMF262_ARM_NEON_SLOT_GROUP_LENGTH 8 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_shl; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_ARM_NEON_REG_QUEUE_LENGTH +#define AYMO_YMF262_ARM_NEON_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_ARM_NEON_REG_QUEUE_LATENCY +#define AYMO_YMF262_ARM_NEON_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_ARM_NEON_EG_GEN_ATTACK 0 +#define AYMO_YMF262_ARM_NEON_EG_GEN_DECAY 1 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_ARM_NEON_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_ATTACK 0 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_DECAY 4 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_SUSTAIN 8 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SHL_RELEASE 12 +#define AYMO_YMF262_ARM_NEON_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_ARM_NEON_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_ARM_NEON_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x8_t eg_rout; + vi16x8_t eg_tremolo_am; + vi16x8_t eg_ksl_sh_tl_x4; + vi32x4_t pg_phase_lo; + vi32x4_t pg_phase_hi; + vi16x8_t pg_phase_out; + vi16x8_t eg_gen; + vi16x8_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x8_t eg_gen_shl; // depends on reg_type for reg_sr + vi16x8_t eg_adsr; // struct aymo_(eg_adsr) + vi16x8_t eg_ks; + vi32x4_t pg_deltafreq_lo; + vi32x4_t pg_deltafreq_hi; + vi16x8_t wg_out; + vi16x8_t wg_prout; + vi16x8_t wg_fb_shs; // signed + vi16x8_t wg_prmod_gate; + vi16x8_t wg_fbmod_gate; + vi16x8_t wg_phase_shl; + vi16x8_t wg_phase_zero; + vi16x8_t wg_phase_flip; + vi16x8_t wg_phase_mask; + vi16x8_t wg_sine_gate; + vi16x8_t eg_out; + vi16x8_t wg_phase_neg; + vi16x8_t eg_sl; + vi16x8_t og_prout; + vi16x8_t og_prout_ac; + vi16x8_t og_prout_bd; + vi16x8_t og_out_ch_gate_a; + vi16x8_t og_out_ch_gate_c; + vi16x8_t og_out_ch_gate_b; + vi16x8_t og_out_ch_gate_d; + + // Updated infrequently + vi16x8_t pg_vib; + vi16x8_t pg_mult_x2; + + // Updated only by writing registers + vi16x8_t eg_am; + vi16x8_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x8_t eg_ksl; + vi16x8_t eg_rate; + vi16x8_t eg_inc; + vi16x8_t wg_fbmod; + 
vi16x8_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x8_t pg_fnum; + vi16x8_t pg_block; + + // Updated only by writing registers + vi16x8_t eg_ksv; + + vi16x8_t og_ch_gate_a; + vi16x8_t og_ch_gate_b; + vi16x8_t og_ch_gate_c; + vi16x8_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + + // 128-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x8_t eg_add; + vi16x8_t wg_mod; + vi16x8_t eg_incstep; + vi16x8_t og_acc_a; + vi16x8_t og_acc_c; + vi16x8_t og_acc_b; + vi16x8_t og_acc_d; + vi16x4_t og_out; // coupled 64-bit variables + vi16x4_t og_old; // coupled 64-bit variables + + vi16x8_t pg_vib_shs; // signed + vi16x8_t pg_vib_sign; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t process_all_slots; + uint8_t pad32_[1]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[8]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[8]; + +AYMO_PUBLIC const int16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, 
int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ +// return (((sgi / 4) * 2) | (sgi % 2)); + return (((sgi >> 1) & 2) | (sgi & 1)); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON + +#endif // _include_aymo_ymf262_arm_neon_h diff --git a/include/aymo_ymf262_common.h b/include/aymo_ymf262_common.h new file mode 100644 index 0000000..3cd10b0 --- /dev/null +++ b/include/aymo_ymf262_common.h @@ -0,0 +1,230 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
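To make the slot-group to channel-group mapping of aymo_ymf262_arm_neon_sgi_to_cgi() (defined a little above) concrete, the bit expression produces the table below for the 8 slot groups; the loop is illustrative only.

#include <stdio.h>

/* sgi : 0 1 2 3 4 5 6 7
 * cgi : 0 1 0 1 2 3 2 3
 * i.e. slot groups {0,2} feed channel group 0, {1,3} -> 1, {4,6} -> 2, {5,7} -> 3,
 * matching the commented-out form ((sgi / 4) * 2) | (sgi % 2).
 */
static void example_print_sgi_to_cgi(void)
{
    for (int sgi = 0; sgi < 8; ++sgi) {
        int cgi = (((sgi >> 1) & 2) | (sgi & 1));
        printf("sgi %d -> cgi %d\n", sgi, cgi);
    }
}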
+*/ +#ifndef _include_aymo_ymf262_common_h +#define _include_aymo_ymf262_common_h + +#include "aymo_cc.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +// Object-oriented API + +struct aymo_ymf262_chip; // forward +typedef uint32_t (*aymo_ymf262_get_sizeof_f)(void); +typedef void (*aymo_ymf262_ctor_f)(struct aymo_ymf262_chip* chip); +typedef void (*aymo_ymf262_dtor_f)(struct aymo_ymf262_chip* chip); +typedef uint8_t (*aymo_ymf262_read_f)(struct aymo_ymf262_chip* chip, uint16_t address); +typedef void (*aymo_ymf262_write_f)(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value); +typedef int (*aymo_ymf262_enqueue_write_f)(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value); +typedef int (*aymo_ymf262_enqueue_delay_f)(struct aymo_ymf262_chip* chip, uint32_t count); +typedef int16_t (*aymo_ymf262_get_output_f)(struct aymo_ymf262_chip* chip, uint8_t channel); +typedef void (*aymo_ymf262_tick_f)(struct aymo_ymf262_chip* chip, uint32_t count); +typedef void (*aymo_ymf262_generate_i16x2_f)(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]); +typedef void (*aymo_ymf262_generate_i16x4_f)(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]); +typedef void (*aymo_ymf262_generate_f32x2_f)(struct aymo_ymf262_chip* chip, uint32_t count, float y[]); +typedef void (*aymo_ymf262_generate_f32x4_f)(struct aymo_ymf262_chip* chip, uint32_t count, float y[]); + +struct aymo_ymf262_vt { + const char* class_name; + aymo_ymf262_get_sizeof_f get_sizeof; + aymo_ymf262_ctor_f ctor; + aymo_ymf262_dtor_f dtor; + aymo_ymf262_read_f read; + aymo_ymf262_write_f write; + aymo_ymf262_enqueue_write_f enqueue_write; + aymo_ymf262_enqueue_delay_f enqueue_delay; + aymo_ymf262_get_output_f get_output; + aymo_ymf262_tick_f tick; + aymo_ymf262_generate_i16x2_f generate_i16x2; + aymo_ymf262_generate_i16x4_f generate_i16x4; + aymo_ymf262_generate_f32x2_f generate_f32x2; + aymo_ymf262_generate_f32x4_f generate_f32x4; +}; + +struct aymo_ymf262_chip { + const struct aymo_ymf262_vt* vt; +}; + + +// Limits +#define AYMO_YMF262_SLOT_NUM_MAX 64 +#define AYMO_YMF262_CHANNEL_NUM_MAX 32 + + +// Registers; little-endian bitfields +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +AYMO_PRAGMA_PACK_PUSH_1 + + +struct aymo_ymf262_reg_01h { + uint8_t lsitest_lo : 8; +}; +struct aymo_ymf262_reg_101h { + uint8_t lsitest_hi : 6; + uint8_t _7_6 : 2; +}; +struct aymo_ymf262_reg_02h { + uint8_t timer1 : 8; +}; +struct aymo_ymf262_reg_03h { + uint8_t timer2 : 8; +}; +struct aymo_ymf262_reg_04h { + uint8_t st1 : 1; + uint8_t st2 : 1; + uint8_t _4_2 : 3; + uint8_t mt2 : 1; + uint8_t mt1 : 1; + uint8_t rst : 1; +}; +struct aymo_ymf262_reg_104h { + uint8_t conn : 6; + uint8_t _7_6 : 2; +}; +struct aymo_ymf262_reg_105h { + uint8_t newm : 1; + uint8_t stereo : 1; + uint8_t _7_2 : 6; +}; +struct aymo_ymf262_reg_08h { + uint8_t _5_0 : 6; + uint8_t nts : 1; + uint8_t csm : 1; +}; +struct aymo_ymf262_reg_20h { + uint8_t mult : 4; + uint8_t ksr : 1; + uint8_t egt : 1; + uint8_t vib : 1; + uint8_t am : 1; +}; +struct aymo_ymf262_reg_40h { + uint8_t tl : 6; + uint8_t ksl : 2; +}; +struct aymo_ymf262_reg_60h { + uint8_t dr : 4; + uint8_t ar : 4; +}; +struct aymo_ymf262_reg_80h { + uint8_t rr : 4; + uint8_t sl : 4; +}; +struct aymo_ymf262_reg_A0h { + uint8_t fnum_lo : 8; +}; +struct aymo_ymf262_reg_B0h { + uint8_t fnum_hi : 2; + uint8_t block : 3; + uint8_t kon : 1; + uint8_t _7_6 : 2; +}; +struct aymo_ymf262_reg_BDh { + uint8_t hh : 1; + uint8_t tc : 1; + uint8_t tom : 1; + uint8_t sd : 1; + uint8_t bd : 1; + uint8_t ryt : 1; + 
uint8_t dvb : 1; + uint8_t dam : 1; +}; +struct aymo_ymf262_reg_C0h { + uint8_t cnt : 1; + uint8_t fb : 3; + uint8_t cha : 1; + uint8_t chb : 1; + uint8_t chc : 1; + uint8_t chd : 1; +}; +struct aymo_ymf262_reg_E0h { + uint8_t ws : 3; + uint8_t _7_3 : 5; +}; + +struct aymo_ymf262_chip_regs { + struct aymo_ymf262_reg_01h reg_01h; + struct aymo_ymf262_reg_02h reg_02h; + struct aymo_ymf262_reg_03h reg_03h; + struct aymo_ymf262_reg_04h reg_04h; + struct aymo_ymf262_reg_08h reg_08h; + struct aymo_ymf262_reg_BDh reg_BDh; + struct aymo_ymf262_reg_101h reg_101h; + struct aymo_ymf262_reg_104h reg_104h; + struct aymo_ymf262_reg_105h reg_105h; + uint8_t _pad32[3]; +}; + +struct aymo_ymf262_slot_regs { + struct aymo_ymf262_reg_20h reg_20h; + struct aymo_ymf262_reg_40h reg_40h; + struct aymo_ymf262_reg_60h reg_60h; + struct aymo_ymf262_reg_80h reg_80h; + struct aymo_ymf262_reg_E0h reg_E0h; + uint8_t _pad32[3]; +}; + +struct aymo_ymf262_chan_regs { + struct aymo_ymf262_reg_A0h reg_A0h; + struct aymo_ymf262_reg_B0h reg_B0h; + struct aymo_ymf262_reg_C0h reg_C0h; + struct aymo_ymf262_reg_C0h reg_D0h; +}; + + +// Packed ADSR register values +struct aymo_ymf262_adsr { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; + + +AYMO_PRAGMA_PACK_POP + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int16_t aymo_ymf262_exp_x2_table[256 + 4]; +AYMO_PUBLIC const int16_t aymo_ymf262_logsin_table[256 + 4]; + +AYMO_PUBLIC const int8_t aymo_ymf262_word_to_slot[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_slot_to_word[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_word_to_ch2x[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch2x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 2][2/* slot */]; +AYMO_PUBLIC const int8_t aymo_ymf262_word_to_ch4x[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch4x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 4][4/* slot */]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch4x_to_pair[AYMO_YMF262_CHANNEL_NUM_MAX / 2][2/* slot */]; +AYMO_PUBLIC const int8_t aymo_ymf262_ch2x_paired[AYMO_YMF262_CHANNEL_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_subaddr_to_slot[AYMO_YMF262_SLOT_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_subaddr_to_ch2x[AYMO_YMF262_CHANNEL_NUM_MAX]; +AYMO_PUBLIC const int8_t aymo_ymf262_pg_mult_x2_table[16]; +AYMO_PUBLIC const int8_t aymo_ymf262_eg_ksl_table[16]; +AYMO_PUBLIC const int8_t aymo_ymf262_eg_kslsh_table[4]; + + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ymf262_common_h diff --git a/include/aymo_ymf262_none.h b/include/aymo_ymf262_none.h new file mode 100644 index 0000000..e2e7f10 --- /dev/null +++ b/include/aymo_ymf262_none.h @@ -0,0 +1,79 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
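Because the register structs above are declared under the little-endian scalar-storage-order and pack(1) pragmas, a raw register byte can be viewed through them directly. A small decoding sketch for B0h (F-Number high bits, block, key-on), using memcpy to sidestep aliasing questions:

#include "aymo_ymf262_common.h"

#include <string.h>

static void example_decode_b0h(uint8_t raw)
{
    struct aymo_ymf262_reg_B0h b0h;
    memcpy(&b0h, &raw, sizeof(b0h));  // the packed bitfields make this struct 1 byte wide

    unsigned fnum_hi = b0h.fnum_hi;   // bits 1..0: F-Number bits 9..8
    unsigned block   = b0h.block;     // bits 4..2: octave block
    unsigned kon     = b0h.kon;       // bit 5: key-on
    (void)fnum_hi; (void)block; (void)kon;
}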
+*/ +#ifndef _include_aymo_ymf262_none_h +#define _include_aymo_ymf262_none_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include "opl3.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_NONE_##_token_ +#define aymo_(_token_) aymo_ymf262_none_##_token_ + + +#define AYMO_YMF262_NONE_SLOT_NUM_MAX AYMO_YMF262_SLOT_NUM +#define AYMO_YMF262_NONE_CHANNEL_NUM_MAX AYMO_YMF262_CHANNEL_NUM +#define AYMO_YMF262_NONE_SLOT_GROUP_NUM AYMO_YMF262_SLOT_NUM +#define AYMO_YMF262_NONE_SLOT_GROUP_LENGTH 1 + + +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + int16_t outs[4]; + opl3_chip opl3; +}; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // _include_aymo_ymf262_none_h diff --git a/include/aymo_ymf262_x86_avx.h b/include/aymo_ymf262_x86_avx.h new file mode 100644 index 0000000..b808f5c --- /dev/null +++ b/include/aymo_ymf262_x86_avx.h @@ -0,0 +1,333 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_ymf262_x86_avx_h +#define _include_aymo_ymf262_x86_avx_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_X86_AVX + +AYMO_CXX_EXTERN_C_BEGIN + + +// YMF262 via x86 AVX is actually the SSE4.1 code compiled with VEX prefix +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_X86_AVX_##_token_ +#define aymo_(_token_) aymo_ymf262_x86_avx_##_token_ + + +#define AYMO_YMF262_X86_AVX_SLOT_NUM_MAX 64 +#define AYMO_YMF262_X86_AVX_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_X86_AVX_SLOT_GROUP_NUM 8 +#define AYMO_YMF262_X86_AVX_SLOT_GROUP_LENGTH 8 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_mullo; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_X86_AVX_REG_QUEUE_LENGTH +#define AYMO_YMF262_X86_AVX_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_X86_AVX_REG_QUEUE_LATENCY +#define AYMO_YMF262_X86_AVX_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_X86_AVX_EG_GEN_ATTACK 0 +#define AYMO_YMF262_X86_AVX_EG_GEN_DECAY 1 +#define AYMO_YMF262_X86_AVX_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_X86_AVX_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_ATTACK (1 << 0) +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_DECAY (1 << 4) +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_SUSTAIN (1 << 8) +#define AYMO_YMF262_X86_AVX_EG_GEN_MULLO_RELEASE (1 << 12) +#define AYMO_YMF262_X86_AVX_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_X86_AVX_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_X86_AVX_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x8_t eg_rout; + vi16x8_t eg_tremolo_am; + vi16x8_t eg_ksl_sh_tl_x4; + vi32x4_t pg_phase_lo; + vi32x4_t pg_phase_hi; + vi16x8_t pg_phase_out; + vi16x8_t eg_gen; + vi16x8_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x8_t eg_gen_mullo; // depends on reg_type for reg_sr + vi16x8_t eg_adsr; // struct aymo_(eg_adsr) + vi16x8_t eg_ks; + vi32x4_t pg_deltafreq_lo; + vi32x4_t pg_deltafreq_hi; + vi16x8_t wg_out; + vi16x8_t wg_prout; + vi16x8_t wg_fb_mulhi; + vi16x8_t wg_prmod_gate; + vi16x8_t wg_fbmod_gate; + vi16x8_t wg_phase_mullo; + vi16x8_t wg_phase_zero; + vi16x8_t wg_phase_flip; + vi16x8_t wg_phase_mask; + vi16x8_t wg_sine_gate; + vi16x8_t eg_out; + vi16x8_t wg_phase_neg; + vi16x8_t eg_sl; + vi16x8_t og_prout; + vi16x8_t og_prout_ac; + vi16x8_t og_prout_bd; + vi16x8_t og_out_ch_gate_a; + vi16x8_t og_out_ch_gate_c; + vi16x8_t og_out_ch_gate_b; + vi16x8_t og_out_ch_gate_d; + + // Updated infrequently + vi16x8_t pg_vib; + vi16x8_t pg_mult_x2; + + // Updated only by writing registers + 
vi16x8_t eg_am; + vi16x8_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x8_t eg_ksl; + vi16x8_t eg_rate; + vi16x8_t eg_inc; + vi16x8_t wg_fbmod; + vi16x8_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x8_t pg_fnum; + vi16x8_t pg_block; + + // Updated only by writing registers + vi16x8_t eg_ksv; + + vi16x8_t og_ch_gate_a; + vi16x8_t og_ch_gate_b; + vi16x8_t og_ch_gate_c; + vi16x8_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + + // 128-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x8_t eg_add; + vi16x8_t wg_mod; + vu16x8_t eg_incstep; + vi16x8_t og_acc_a; + vi16x8_t og_acc_c; + vi16x8_t og_acc_b; + vi16x8_t og_acc_d; + vi16x8_t og_out; + + vi16x8_t pg_vib_mulhi; + vi16x8_t pg_vib_neg; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t process_all_slots; + uint8_t pad32_[1]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[8]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[8]; + +AYMO_PUBLIC const uint16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); 
+AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ +// return (((sgi / 4) * 2) | (sgi % 2)); + return (((sgi >> 1) & 2) | (sgi & 1)); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX + +#endif // _include_aymo_ymf262_x86_avx_h diff --git a/include/aymo_ymf262_x86_avx2.h b/include/aymo_ymf262_x86_avx2.h new file mode 100644 index 0000000..98afc70 --- /dev/null +++ b/include/aymo_ymf262_x86_avx2.h @@ -0,0 +1,332 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_ymf262_x86_avx2_h +#define _include_aymo_ymf262_x86_avx2_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_X86_AVX2_##_token_ +#define aymo_(_token_) aymo_ymf262_x86_avx2_##_token_ + + +#define AYMO_YMF262_X86_AVX2_SLOT_NUM_MAX 64 +#define AYMO_YMF262_X86_AVX2_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_X86_AVX2_SLOT_GROUP_NUM 4 +#define AYMO_YMF262_X86_AVX2_SLOT_GROUP_LENGTH 16 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_mullo; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_X86_AVX2_REG_QUEUE_LENGTH +#define AYMO_YMF262_X86_AVX2_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_X86_AVX2_REG_QUEUE_LATENCY +#define AYMO_YMF262_X86_AVX2_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_X86_AVX2_EG_GEN_ATTACK 0 +#define AYMO_YMF262_X86_AVX2_EG_GEN_DECAY 1 +#define AYMO_YMF262_X86_AVX2_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_X86_AVX2_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_ATTACK (1 << 0) +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_DECAY (1 << 4) +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_SUSTAIN (1 << 8) +#define AYMO_YMF262_X86_AVX2_EG_GEN_MULLO_RELEASE (1 << 12) +#define AYMO_YMF262_X86_AVX2_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_X86_AVX2_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_X86_AVX2_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V256 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x16_t eg_rout; + vi16x16_t eg_tremolo_am; + vi16x16_t eg_ksl_sh_tl_x4; + vi32x8_t pg_phase_lo; + vi32x8_t pg_phase_hi; + vi16x16_t pg_phase_out; + vi16x16_t eg_gen; + vi16x16_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x16_t eg_gen_mullo; // depends on reg_type for reg_sr + vi16x16_t eg_adsr; // struct aymo_(eg_adsr) + vi16x16_t eg_ks; + vi32x8_t pg_deltafreq_lo; + vi32x8_t pg_deltafreq_hi; + vi16x16_t wg_out; + vi16x16_t wg_prout; + vi16x16_t wg_fb_mulhi; + vi16x16_t wg_prmod_gate; + vi16x16_t wg_fbmod_gate; + vi16x16_t wg_phase_mullo; + vi16x16_t wg_phase_zero; + vi16x16_t wg_phase_flip; + vi16x16_t wg_phase_mask; + vi16x16_t wg_sine_gate; + vi16x16_t eg_out; + vi16x16_t wg_phase_neg; + vi16x16_t eg_sl; + vi16x16_t og_prout; + vi16x16_t og_prout_ac; + vi16x16_t og_prout_bd; + vi16x16_t og_out_ch_gate_a; + vi16x16_t og_out_ch_gate_c; + vi16x16_t og_out_ch_gate_b; + vi16x16_t og_out_ch_gate_d; + + // Updated infrequently + vi16x16_t pg_vib; + vi16x16_t pg_mult_x2; + + // Updated only by writing registers + vi16x16_t eg_am; + vi16x16_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x16_t 
eg_ksl; + vi16x16_t eg_rate; + vi16x16_t eg_inc; + vi16x16_t wg_fbmod; + vi16x16_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V256 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x16_t pg_fnum; + vi16x16_t pg_block; + + // Updated only by writing registers + vi16x16_t eg_ksv; + vi16x16_t og_ch_gate_a; + vi16x16_t og_ch_gate_b; + vi16x16_t og_ch_gate_c; + vi16x16_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V256 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + uint8_t align32_[sizeof(vi16x16_t) - sizeof(struct aymo_ymf262_chip)]; + + // 256-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x16_t eg_add; + vi16x16_t wg_mod; + vu16x16_t eg_incstep; + vi16x16_t og_acc_a; + vi16x16_t og_acc_c; + vi16x16_t og_acc_b; + vi16x16_t og_acc_d; + + vi16x16_t pg_vib_mulhi; + vi16x16_t pg_vib_neg; + + // 128-bit data + vi16x8_t og_out; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t pad32_[2]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[16]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[16]; + +AYMO_PUBLIC const uint16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint16_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint16_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); 
+AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ + return (sgi / 2); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#endif // _include_aymo_ymf262_x86_avx2_h diff --git a/include/aymo_ymf262_x86_sse41.h b/include/aymo_ymf262_x86_sse41.h new file mode 100644 index 0000000..b9814ee --- /dev/null +++ b/include/aymo_ymf262_x86_sse41.h @@ -0,0 +1,332 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ +#ifndef _include_aymo_ymf262_x86_sse41_h +#define _include_aymo_ymf262_x86_sse41_h + +#include "aymo_cpu.h" +#include "aymo_ymf262_common.h" + +#include + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +#undef AYMO_ +#undef aymo_ +#define AYMO_(_token_) AYMO_YMF262_X86_SSE41_##_token_ +#define aymo_(_token_) aymo_ymf262_x86_sse41_##_token_ + + +#define AYMO_YMF262_X86_SSE41_SLOT_NUM_MAX 64 +#define AYMO_YMF262_X86_SSE41_CHANNEL_NUM_MAX 32 +#define AYMO_YMF262_X86_SSE41_SLOT_GROUP_NUM 8 +#define AYMO_YMF262_X86_SSE41_SLOT_GROUP_LENGTH 8 + + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_LITTLE_ENDIAN + +// Wave descriptor for single slot +struct aymo_(wave) { + int16_t wg_phase_mullo; + int16_t wg_phase_zero; + int16_t wg_phase_neg; + int16_t wg_phase_flip; + int16_t wg_phase_mask; + int16_t wg_sine_gate; +}; + +// Waveform enumerator +enum aymo_(wf) { + aymo_(wf_sin) = 0, + aymo_(wf_sinup), + aymo_(wf_sinabs), + aymo_(wf_sinabsqrt), + aymo_(wf_sinfast), + aymo_(wf_sinabsfast), + aymo_(wf_square), + aymo_(wf_log) +}; + + +// Connection descriptor for a single slot +struct aymo_(conn) { + int16_t wg_fbmod_gate; + int16_t wg_prmod_gate; + int16_t og_out_gate; +}; + + +// TODO: move reg queue outside YMF262 +#ifndef AYMO_YMF262_X86_SSE41_REG_QUEUE_LENGTH +#define AYMO_YMF262_X86_SSE41_REG_QUEUE_LENGTH 256 +#endif +#ifndef AYMO_YMF262_X86_SSE41_REG_QUEUE_LATENCY +#define AYMO_YMF262_X86_SSE41_REG_QUEUE_LATENCY 2 +#endif + +struct aymo_(reg_queue_item) { + uint16_t address; + uint8_t value; +}; + + +#define AYMO_YMF262_X86_SSE41_EG_GEN_ATTACK 0 +#define AYMO_YMF262_X86_SSE41_EG_GEN_DECAY 1 +#define AYMO_YMF262_X86_SSE41_EG_GEN_SUSTAIN 2 +#define AYMO_YMF262_X86_SSE41_EG_GEN_RELEASE 3 + +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_ATTACK (1 << 0) +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_DECAY (1 << 4) +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_SUSTAIN (1 << 8) +#define AYMO_YMF262_X86_SSE41_EG_GEN_MULLO_RELEASE (1 << 12) +#define AYMO_YMF262_X86_SSE41_EG_GEN_SRLHI 10 + +#define AYMO_YMF262_X86_SSE41_EG_KEY_NORMAL (1 << 0) +#define AYMO_YMF262_X86_SSE41_EG_KEY_DRUM (1 << 8) + +// Packed ADSR register values +AYMO_PRAGMA_PACK_PUSH_1 +struct aymo_(eg_adsr) { + uint16_t rr : 4; + uint16_t sr : 4; + uint16_t dr : 4; + uint16_t ar : 4; +}; +AYMO_PRAGMA_PACK_POP + + +// Slot SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(slot_group) { + // Updated each sample cycle + vi16x8_t eg_rout; + vi16x8_t eg_tremolo_am; + vi16x8_t eg_ksl_sh_tl_x4; + vi32x4_t pg_phase_lo; + vi32x4_t pg_phase_hi; + vi16x8_t pg_phase_out; + vi16x8_t eg_gen; + vi16x8_t eg_key; // bit 8 = drum, bit 0 = normal + vi16x8_t eg_gen_mullo; // depends on reg_type for reg_sr + vi16x8_t eg_adsr; // struct aymo_(eg_adsr) + vi16x8_t eg_ks; + vi32x4_t pg_deltafreq_lo; + vi32x4_t pg_deltafreq_hi; + vi16x8_t wg_out; + vi16x8_t wg_prout; + vi16x8_t wg_fb_mulhi; + vi16x8_t wg_prmod_gate; + vi16x8_t wg_fbmod_gate; + vi16x8_t wg_phase_mullo; + vi16x8_t wg_phase_zero; + vi16x8_t wg_phase_flip; + vi16x8_t wg_phase_mask; + vi16x8_t wg_sine_gate; + vi16x8_t eg_out; + vi16x8_t wg_phase_neg; + vi16x8_t eg_sl; + vi16x8_t og_prout; + vi16x8_t og_prout_ac; + vi16x8_t og_prout_bd; + vi16x8_t og_out_ch_gate_a; + vi16x8_t og_out_ch_gate_c; + vi16x8_t og_out_ch_gate_b; + vi16x8_t og_out_ch_gate_d; + + // Updated infrequently + vi16x8_t pg_vib; + vi16x8_t pg_mult_x2; + + // Updated only by writing registers + vi16x8_t eg_am; + vi16x8_t og_out_gate; + +#ifdef AYMO_DEBUG + // Variables for debug + vi16x8_t eg_ksl; + 
vi16x8_t eg_rate; + vi16x8_t eg_inc; + vi16x8_t wg_fbmod; + vi16x8_t wg_mod; +#endif // AYMO_DEBUG +}; + +// Channel_2xOP SIMD group status +// Processing order (kinda) +AYMO_ALIGN_V128 +struct aymo_(ch2x_group) { + // Updated infrequently + vi16x8_t pg_fnum; + vi16x8_t pg_block; + + // Updated only by writing registers + vi16x8_t eg_ksv; + + vi16x8_t og_ch_gate_a; + vi16x8_t og_ch_gate_b; + vi16x8_t og_ch_gate_c; + vi16x8_t og_ch_gate_d; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +// Chip SIMD and scalar status data +// Processing order (kinda), size/alignment order +AYMO_ALIGN_V128 +struct aymo_(chip) { + struct aymo_ymf262_chip parent; + + // 128-bit data + struct aymo_(slot_group) sg[AYMO_(SLOT_GROUP_NUM)]; + struct aymo_(ch2x_group) cg[AYMO_(SLOT_GROUP_NUM) / 2]; + + vi16x8_t eg_add; + vi16x8_t wg_mod; + vu16x8_t eg_incstep; + vi16x8_t og_acc_a; + vi16x8_t og_acc_c; + vi16x8_t og_acc_b; + vi16x8_t og_acc_d; + vi16x8_t og_out; + + vi16x8_t pg_vib_mulhi; + vi16x8_t pg_vib_neg; + + // 64-bit data + uint64_t eg_timer; + uint64_t tm_timer; + + // 32-bit data + uint32_t rq_delay; + uint32_t og_ch2x_pairing; + uint32_t og_ch2x_drum; + uint32_t ng_noise; + + // 16-bit data + uint16_t rq_head; + uint16_t rq_tail; + + // 8-bit data + uint8_t eg_state; + uint8_t eg_timerrem; + uint8_t rm_hh_bit2; + uint8_t rm_hh_bit3; + uint8_t rm_hh_bit7; + uint8_t rm_hh_bit8; + uint8_t rm_tc_bit3; + uint8_t rm_tc_bit5; + uint8_t eg_tremolopos; + uint8_t eg_tremoloshift; + uint8_t eg_vibshift; + uint8_t pg_vibpos; + uint8_t process_all_slots; + uint8_t pad32_[1]; + + struct aymo_ymf262_chip_regs chip_regs; + struct aymo_ymf262_slot_regs slot_regs[AYMO_(SLOT_NUM_MAX)]; + struct aymo_ymf262_chan_regs ch2x_regs[AYMO_(CHANNEL_NUM_MAX)]; + + struct aymo_(reg_queue_item) rq_buffer[AYMO_(REG_QUEUE_LENGTH)]; + +#ifdef AYMO_DEBUG + // Variables for debug +#endif // AYMO_DEBUG +}; + +AYMO_PRAGMA_SCALAR_STORAGE_ORDER_DEFAULT + + +AYMO_PUBLIC const int8_t aymo_(sgo_side)[8]; +AYMO_PUBLIC const int8_t aymo_(sgo_cell)[8]; + +AYMO_PUBLIC const uint16_t aymo_(eg_incstep_table)[4]; + +AYMO_PUBLIC const struct aymo_(wave) aymo_(wave_table)[8]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */]; +AYMO_PUBLIC const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */]; + +AYMO_PUBLIC const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)]; +AYMO_PUBLIC const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)]; + +AYMO_PUBLIC const struct aymo_ymf262_vt aymo_(vt); + + +AYMO_PUBLIC const struct aymo_ymf262_vt* aymo_(get_vt)(void); +AYMO_PUBLIC uint32_t aymo_(get_sizeof)(void); +AYMO_PUBLIC void aymo_(ctor)(struct aymo_(chip)* chip); +AYMO_PUBLIC void aymo_(dtor)(struct aymo_(chip)* chip); +AYMO_PUBLIC uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address); +AYMO_PUBLIC void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value); +AYMO_PUBLIC int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel); +AYMO_PUBLIC void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count); +AYMO_PUBLIC void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); +AYMO_PUBLIC void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]); 
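Taken together, the entry points declared in this header form the public surface of the SSE4.1 core. A minimal usage sketch follows (illustrative only, not taken from the sources); the two-samples-per-frame sizing of the i16x2 output buffer and the use of register 0x105 to raise the OPL3 "NEW" bit are assumptions of this example:

    #include "aymo_ymf262_x86_sse41.h"

    static struct aymo_ymf262_x86_sse41_chip example_chip;
    static int16_t example_frames[2 * 128];   /* assumed: 2 interleaved samples per frame */

    void example_render(void)
    {
        aymo_ymf262_x86_sse41_ctor(&example_chip);
        aymo_ymf262_x86_sse41_write(&example_chip, 0x0105u, 0x01u);  /* assumed: OPL3 "NEW" bit */
        aymo_ymf262_x86_sse41_generate_i16x2(&example_chip, 128u, example_frames);
        aymo_ymf262_x86_sse41_dtor(&example_chip);
    }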
+AYMO_PUBLIC void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]); +AYMO_PUBLIC void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]); + + +// Slot group index to Channel group index +static inline +int aymo_(sgi_to_cgi)(int sgi) +{ +// return (((sgi / 4) * 2) | (sgi % 2)); + return (((sgi >> 1) & 2) | (sgi & 1)); +} + + +// Address to Slot index +static inline +int8_t aymo_(addr_to_slot)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x1F) | ((address >> 8) & 1)); + int8_t slot = aymo_ymf262_subaddr_to_slot[subaddr]; + return slot; +} + + +// Address to Channel_2xOP index +static inline +int8_t aymo_(addr_to_ch2x)(uint16_t address) +{ + uint16_t subaddr = ((address & 0x0F) | ((address >> 8) & 1)); + int8_t ch2x = aymo_ymf262_subaddr_to_ch2x[subaddr]; + return ch2x; +} + + +#ifndef AYMO_KEEP_SHORTHANDS + #undef AYMO_KEEP_SHORTHANDS + #undef AYMO_ + #undef aymo_ +#endif // AYMO_KEEP_SHORTHANDS + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#endif // _include_aymo_ymf262_x86_sse41_h diff --git a/include/meson.build b/include/meson.build new file mode 100644 index 0000000..a094049 --- /dev/null +++ b/include/meson.build @@ -0,0 +1,11 @@ + +aymo_headers = [ # TODO + 'aymo.h', + 'aymo_score.h', + 'aymo_score_avd.h', + 'aymo_score_dro.h', + 'aymo_score_imf.h', + 'aymo_ymf262.h', +] + +install_headers(aymo_headers, subdir: 'aymo') diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..b5d6244 --- /dev/null +++ b/meson.build @@ -0,0 +1,688 @@ +# This Meson build script is a heavily modified version of the +# "/meson.build" file of the OPUS codec project, adapted to +# suit AYMO. +# That script is used as a template for AYMO because it has a +# known and tested support for SIMD auto-detection within Meson. +# This way, any changes by the OPUS project can be applied to AYMO. +# +# OPUS project reference links: +# https://opus-codec.org/ +# https://github.com/xiph/opus/ +# +# Reference file snapshot: +# https://github.com/xiph/opus/blob/20c032d27c59d65b19b8ffbb2608e5282fe817eb/meson.build +# +# OPUS license disclaimer: +# --- BEGIN OPUS LICENSE --- +# +# Copyright 2001-2011 Xiph.Org, Skype Limited, Octasic, +# Jean-Marc Valin, Timothy B. Terriberry, +# CSIRO, Gregory Maxwell, Mark Borgerding, +# Erik de Castro Lopo +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# - Neither the name of Internet Society, IETF or IETF Trust, nor the +# names of specific contributors, may be used to endorse or promote +# products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# --- END OPUS LICENSE --- + +# ===================================================================== + +project('aymo', 'c', + version: '0.0.1', + meson_version: '>=0.60.0', + default_options: [ + 'warning_level=2', + 'c_std=gnu99', + 'buildtype=debugoptimized' + ], +) + +libversion = '0.0.1' + +cc = meson.get_compiler('c') +host_system = host_machine.system() +host_cpu_family = host_machine.cpu_family() +top_srcdir = meson.current_source_dir() # for opt_docs +top_builddir = meson.current_build_dir() # for opt_docs + +aymo_includes = include_directories('.', 'include') +aymo_public_includes = include_directories('include') + +# ===================================================================== + +add_project_arguments('-DAYMO_BUILD', language: 'c') +add_project_arguments('-DAYMO_HAVE_CONFIG_H', language: 'c') + +if host_system == 'windows' + if cc.get_argument_syntax() == 'msvc' + add_project_arguments('-D_CRT_SECURE_NO_WARNINGS', language: 'c') + endif +endif + +if cc.get_argument_syntax() == 'gcc' + #add_project_arguments('-D_FORTIFY_SOURCE=2', language: 'c') +endif + +# Check for extra compiler args +additional_c_args = [] +if cc.get_argument_syntax() == 'msvc' + additional_c_args += [ + ] +else # msvc + additional_c_args += [ + '-fvisibility=hidden', + '-Wcast-align', + '-Wnested-externs', + '-Wshadow', + '-Wstrict-prototypes', + ] + + # On Windows, -fstack-protector-strong adds a libssp-0.dll dependency and + # prevents static linking + if host_system != 'windows' + #additional_c_args += ['-fstack-protector-strong'] + endif +endif # msvc + +foreach arg : additional_c_args + if cc.has_argument(arg) + add_project_arguments(arg, language: 'c') + endif +endforeach + +# Windows MSVC warnings +if cc.get_id() == 'msvc' + # Ignore several spurious warnings. 
+ # If a warning is completely useless and spammy, use '/wdXXXX' to suppress it + # If a warning is harmless but hard to fix, use '/woXXXX' so it's shown once + # NOTE: Only add warnings here if you are sure they're spurious +# add_project_arguments('/wd4035', '/wd4715', '/wd4116', '/wd4046', '/wd4068', +# '/wd4820', '/wd4244', '/wd4255', '/wd4668', +# language : 'c') +endif + +# ===================================================================== + +aymo_version = meson.project_version() +aymo_url = 'https://github.com/TexZK/aymo/' + +aymo_conf = configuration_data() +aymo_conf.set('PACKAGE_BUGREPORT', '"texzk@email.it"') +aymo_conf.set('PACKAGE_NAME', '"aymo"') +aymo_conf.set('PACKAGE_STRING', '"aymo @0@"'.format(aymo_version)) +aymo_conf.set('PACKAGE_TARNAME', '"aymo"') +aymo_conf.set('PACKAGE_URL', '"@0@"'.format(aymo_url)) +aymo_conf.set('PACKAGE_VERSION', '"@0@"'.format(aymo_version)) + +aymo_conf.set('AYMO_CC_HOST_@0@'.format(host_system.underscorify().to_upper()), 1) +aymo_conf.set('AYMO_CC_ID_@0@'.format(cc.get_id().underscorify().to_upper()), 1) +aymo_conf.set('AYMO_CC_SYNTAX_@0@'.format(cc.get_argument_syntax().underscorify().to_upper()), 1) + +if cc.check_header('stdint.h') + aymo_conf.set('AYMO_CC_HAVE_STDINT_H', 1) +endif + +opt_apps = get_option('apps') +opt_asm = get_option('asm') +opt_docs = get_option('docs') +opt_rtcd = get_option('rtcd') +opt_tests = get_option('tests') + +if get_option('buildtype').startswith('debug') + add_project_arguments('-DDEBUG', language : 'c') + add_project_arguments('-D_DEBUG', language : 'c') + add_project_arguments('-DAYMO_DEBUG', language : 'c') +else + add_project_arguments('-DNDEBUG', language : 'c') +endif + +# ===================================================================== + +aymo_conf.set('AYMO_CPU_FAMILY_@0@'.format(host_cpu_family.underscorify().to_upper()), 1) + +# With GCC, Clang, ICC, etc, we differentiate between +# 'runtime support for this SIMD' and 'presume we have this SIMD', +# by checking whether the SIMD / intrinsics can be compiled by the compiler +# as-is ('presume') or with SIMD cflags ('support'). +# +# With MSVC, the compiler will always build SIMD/intrinsics targeting all +# specific instruction sets supported by that version of the compiler. +# No special arguments are ever needed. +# +# If runtime CPU detection is not disabled, we must always assume that +# we only have runtime 'support' for it. + +aymo_can_presume_simd = true +if cc.get_argument_syntax() == 'msvc' + if opt_rtcd.disabled() + warning('Building with an MSVC-like compiler and runtime CPU detection is disabled. Outputs may not run on all @0@ CPUs.'.format(host_cpu_family)) + else + aymo_can_presume_simd = false + endif +endif + +# TODO: NEON has 'hardfp' vs 'softfp' compiler configuration issues. +# When targeting 'AArch32 softfp', we sometimes need to explicitly pass +# '-mfloat-abi=softfp' to enable NEON (e.g. on Android). +# It should be set in the cross file. 
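On the C side, the "support" / "presume" distinction established here typically becomes a pair of preprocessor gates around each accelerated path. A minimal sketch, assuming only the AYMO_CPU_SUPPORT_* / AYMO_CPU_PRESUME_* macros generated by this script and the aymo_cpu_x86_get_extensions() helper used later in src/aymo_convert.c (the wrapper function name below is made up):

    #include "aymo_cpu.h"

    /* Whether the AVX2 code path may be taken at run time. */
    static int example_can_use_avx2(void)
    {
    #if !defined(AYMO_CPU_SUPPORT_X86_AVX2)
        return 0;   /* the compiler cannot build the AVX2 objects at all */
    #elif defined(AYMO_CPU_PRESUME_X86_AVX2)
        return 1;   /* AVX2 is part of the build baseline; no run-time test needed */
    #else
        return ((aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) != 0);
    #endif
    }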
+arm_neon_link_args = [] +if cc.get_argument_syntax() != 'msvc' + arm_neon_link_args += ['-mfpu=neon'] +endif + +aymo_have_none = true # always +aymo_have_x86_sse = false +aymo_have_x86_sse2 = false +aymo_have_x86_sse41 = false +aymo_have_x86_avx = false +aymo_have_x86_avx2 = false + +aymo_have_arm_neon = false + +rtcd_support_names = [] +intrin_support_names = [] + +if host_cpu_family in ['arm', 'aarch64'] + # Check for ARMv7/AArch64 neon intrinsics + intrin_check_code = ''' + #include + int main(void) { + static float32x4_t A0, A1, SUMM; + SUMM = vmlaq_f32(SUMM, A0, A1); + return (int)vgetq_lane_f32(SUMM, 0); + } + ''' + intrin_name = 'ARMv7/AArch64 NEON' + if cc.links(intrin_check_code, + name: 'compiler supports @0@ intrinsics'.format(intrin_name)) + aymo_arm_presume_neon = aymo_can_presume_simd + aymo_arm_support_neon = true + else + aymo_arm_presume_neon = false + if cc.links(intrin_check_code, + args: arm_neon_link_args, + name: 'compiler supports @0@ intrinsics with @1@' + .format(intrin_name, ' '.join(arm_neon_link_args))) + aymo_arm_support_neon = true + else + aymo_arm_support_neon = false + endif + endif + + if aymo_arm_support_neon + aymo_have_arm_neon = true + intrin_support_names += [intrin_name] + aymo_conf.set('AYMO_CPU_SUPPORT_ARM_NEON', 1) + if aymo_arm_presume_neon + aymo_conf.set('AYMO_CPU_PRESUME_ARM_NEON', 1) + else + rtcd_support_names += [intrin_name] + aymo_arm_neon_args = arm_neon_link_args + endif + else + message('Compiler does not support @0@ intrinsics'.format(intrin_name)) + endif + + # Check for aarch64 neon intrinsics + intrin_check_code = ''' + #include + int main(void) { + static int32_t x; + static int16_t y; + y = vqmovns_s32(x); + } + ''' + intrin_name = 'AArch64 NEON' + if cc.links(intrin_check_code, + name: 'compiler supports @0@ intrinsics'.format(intrin_name)) + aymo_arm_presume_aarch64 = aymo_can_presume_simd + aymo_arm_support_aarch64 = true + else + aymo_arm_presume_aarch64 = false + if cc.links(intrin_check_code, + args: arm_neon_link_args, + name: 'compiler supports @0@ intrinsics with @1@' + .format(intrin_name, ' '.join(arm_neon_link_args))) + aymo_arm_support_aarch64 = true + else + aymo_arm_support_aarch64 = false + endif + endif + + if aymo_arm_support_aarch64 + intrin_support_names += [intrin_name] + aymo_conf.set('AYMO_CPU_SUPPORT_ARM_AARCH64', 1) + if aymo_arm_presume_aarch64 + aymo_conf.set('AYMO_CPU_PRESUME_ARM_AARCH64', 1) + endif + else + message('Compiler does not support @0@ intrinsics'.format(intrin_name)) + endif + +elif host_cpu_family in ['x86', 'x86_64'] + # allow external override/specification of the flags + x86_intrinsics = [ + [ 'x86_sse', 'SSE', 'xmmintrin.h', '__m128', '_mm_setzero_ps()', [['-msse'], ['/arch:SSE']] ], + [ 'x86_sse2', 'SSE2', 'emmintrin.h', '__m128i', '_mm_setzero_si128()', [['-msse2'], ['/arch:SSE2']] ], + [ 'x86_sse41', 'SSE4.1', 'smmintrin.h', '__m128i', '_mm_setzero_si128(); x = _mm_cmpeq_epi64(x, x)', [['-msse4.1'], ['/arch:SSE2']] ], + [ 'x86_avx', 'AVX', 'immintrin.h', '__m256', '_mm256_setzero_ps()', [['-mavx'], ['/arch:AVX']] ], + [ 'x86_avx2', 'AVX2', 'immintrin.h', '__m256i', '_mm256_setzero_si256(); x = _mm256_cmpeq_epi16(x, x)', [['-mavx2'], ['/arch:AVX2']] ], + ] + foreach intrin : x86_intrinsics + intrin_check_code = ''' + #include <@0@> + int main(void) { + @1@ x; + x = @2@; + return 0; + } + '''.format(intrin[2], intrin[3], intrin[4]) + intrin_name = intrin[1] + # Intrinsics arguments are not available with MSVC-like compilers + intrin_args = ((cc.get_argument_syntax() == 'msvc') ? 
intrin[5][1] : intrin[5][0]) + if cc.links(intrin_check_code, + name: 'compiler supports @0@ intrinsics'.format(intrin_name)) + support_intrin = true + presume_intrin = aymo_can_presume_simd + elif intrin_args.length() > 0 + presume_intrin = false + support_intrin = false + if cc.links(intrin_check_code, + args: intrin_args, + name: 'compiler supports @0@ intrinsics with @1@' + .format(intrin_name, ' '.join(intrin_args))) + support_intrin = true + endif + endif # intrin_check_code + if support_intrin + intrin_support_names += [intrin_name] + intrin_lower_name = intrin[0] + set_variable('aymo_have_@0@'.format(intrin_lower_name), true) + intrin_upper_name = intrin_lower_name.to_upper() + aymo_conf.set('AYMO_CPU_SUPPORT_@0@'.format(intrin_upper_name), 1) + if presume_intrin + aymo_conf.set('AYMO_CPU_PRESUME_@0@'.format(intrin_upper_name), 1) + else + rtcd_support_names += [intrin_name] + set_variable('aymo_@0@_args'.format(intrin_lower_name), intrin_args) + endif + else + message('Compiler does not support @0@ intrinsics'.format(intrin_name)) + endif # support_intrin + endforeach # intrin + + if not opt_rtcd.disabled() + cpuid_h__cpuid_code = ''' + #include + int main(void) { + unsigned e1[4] = { 0u, 0u, 0u, 0u }; + __cpuid(1u, e1[0], e1[1], e1[2], e1[3]); + return 0; + } + ''' + cpuid_h__cpuid_count_code = ''' + #include + int main(void) { + unsigned e7[4] = { 0u, 0u, 0u, 0u }; + __cpuid_count(7u, 0u, e7[0], e7[1], e7[2], e7[3]); + return 0; + } + ''' + intrin_h__cpuid_code = ''' + #include + int main(void) { + int e1[4] = { 0, 0, 0, 0 }; + __cpuid(e1, 1); + return 0; + } + ''' + intrin_h__cpuidex_code = ''' + #include + int main(void) { + int e7[4] = { 0, 0, 0, 0 }; + __cpuidex(e7, 7, 0); + return 0; + } + ''' + have_cpuinfo = false + if cc.links(cpuid_h__cpuid_code, name: ' __cpuid()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID', 1) + have_cpuinfo = true + endif + if cc.links(cpuid_h__cpuid_count_code, name: ' __cpuid_count()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID_COUNT', 1) + have_cpuinfo = true + endif + if cc.links(intrin_h__cpuid_code, name: ' __cpuid()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUID', 1) + have_cpuinfo = true + endif + if cc.links(intrin_h__cpuidex_code, name: ' __cpuidex()') + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUIDEX', 1) + have_cpuinfo = true + endif + if have_cpuinfo + aymo_conf.set('AYMO_CPU_HAVE_CPUINFO', 1) + else + if opt_rtcd.enabled() + error('rtcd option is enabled, but no Get CPU Info method detected') + endif + warning('Get CPU Info method not detected, no rtcd for intrinsics') + endif + endif # opt_rtcd + + aymo_conf.set('AYMO_CPU_X86_AVX2_GATHER16_STRATEGY', 2) # TODO: option() + +else # host_cpu_family + warning('No intrinsics support for @0@'.format(host_cpu_family)) +endif # host_cpu_family + +# Check whether we require intrinsics and we support intrinsics on this cpu, +# but none were detected. Can happen because of incorrect compiler flags, such +# as missing -mfloat-abi=softfp on ARM32 softfp cpuitectures. 
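The __cpuid() / __cpuid_count() link checks above only record which CPU-info interface the compiler provides; the feature probe itself lives in src/aymo_cpu_x86.c, which is not part of this excerpt. As a rough sketch of such a probe with GCC/Clang's <cpuid.h>, using the architectural bit positions (leaf 1 ECX bit 19 for SSE4.1, leaf 7 subleaf 0 EBX bit 5 for AVX2); the returned flag values are illustrative and not AYMO's actual AYMO_CPU_X86_EXT_* constants:

    #include <cpuid.h>

    static unsigned example_probe_x86_extensions(void)
    {
        unsigned a, b, c, d;
        unsigned exts = 0u;
        __cpuid(1u, a, b, c, d);            /* standard feature flags */
        if (c & (1u << 19)) {
            exts |= 1u;                     /* SSE4.1 */
        }
        __cpuid_count(7u, 0u, a, b, c, d);  /* extended feature flags */
        if (b & (1u << 5)) {
            exts |= 2u;                     /* AVX2; a robust probe would also check the
                                               maximum leaf and OS AVX state via XGETBV */
        }
        return exts;
    }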
+if intrin_support_names.length() == 0 + warning('"intrinsics" option was enabled, but none were detected') +endif + +if opt_rtcd.disabled() + rtcd_support_names = 'disabled' +else + if rtcd_support_names.length() > 0 + aymo_conf.set('AYMO_CPU_HAVE_RTCD', 1) + else + if intrin_support_names.length() == 0 + rtcd_support_names = 'none' + if opt_rtcd.enabled() + warning('"rtcd" option is enabled, but no support for intrinsics is available') + endif + else + rtcd_support_names = 'not needed' + endif + endif +endif # opt_rtcd + +# ===================================================================== + +sources = { + 'AYMO_HEADERS': files( + 'include/aymo.h', + 'include/aymo_cc.h', + 'include/aymo_cpu.h', + 'include/aymo_convert.h', + 'include/aymo_convert_arm_neon.h', + 'include/aymo_convert_none.h', + 'include/aymo_convert_x86_avx2.h', + 'include/aymo_convert_x86_sse41.h', + 'include/aymo_score.h', + 'include/aymo_score_avd.h', + 'include/aymo_score_dro.h', + 'include/aymo_score_imf.h', + 'include/aymo_score_raw.h', + 'include/aymo_wave.h', + 'include/aymo_ymf262_arm_neon.h', + 'include/aymo_ymf262_none.h', + 'include/aymo_ymf262_x86_avx.h', + 'include/aymo_ymf262_x86_avx2.h', + 'include/aymo_ymf262_x86_sse41.h', + ), + + 'AYMO_SOURCES': files( + 'src/aymo.c', + 'src/aymo_convert.c', + 'src/aymo_convert_none.c', + 'src/aymo_cpu.c', + 'src/aymo_score.c', + 'src/aymo_score_avd.c', + 'src/aymo_score_dro.c', + 'src/aymo_score_imf.c', + 'src/aymo_score_raw.c', + 'src/aymo_tda8425.c', + 'src/aymo_tda8425_common.c', + 'src/aymo_tda8425_none.c', + 'src/aymo_wave.c', + 'src/aymo_ym7128.c', + 'src/aymo_ym7128_common.c', + 'src/aymo_ym7128_none.c', + 'src/aymo_ymf262.c', + 'src/aymo_ymf262_common.c', + 'src/aymo_ymf262_none.c', + ), + + 'AYMO_SOURCES_X86': files ( + 'src/aymo_cpu_x86.c', + ), + + 'AYMO_SOURCES_X86_SSE41': files ( + 'src/aymo_convert_x86_sse41.c', + 'src/aymo_tda8425_x86_sse41.c', + 'src/aymo_ym7128_x86_sse41.c', + 'src/aymo_ymf262_x86_sse41.c', + ), + + 'AYMO_SOURCES_X86_AVX': files ( + 'src/aymo_ymf262_x86_avx.c', + ), + + 'AYMO_SOURCES_X86_AVX2': files ( + 'src/aymo_convert_x86_avx2.c', + 'src/aymo_tda8425_x86_avx2.c', + 'src/aymo_ymf262_x86_avx2.c', + ), + + 'AYMO_SOURCES_ARM': files ( + 'src/aymo_cpu_arm.c', + ), + + 'AYMO_SOURCES_ARM_NEON': files ( + 'src/aymo_convert_arm_neon.c', + 'src/aymo_tda8425_arm_neon.c', + 'src/aymo_ym7128_arm_neon.c', + 'src/aymo_ymf262_arm_neon.c', + ), + + 'AYMO_SOURCES_LIBC': files ( + 'src/aymo_file.c', + ), + + 'AYMO_SOURCES_AYMO': files ( + 'src/aymo_empty.c', + ), +} + +# ===================================================================== + +libm = cc.find_library('m', required: false) + +aymo_c_args = [] + +# Assembly code listings +if cc.get_argument_syntax() == 'msvc' + if not opt_asm.disabled() + aymo_c_args += ['/FAcs'] + endif +else + if opt_asm.enabled() + aymo_c_args += ['-S', '-fverbose-asm', '-masm=intel', '-Wa,-adhln'] + endif +endif + +subdir('contrib') + +aymo_sources = sources['AYMO_SOURCES'] +aymo_x86_sse41_sources = sources['AYMO_SOURCES_X86_SSE41'] +aymo_x86_avx_sources = sources['AYMO_SOURCES_X86_AVX'] +aymo_x86_avx2_sources = sources['AYMO_SOURCES_X86_AVX2'] +aymo_arm_neon_sources = sources['AYMO_SOURCES_ARM_NEON'] + +aymo_static_libs = [] + +foreach intr_name : ['x86_sse41', 'x86_avx', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + intr_sources = get_variable('aymo_@0@_sources'.format(intr_name)) + intr_args = get_variable('aymo_@0@_args'.format(intr_name), []) + + 
aymo_static_libs += static_library( + 'aymo-static_@0@'.format(intr_name), + intr_sources, + c_args: aymo_c_args + intr_args, + include_directories: [aymo_includes, aymo_contrib_includes], + link_with: aymo_contrib_lib, + install: false, + ) + endif +endforeach + + +if host_cpu_family in ['x86', 'x86_64'] + aymo_sources += sources['AYMO_SOURCES_X86'] +endif +if host_cpu_family in ['arm', 'aarch64'] + aymo_sources += sources['AYMO_SOURCES_ARM'] +endif + + +if host_system in ['windows', 'cygwin'] + aymo_sources += 'src/aymo_sys_windows.c' +elif host_system in ['linux'] + aymo_sources += 'src/aymo_sys_linux.c' +endif + + +aymo_static_lib = static_library( + 'aymo-static', + aymo_sources, + c_args: aymo_c_args, + include_directories: aymo_includes, + link_with: aymo_static_libs, + dependencies: aymo_contrib_dep, + install: false, +) + +aymo_static_dep = declare_dependency( + include_directories: aymo_includes, + link_with: aymo_static_lib, + dependencies: aymo_contrib_dep, +) + + +aymo_target_lib = library( + 'aymo', + sources['AYMO_SOURCES_AYMO'], + version: libversion, + # darwin_versions: macosversion, # TODO: + link_with: aymo_static_lib, + install: true, +) + +aymo_target_dep = declare_dependency( + include_directories: aymo_includes, + link_with: aymo_target_lib, +) + + +aymo_libc_lib = static_library( + 'aymo-libc', + sources['AYMO_SOURCES_LIBC'], + c_args: aymo_c_args, + include_directories: aymo_includes, + install: false, +) + +aymo_libc_dep = declare_dependency( + include_directories: aymo_includes, + link_with: aymo_libc_lib, +) + +# ===================================================================== + +# pkg-config files (not using pkg module so we can use the existing .pc.in file) +pkgconf = configuration_data() + +pkgconf.set('prefix', join_paths(get_option('prefix'))) +pkgconf.set('exec_prefix', '${prefix}') +pkgconf.set('libdir', '${prefix}/@0@'.format(get_option('libdir'))) +pkgconf.set('includedir', '${prefix}/@0@'.format(get_option('includedir'))) +pkgconf.set('VERSION', aymo_version) +pkgconf.set('URL', aymo_url) + +pkg_install_dir = '@0@/pkgconfig'.format(get_option('libdir')) + +configure_file( + input: 'aymo.pc.in', + output: 'aymo.pc', + configuration: pkgconf, + install_dir: pkg_install_dir +) + +# ===================================================================== + +configure_file( + output: 'aymo_config.h', + configuration: aymo_conf, +# macro_name: 'INCLUDE_AYMO_CONFIG_H', +) + +subdir('include') + +subdir('apps') + +if not opt_tests.disabled() + subdir('tests') +endif + +# ===================================================================== + +# TODO: Doxygen +#doxygen = find_program('doxygen', required: get_option('docs')) +#if doxygen.found() +# subdir('doc') +#endif + +# ===================================================================== + +summary( + { + 'Run-time CPU detection': rtcd_support_names, + 'Generate Assembly Files': opt_asm.enabled(), + }, + section: 'Compilation', + bool_yn: true, + list_sep: ', ', +) + +summary( + { +# 'API documentation': doxygen.found(), # TODO: Docygen + 'Apps': not opt_apps.disabled(), + 'Tests': not opt_tests.disabled(), + }, + section: 'Components', + bool_yn: true, + list_sep: ', ', +) diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000..8c87e79 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1,8 @@ +# Compilation +option('asm', type : 'feature', value : 'auto', description : 'Generate Assembly Files') +option('rtcd', type : 'feature', value : 'auto', description : 'Run-Time CPU 
Detection') + +# Components +option('apps', type : 'feature', value : 'auto', description : 'Build Applications') +option('docs', type: 'feature', value: 'auto', description: 'Build Documentation') +option('tests', type : 'feature', value : 'auto', description : 'Build Tests') diff --git a/msvc-arm.txt b/msvc-arm.txt new file mode 100644 index 0000000..cedb2df --- /dev/null +++ b/msvc-arm.txt @@ -0,0 +1,14 @@ +[binaries] +c = 'cl' +cpp = 'cl' +ar = 'lib' +windres = 'rc' + +[built-in options] +c_std = 'c99' + +[host_machine] +system = 'windows' +cpu_family = 'arm' +cpu = 'armv7' +endian = 'little' diff --git a/msvc-arm_env.bat b/msvc-arm_env.bat new file mode 100644 index 0000000..66e46b2 --- /dev/null +++ b/msvc-arm_env.bat @@ -0,0 +1,4 @@ +rem Run this script to set up the environment for the MSVC ARM compiler +"D:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsamd64_arm.bat" +cd "D:\Documenti\GitHub\aymo" +meson setup vs --backend vs --cross-file msvc-arm.txt diff --git a/src/aymo.c b/src/aymo.c new file mode 100644 index 0000000..d6c7c60 --- /dev/null +++ b/src/aymo.c @@ -0,0 +1,35 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ + +#include "aymo.h" +#include "aymo_convert.h" +#include "aymo_cpu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +void aymo_boot(void) +{ + aymo_cpu_boot(); + aymo_convert_boot(); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_convert.c b/src/aymo_convert.c new file mode 100644 index 0000000..0cecf45 --- /dev/null +++ b/src/aymo_convert.c @@ -0,0 +1,206 @@ +// Run-time dispatchers for the AYMO data conversion functions. +// The target-specific implementations are selected by aymo_convert_boot(). +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>.
+*/ + +#include "aymo_convert.h" +#include "aymo_convert_arm_neon.h" +#include "aymo_convert_none.h" +#include "aymo_convert_x86_avx2.h" +#include "aymo_convert_x86_sse41.h" +#include "aymo_cpu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// Dispatcher function types +typedef void (*aymo_convert_i16_f32_f)(size_t n, const int16_t i16v[], float f32v[]); +typedef void (*aymo_convert_f32_i16_f)(size_t n, const float f32v[], int16_t i16v[]); +typedef void (*aymo_convert_i16_f32_1_f)(size_t n, const int16_t i16v[], float f32v[]); +typedef void (*aymo_convert_f32_i16_1_f)(size_t n, const float f32v[], int16_t i16v[]); +typedef void (*aymo_convert_i16_f32_k_f)(size_t n, const int16_t i16v[], float f32v[], float scale); +typedef void (*aymo_convert_f32_i16_k_f)(size_t n, const float f32v[], int16_t i16v[], float scale); +typedef void (*aymo_convert_u16_f32_f)(size_t n, const uint16_t u16v[], float f32v[]); +typedef void (*aymo_convert_f32_u16_f)(size_t n, const float f32v[], uint16_t u16v[]); +typedef void (*aymo_convert_u16_f32_1_f)(size_t n, const uint16_t u16v[], float f32v[]); +typedef void (*aymo_convert_f32_u16_1_f)(size_t n, const float f32v[], uint16_t u16v[]); +typedef void (*aymo_convert_u16_f32_k_f)(size_t n, const uint16_t u16v[], float f32v[], float scale); +typedef void (*aymo_convert_f32_u16_k_f)(size_t n, const float f32v[], uint16_t u16v[], float scale); + +// Dispatcher function pointers +static aymo_convert_i16_f32_f aymo_convert_i16_f32_p; +static aymo_convert_f32_i16_f aymo_convert_f32_i16_p; +static aymo_convert_i16_f32_1_f aymo_convert_i16_f32_1_p; +static aymo_convert_f32_i16_1_f aymo_convert_f32_i16_1_p; +static aymo_convert_i16_f32_k_f aymo_convert_i16_f32_k_p; +static aymo_convert_f32_i16_k_f aymo_convert_f32_i16_k_p; +static aymo_convert_u16_f32_f aymo_convert_u16_f32_p; +static aymo_convert_f32_u16_f aymo_convert_f32_u16_p; +static aymo_convert_u16_f32_1_f aymo_convert_u16_f32_1_p; +static aymo_convert_f32_u16_1_f aymo_convert_f32_u16_1_p; +static aymo_convert_u16_f32_k_f aymo_convert_u16_f32_k_p; +static aymo_convert_f32_u16_k_f aymo_convert_f32_u16_k_p; + + +void aymo_convert_boot(void) +{ +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + aymo_convert_i16_f32_p = aymo_convert_x86_avx2_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_x86_avx2_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_x86_avx2_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_x86_avx2_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_x86_avx2_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_x86_avx2_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_x86_avx2_u16_f32; + aymo_convert_f32_u16_p = aymo_convert_x86_avx2_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_x86_avx2_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_x86_avx2_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_x86_avx2_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_x86_avx2_f32_u16_k; + return; + } +#endif // AYMO_CPU_SUPPORT_X86_AVX2 + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_convert_i16_f32_p = aymo_convert_x86_sse41_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_x86_sse41_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_x86_sse41_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_x86_sse41_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_x86_sse41_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_x86_sse41_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_x86_sse41_u16_f32; + 
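    /* Note: these dispatcher pointers have static storage duration, so they start
       out NULL; aymo_convert_boot(), normally reached through aymo_boot(), must
       therefore run before any of the public aymo_convert_* wrappers below are
       called. A minimal call sequence (illustrative; buffer names are made up):
           aymo_boot();
           aymo_convert_i16_f32(frame_count, pcm_i16, pcm_f32);
    */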
aymo_convert_f32_u16_p = aymo_convert_x86_sse41_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_x86_sse41_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_x86_sse41_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_x86_sse41_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_x86_sse41_f32_u16_k; + return; + } +#endif // AYMO_CPU_SUPPORT_X86_SSE41 + +#if 0//def AYMO_CPU_SUPPORT_ARM_NEON //FIXME: TODO: + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_convert_i16_f32_p = aymo_convert_arm_neon_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_arm_neon_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_arm_neon_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_arm_neon_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_arm_neon_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_arm_neon_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_arm_neon_u16_f32; + aymo_convert_f32_u16_p = aymo_convert_arm_neon_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_arm_neon_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_arm_neon_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_arm_neon_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_arm_neon_f32_u16_k; + return; + } +#endif // AYMO_CPU_SUPPORT_ARM_NEON + + // Default dispatcher functions + aymo_convert_i16_f32_p = aymo_convert_none_i16_f32; + aymo_convert_f32_i16_p = aymo_convert_none_f32_i16; + aymo_convert_i16_f32_1_p = aymo_convert_none_i16_f32_1; + aymo_convert_f32_i16_1_p = aymo_convert_none_f32_i16_1; + aymo_convert_i16_f32_k_p = aymo_convert_none_i16_f32_k; + aymo_convert_f32_i16_k_p = aymo_convert_none_f32_i16_k; + aymo_convert_u16_f32_p = aymo_convert_none_u16_f32; + aymo_convert_f32_u16_p = aymo_convert_none_f32_u16; + aymo_convert_u16_f32_1_p = aymo_convert_none_u16_f32_1; + aymo_convert_f32_u16_1_p = aymo_convert_none_f32_u16_1; + aymo_convert_u16_f32_k_p = aymo_convert_none_u16_f32_k; + aymo_convert_f32_u16_k_p = aymo_convert_none_f32_u16_k; +} + + +void aymo_convert_i16_f32(size_t n, const int16_t i16v[], float f32v[]) +{ + aymo_convert_i16_f32_p(n, i16v, f32v); +} + + +void aymo_convert_f32_i16(size_t n, const float f32v[], int16_t i16v[]) +{ + aymo_convert_f32_i16_p(n, f32v, i16v); +} + + +void aymo_convert_i16_f32_1(size_t n, const int16_t i16v[], float f32v[]) +{ + aymo_convert_i16_f32_1_p(n, i16v, f32v); +} + + +void aymo_convert_f32_i16_1(size_t n, const float f32v[], int16_t i16v[]) +{ + aymo_convert_f32_i16_1_p(n, f32v, i16v); +} + + +void aymo_convert_i16_f32_k(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + aymo_convert_i16_f32_k_p(n, i16v, f32v, scale); +} + + +void aymo_convert_f32_i16_k(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + aymo_convert_f32_i16_k_p(n, f32v, i16v, scale); +} + + +void aymo_convert_u16_f32(size_t n, const uint16_t u16v[], float f32v[]) +{ + aymo_convert_u16_f32_p(n, u16v, f32v); +} + + +void aymo_convert_f32_u16(size_t n, const float f32v[], uint16_t u16v[]) +{ + aymo_convert_f32_u16_p(n, f32v, u16v); +} + + +void aymo_convert_u16_f32_1(size_t n, const uint16_t u16v[], float f32v[]) +{ + aymo_convert_u16_f32_1_p(n, u16v, f32v); +} + + +void aymo_convert_f32_u16_1(size_t n, const float f32v[], uint16_t u16v[]) +{ + aymo_convert_f32_u16_1_p(n, f32v, u16v); +} + + +void aymo_convert_u16_f32_k(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + aymo_convert_u16_f32_k_p(n, u16v, f32v, scale); +} + + +void aymo_convert_f32_u16_k(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + 
aymo_convert_f32_u16_k_p(n, f32v, u16v, scale); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_convert_arm_neon.c b/src/aymo_convert_arm_neon.c new file mode 100644 index 0000000..487588a --- /dev/null +++ b/src/aymo_convert_arm_neon.c @@ -0,0 +1,821 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_arm_neon.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static inline float reinterpret_f32_i32(int32_t i32) +{ + union { float f; int32_t i; } u; + u.i = i32; + return u.f; +} + + +#undef mm_extract_ps +#define mm_extract_ps(a, imm8) \ + (reinterpret_f32_i32(_mm_extract_epi32(_mm_castps_si128(a), (imm8)))) + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + int16x8_t s16 = vld1q_s16(i16v); i16v += 8; + int32x4_t s32lo = vmovl_s16(vget_low_s16(s16)); + int32x4_t s32hi = vmovl_s16(vget_high_s16(s16)); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + float32x4_t f32hi = vcvtq_f32_s32(s32hi); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + int16x4_t s16 = vld1_s16(i16v); i16v += 4; + int32x4_t s32lo = vmovl_s16(s16); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + int32_t i32t[4] = { i16v[0], i16v[1], i16v[2], 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + int32_t i32t[4] = { i16v[0], i16v[1], 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + int32_t i32t[4] = { i16v[0], 0, 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int32x4_t s32hi = vcvtq_s32_f32(f32hi); + int16x4_t s16lo = vqmovn_s32(s32lo); + int16x4_t s16hi = vqmovn_s32(s32hi); + vst1q_s16(i16v, vcombine_s16(s16lo, s16hi)); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo 
= vqmovn_s32(s32lo); + vst1_s16(i16v, s16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + i16v[2] = vget_lane_s16(s16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + int16x8_t s16 = vld1q_s16(i16v); i16v += 8; + int32x4_t s32lo = vmovl_s16(vget_low_s16(s16)); + int32x4_t s32hi = vmovl_s16(vget_high_s16(s16)); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + float32x4_t f32hi = vcvtq_f32_s32(s32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + int16x4_t s16 = vld1_s16(i16v); i16v += 4; + int32x4_t s32lo = vmovl_s16(s16); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + int32_t i32t[4] = { i16v[0], i16v[1], i16v[2], 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + int32_t i32t[4] = { i16v[0], i16v[1], 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + int32_t i32t[4] = { i16v[0], 0, 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int32x4_t s32hi = vcvtq_s32_f32(f32hi); + int16x4_t s16lo = vqmovn_s32(s32lo); + int16x4_t s16hi = vqmovn_s32(s32hi); + vst1q_s16(i16v, vcombine_s16(s16lo, s16hi)); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + vst1_s16(i16v, s16lo); i16v += 4; + n %= 4; + if 
(n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + i16v[2] = vget_lane_s16(s16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + int16x8_t s16 = vld1q_s16(i16v); i16v += 8; + int32x4_t s32lo = vmovl_s16(vget_low_s16(s16)); + int32x4_t s32hi = vmovl_s16(vget_high_s16(s16)); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + float32x4_t f32hi = vcvtq_f32_s32(s32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + int16x4_t s16 = vld1_s16(i16v); i16v += 4; + int32x4_t s32lo = vmovl_s16(s16); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + int32_t i32t[4] = { i16v[0], i16v[1], i16v[2], 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + int32_t i32t[4] = { i16v[0], i16v[1], 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + int32_t i32t[4] = { i16v[0], 0, 0, 0 }; + int32x4_t s32lo = vld1q_s32(i32t); + float32x4_t f32lo = vcvtq_f32_s32(s32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int32x4_t s32hi = vcvtq_s32_f32(f32hi); + int16x4_t s16lo = vqmovn_s32(s32lo); + int16x4_t s16hi = vqmovn_s32(s32hi); + vst1q_s16(i16v, vcombine_s16(s16lo, s16hi)); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + vst1_s16(i16v, s16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } 
+ switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + i16v[2] = vget_lane_s16(s16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + i16v[1] = vget_lane_s16(s16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + int32x4_t s32lo = vcvtq_s32_f32(f32lo); + int16x4_t s16lo = vqmovn_s32(s32lo); + i16v[0] = vget_lane_s16(s16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + uint16x8_t u16 = vld1q_u16(u16v); u16v += 8; + uint32x4_t u32lo = vmovl_u16(vget_low_u16(u16)); + uint32x4_t u32hi = vmovl_u16(vget_high_u16(u16)); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + float32x4_t f32hi = vcvtq_f32_u32(u32hi); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + uint16x4_t u16 = vld1_u16(u16v); u16v += 4; + uint32x4_t u32lo = vmovl_u16(u16); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + uint32_t u32t[4] = { u16v[0], u16v[1], u16v[2], 0 }; + uint32x4_t u32lo = vld1q_u32(u32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + uint32_t u32t[4] = { u16v[0], u16v[1], 0, 0 }; + uint32x4_t u32lo = vld1q_u32(u32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + uint32_t u32t[4] = { u16v[0], 0, 0, 0 }; + uint32x4_t u32lo = vld1q_u32(u32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint32x4_t u32hi = vcvtq_u32_f32(f32hi); + uint16x4_t u16lo = vqmovn_u32(u32lo); + uint16x4_t u16hi = vqmovn_u32(u32hi); + vst1q_u16(u16v, vcombine_u16(u16lo, u16hi)); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + vst1_u16(u16v, u16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + u16v[2] = vget_lane_u16(u16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = 
vld1q_f32(f32t); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + uint16x8_t u16 = vld1q_u16(u16v); u16v += 8; + uint32x4_t u32lo = vmovl_u16(vget_low_u16(u16)); + uint32x4_t u32hi = vmovl_u16(vget_high_u16(u16)); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + float32x4_t f32hi = vcvtq_f32_u32(u32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + uint16x4_t u16 = vld1_u16(u16v); u16v += 4; + uint32x4_t u32lo = vmovl_u16(u16); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + uint32_t i32t[4] = { u16v[0], u16v[1], u16v[2], 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + uint32_t i32t[4] = { u16v[0], u16v[1], 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + uint32_t i32t[4] = { u16v[0], 0, 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint32x4_t u32hi = vcvtq_u32_f32(f32hi); + uint16x4_t u16lo = vqmovn_u32(u32lo); + uint16x4_t u16hi = vqmovn_u32(u32hi); + vst1q_u16(u16v, vcombine_u16(u16lo, u16hi)); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + vst1_u16(u16v, u16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + u16v[2] = vget_lane_u16(u16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = 
vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + uint16x8_t u16 = vld1q_u16(u16v); u16v += 8; + uint32x4_t u32lo = vmovl_u16(vget_low_u16(u16)); + uint32x4_t u32hi = vmovl_u16(vget_high_u16(u16)); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + float32x4_t f32hi = vcvtq_f32_u32(u32hi); + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + vst1q_f32(f32v, f32hi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + uint16x4_t u16 = vld1_u16(u16v); u16v += 4; + uint32x4_t u32lo = vmovl_u16(u16); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + vst1q_f32(f32v, f32lo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + uint32_t i32t[4] = { u16v[0], u16v[1], u16v[2], 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + f32v[2] = vgetq_lane_f32(f32lo, 2); + break; + } + case 2: { + uint32_t i32t[4] = { u16v[0], u16v[1], 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + f32v[1] = vgetq_lane_f32(f32lo, 1); + break; + } + case 1: { + uint32_t i32t[4] = { u16v[0], 0, 0, 0 }; + uint32x4_t u32lo = vld1q_u32(i32t); + float32x4_t f32lo = vcvtq_f32_u32(u32lo); + f32lo = vmulq_f32(f32lo, psk); + f32v[0] = vgetq_lane_f32(f32lo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + float32x4_t psk = vdupq_n_f32(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + float32x4_t f32hi = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + f32hi = vmulq_f32(f32hi, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint32x4_t u32hi = vcvtq_u32_f32(f32hi); + uint16x4_t u16lo = vqmovn_u32(u32lo); + uint16x4_t u16hi = vqmovn_u32(u32hi); + vst1q_u16(u16v, vcombine_u16(u16lo, u16hi)); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + float32x4_t f32lo = vld1q_f32(f32v); f32v += 4; + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + vst1_u16(u16v, u16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + float f32t[4] = { f32v[0], f32v[1], f32v[2], .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + u16v[2] = vget_lane_u16(u16lo, 2); + break; + } + case 2: { + float f32t[4] = { f32v[0], f32v[1], .0f, .0f }; + float32x4_t f32lo = 
vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + u16v[1] = vget_lane_u16(u16lo, 1); + break; + } + case 1: { + float f32t[4] = { f32v[0], .0f, .0f, .0f }; + float32x4_t f32lo = vld1q_f32(f32t); + f32lo = vmulq_f32(f32lo, psk); + uint32x4_t u32lo = vcvtq_u32_f32(f32lo); + uint16x4_t u16lo = vqmovn_u32(u32lo); + u16v[0] = vget_lane_u16(u16lo, 0); + break; + } + default: break; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_convert_none.c b/src/aymo_convert_none.c new file mode 100644 index 0000000..da0590f --- /dev/null +++ b/src/aymo_convert_none.c @@ -0,0 +1,177 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_none.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static inline float convert_i16_f32(int16_t i) +{ + return (float)i; +} + + +static inline int16_t convert_f32_i16(float f) +{ + if (f >= (float)INT16_MAX) { + return INT16_MAX; + } + if (f < (float)INT16_MIN) { + return INT16_MIN; + } + return (int16_t)f; +} + + +static inline float convert_u16_f32(uint16_t u) +{ + return (float)u; +} + + +static inline uint16_t convert_f32_u16(float f) +{ + if (f >= (float)UINT16_MAX) { + return UINT16_MAX; + } + if (f < 0.f) { + return 0u; + } + return (uint16_t)f; +} + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + const int16_t* i16e = (i16v + n); + while (i16v != i16e) { + *f32v++ = convert_i16_f32(*i16v++); + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *i16v++ = convert_f32_i16(*f32v++); + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. 
/ 32768.); + const int16_t* i16e = (i16v + n); + while (i16v != i16e) { + *f32v++ = (convert_i16_f32(*i16v++) * scale); + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + const float* f32e = (f32v + n); + while (f32v != f32e) { + *i16v++ = convert_f32_i16(*f32v++ * scale); + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + const int16_t* i16e = (i16v + n); + while (i16v != i16e) { + *f32v++ = (convert_i16_f32(*i16v++) * scale); + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *i16v++ = convert_f32_i16(*f32v++ * scale); + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const uint16_t* u16e = (u16v + n); + while (u16v != u16e) { + *f32v++ = convert_u16_f32(*u16v++); + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *u16v++ = convert_f32_u16(*f32v++); + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + const uint16_t* u16e = (u16v + n); + while (u16v != u16e) { + *f32v++ = (convert_u16_f32(*u16v++) * scale); + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + const float* f32e = (f32v + n); + while (f32v != f32e) { + *u16v++ = convert_f32_u16(*f32v++ * scale); + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + const uint16_t* u16e = (u16v + n); + while (u16v != u16e) { + *f32v++ = (convert_u16_f32(*u16v++) * scale); + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + const float* f32e = (f32v + n); + while (f32v != f32e) { + *u16v++ = convert_f32_u16(*f32v++ * scale); + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_convert_x86_avx2.c b/src/aymo_convert_x86_avx2.c new file mode 100644 index 0000000..a3de900 --- /dev/null +++ b/src/aymo_convert_x86_avx2.c @@ -0,0 +1,335 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
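As a usage sketch of the plain-C fallback defined above (the aymo_convert_none_* symbol names are assumed here, by analogy with the aymo_convert_x86_sse41_* names that the AVX2 tails call below; the aymo_() shorthand presumably expands to them):

#include <stddef.h>
#include <stdint.h>
#include "aymo_convert_none.h"

/* Normalize a 16-bit PCM block to floats, process it, then convert back with saturation. */
static void process_block(int16_t pcm[], float work[], size_t n)
{
    aymo_convert_none_i16_f32_1(n, pcm, work);   /* scales each sample by 1/32768 */
    /* ... filter or mix work[] here ... */
    aymo_convert_none_f32_i16_1(n, work, pcm);   /* scales by 32768, clamps to the int16_t range */
}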
+*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_convert_x86_sse41.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_avx2.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epi16 = _mm256_loadu_si256((const void*)i16v); i16v += 16; + __m256i epi32lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 0)); + __m256i epi32hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_i16_f32(n, i16v, f32v); + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epi16 = _mm256_packs_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)i16v, epi16); i16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_i16(n, f32v, i16v); + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epi16 = _mm256_loadu_si256((const void*)i16v); i16v += 16; + __m256i epi32lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 0)); + __m256i epi32hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_i16_f32_1(n, i16v, f32v); + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epi16 = _mm256_packs_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)i16v, epi16); i16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_i16_1(n, f32v, i16v); + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + 
do { + __m256i epi16 = _mm256_loadu_si256((const void*)i16v); i16v += 16; + __m256i epi32lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 0)); + __m256i epi32hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(epi16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_i16_f32_k(n, i16v, f32v, scale); + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epi16 = _mm256_packs_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)i16v, epi16); i16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_i16_k(n, f32v, i16v, scale); + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epu16 = _mm256_loadu_si256((const void*)u16v); u16v += 16; + __m256i epi32lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 0)); + __m256i epi32hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_u16_f32(n, u16v, f32v); + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epu16 = _mm256_packus_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)u16v, epu16); u16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_u16(n, f32v, u16v); + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. 
/ 32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epu16 = _mm256_loadu_si256((const void*)u16v); u16v += 16; + __m256i epi32lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 0)); + __m256i epi32hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_u16_f32_1(n, u16v, f32v); + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epu16 = _mm256_packus_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)u16v, epu16); u16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_u16_1(n, f32v, u16v); + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256i epu16 = _mm256_loadu_si256((const void*)u16v); u16v += 16; + __m256i epi32lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 0)); + __m256i epi32hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(epu16, 1)); + __m256 pslo = _mm256_cvtepi32_ps(epi32lo); + __m256 pshi = _mm256_cvtepi32_ps(epi32hi); + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + _mm256_storeu_ps((void*)f32v, pslo); f32v += 8; + _mm256_storeu_ps((void*)f32v, pshi); f32v += 8; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_u16_f32_k(n, u16v, f32v, scale); + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + __m256 psk = _mm256_set1_ps(scale); + if (n >= 16) { + size_t nw = (n / 16); + n %= 16; + do { + __m256 pslo = _mm256_loadu_ps((const void*)f32v); f32v += 8; + __m256 pshi = _mm256_loadu_ps((const void*)f32v); f32v += 8; + pslo = _mm256_mul_ps(pslo, psk); + pshi = _mm256_mul_ps(pshi, psk); + __m256i epi32lo = _mm256_cvtps_epi32(pslo); + __m256i epi32hi = _mm256_cvtps_epi32(pshi); + __m128i epi32lohi = _mm256_extracti128_si256(epi32lo, 1); + __m128i epi32hilo = _mm256_extracti128_si256(epi32hi, 0); + epi32lo = _mm256_inserti128_si256(epi32lo, epi32hilo, 1); + epi32hi = _mm256_inserti128_si256(epi32hi, epi32lohi, 0); + __m256i epu16 = _mm256_packus_epi32(epi32lo, epi32hi); + _mm256_storeu_si256((void*)u16v, epu16); u16v += 16; + } while (--nw); + } + if (n) { + aymo_convert_x86_sse41_f32_u16_k(n, f32v, u16v, scale); + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/src/aymo_convert_x86_sse41.c b/src/aymo_convert_x86_sse41.c new file mode 100644 index 0000000..56d2d49 --- /dev/null +++ 
b/src/aymo_convert_x86_sse41.c @@ -0,0 +1,796 @@ +// CPU-specific inline methods for ARM NEON. +// Only #include after "aymo_cpu.h" to have inline methods. +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +static inline float reinterpret_f32_i32(int32_t i32) +{ + union { float f; int32_t i; } u; + u.i = i32; + return u.f; +} + + +#undef mm_extract_ps +#define mm_extract_ps(a, imm8) \ + (reinterpret_f32_i32(_mm_extract_epi32(_mm_castps_si128(a), (imm8)))) + + +void aymo_(i16_f32)(size_t n, const int16_t i16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epi16 = _mm_loadu_si128((const void*)i16v); i16v += 8; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16); + epi16 = _mm_shuffle_epi32(epi16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epi32hi = _mm_cvtepi16_epi32(epi16); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + __m128 pshi = _mm_cvtepi32_ps(epi32hi); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epi16lo = _mm_loadl_epi64((const void*)i16v); i16v += 4; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16lo); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], i16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16)(size_t n, const float f32v[], int16_t i16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi32hi = _mm_cvtps_epi32(pshi); + __m128i epi16 = _mm_packs_epi32(epi32lo, epi32hi); + _mm_storeu_si128((void*)i16v, epi16); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + _mm_storel_epi64((void*)i16v, epi16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 
3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + i16v[2] = _mm_extract_epi16(epi16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_1)(size_t n, const int16_t i16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epi16 = _mm_loadu_si128((const void*)i16v); i16v += 8; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16); + epi16 = _mm_shuffle_epi32(epi16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epi32hi = _mm_cvtepi16_epi32(epi16); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + __m128 pshi = _mm_cvtepi32_ps(epi32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epi16lo = _mm_loadl_epi64((const void*)i16v); i16v += 4; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16lo); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], i16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_1)(size_t n, const float f32v[], int16_t i16v[]) +{ + const float scale = (float)(32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi32hi = _mm_cvtps_epi32(pshi); + __m128i epi16 = _mm_packs_epi32(epi32lo, epi32hi); + _mm_storeu_si128((void*)i16v, epi16); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + _mm_storel_epi64((void*)i16v, epi16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = 
_mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + i16v[2] = _mm_extract_epi16(epi16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(i16_f32_k)(size_t n, const int16_t i16v[], float f32v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epi16 = _mm_loadu_si128((const void*)i16v); i16v += 8; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16); + epi16 = _mm_shuffle_epi32(epi16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epi32hi = _mm_cvtepi16_epi32(epi16); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + __m128 pshi = _mm_cvtepi32_ps(epi32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epi16lo = _mm_loadl_epi64((const void*)i16v); i16v += 4; + __m128i epi32lo = _mm_cvtepi16_epi32(epi16lo); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], i16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], i16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32(i16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_i16_k)(size_t n, const float f32v[], int16_t i16v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi32hi = _mm_cvtps_epi32(pshi); + __m128i epi16 = _mm_packs_epi32(epi32lo, epi32hi); + _mm_storeu_si128((void*)i16v, epi16); i16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + _mm_storel_epi64((void*)i16v, epi16lo); i16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + 
__m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + i16v[2] = _mm_extract_epi16(epi16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + i16v[1] = _mm_extract_epi16(epi16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epi16lo = _mm_packs_epi32(epi32lo, epi32lo); + i16v[0] = _mm_extract_epi16(epi16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32)(size_t n, const uint16_t u16v[], float f32v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epu16 = _mm_loadu_si128((const void*)u16v); u16v += 8; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16); + epu16 = _mm_shuffle_epi32(epu16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epu32hi = _mm_cvtepu16_epi32(epu16); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + __m128 pshi = _mm_cvtepi32_ps(epu32hi); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epu16lo = _mm_loadl_epi64((const void*)u16v); u16v += 4; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16lo); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], + (int32_t)(uint32_t)u16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16)(size_t n, const float f32v[], uint16_t u16v[]) +{ + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu32hi = _mm_cvtps_epi32(pshi); + __m128i epu16 = _mm_packus_epi32(epu32lo, epu32hi); + _mm_storeu_si128((void*)u16v, epu16); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epu32lo, epu32lo); + _mm_storel_epi64((void*)u16v, epu16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = 
(uint16_t)_mm_extract_epi16(epu16lo, 1); + u16v[2] = (uint16_t)_mm_extract_epi16(epu16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_1)(size_t n, const uint16_t u16v[], float f32v[]) +{ + const float scale = (float)(1. / 32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epu16 = _mm_loadu_si128((const void*)u16v); u16v += 8; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16); + epu16 = _mm_shuffle_epi32(epu16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epu32hi = _mm_cvtepu16_epi32(epu16); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + __m128 pshi = _mm_cvtepi32_ps(epu32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epu16lo = _mm_loadl_epi64((const void*)u16v); u16v += 4; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16lo); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], + (int32_t)(uint32_t)u16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_1)(size_t n, const float f32v[], uint16_t u16v[]) +{ + const float scale = (float)(32768.); + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu32hi = _mm_cvtps_epi32(pshi); + __m128i epu16 = _mm_packus_epi32(epu32lo, epu32hi); + _mm_storeu_si128((void*)u16v, epu16); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epu32lo, epu32lo); + _mm_storel_epi64((void*)u16v, epu16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + 
pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + u16v[2] = (uint16_t)_mm_extract_epi16(epu16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + break; + } + default: break; + } +} + + +void aymo_(u16_f32_k)(size_t n, const uint16_t u16v[], float f32v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128i epu16 = _mm_loadu_si128((const void*)u16v); u16v += 8; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16); + epu16 = _mm_shuffle_epi32(epu16, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i epu32hi = _mm_cvtepu16_epi32(epu16); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + __m128 pshi = _mm_cvtepi32_ps(epu32hi); + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + _mm_storeu_ps((void*)f32v, pshi); f32v += 4; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128i epu16lo = _mm_loadl_epi64((const void*)u16v); u16v += 4; + __m128i epu32lo = _mm_cvtepu16_epi32(epu16lo); + __m128 pslo = _mm_cvtepi32_ps(epu32lo); + pslo = _mm_mul_ps(pslo, psk); + _mm_storeu_ps((void*)f32v, pslo); f32v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], + (int32_t)(uint32_t)u16v[2], 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + f32v[2] = mm_extract_ps(pslo, 2); + break; + } + case 2: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], + (int32_t)(uint32_t)u16v[1], 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + f32v[1] = mm_extract_ps(pslo, 1); + break; + } + case 1: { + __m128i epi32lo = _mm_setr_epi32((int32_t)(uint32_t)u16v[0], 0, 0, 0); + __m128 pslo = _mm_cvtepi32_ps(epi32lo); + pslo = _mm_mul_ps(pslo, psk); + f32v[0] = mm_extract_ps(pslo, 0); + break; + } + default: break; + } +} + + +void aymo_(f32_u16_k)(size_t n, const float f32v[], uint16_t u16v[], float scale) +{ + __m128 psk = _mm_set1_ps(scale); + if (n >= 8) { + size_t nw = (n / 8); + do { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + __m128 pshi = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + pshi = _mm_mul_ps(pshi, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu32hi = _mm_cvtps_epi32(pshi); + __m128i epu16 = _mm_packus_epi32(epu32lo, epu32hi); + _mm_storeu_si128((void*)u16v, epu16); u16v += 8; + } while (--nw); + n %= 8; + if (n == 0) { + return; + } + } + if (n >= 4) { + __m128 pslo = _mm_loadu_ps((const void*)f32v); f32v += 4; + pslo = _mm_mul_ps(pslo, psk); + __m128i epu32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = 
_mm_packus_epi32(epu32lo, epu32lo); + _mm_storel_epi64((void*)u16v, epu16lo); u16v += 4; + n %= 4; + if (n == 0) { + return; + } + } + switch (n) { + case 3: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], f32v[2], .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + u16v[2] = (uint16_t)_mm_extract_epi16(epu16lo, 2); + break; + } + case 2: { + __m128 pslo = _mm_setr_ps(f32v[0], f32v[1], .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + u16v[1] = (uint16_t)_mm_extract_epi16(epu16lo, 1); + break; + } + case 1: { + __m128 pslo = _mm_setr_ps(f32v[0], .0f, .0f, .0f); + pslo = _mm_mul_ps(pslo, psk); + __m128i epi32lo = _mm_cvtps_epi32(pslo); + __m128i epu16lo = _mm_packus_epi32(epi32lo, epi32lo); + u16v[0] = (uint16_t)_mm_extract_epi16(epu16lo, 0); + break; + } + default: break; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/src/aymo_cpu.c b/src/aymo_cpu.c new file mode 100644 index 0000000..ffc2b1a --- /dev/null +++ b/src/aymo_cpu.c @@ -0,0 +1,38 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +void aymo_cpu_boot(void) +{ + #if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + aymo_cpu_x86_boot(); + #endif + + #if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + aymo_cpu_arm_boot(); + #endif +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_cpu_arm.c b/src/aymo_cpu_arm.c new file mode 100644 index 0000000..e8369fa --- /dev/null +++ b/src/aymo_cpu_arm.c @@ -0,0 +1,61 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
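All of the SSE4.1 and NEON conversion routines above share the same tail strategy: full-width groups first, then one half-width group, then a zero-padded vector for the last one to three samples. A short trace for the SSE4.1 i16-to-f32 routine with n == 11:

/* n == 11:
 *   nw = 11 / 8 = 1   -> one main-loop iteration converts samples 0..7
 *   n %= 8            -> 3, so the 4-sample branch is skipped
 *   switch (3)        -> samples 8..10 are widened into a zero-padded vector and stored lane by lane
 */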
+*/ + +#include "aymo_cpu.h" +#if (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) + +AYMO_CXX_EXTERN_C_BEGIN + + +static unsigned aymo_cpu_arm_extensions; + + +void aymo_cpu_arm_boot(void) +{ + unsigned mask = 0u; + +#ifdef AYMO_CPU_PRESUME_ARM_NEON + mask |= AYMO_CPU_ARM_EXT_NEON; +#endif +#ifdef AYMO_CPU_PRESUME_ARM_NEON64 + mask |= AYMO_CPU_ARM_EXT_NEON64; +#endif + + // FIXME: TODO: feature detection +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + mask |= AYMO_CPU_ARM_EXT_NEON; +#endif +#ifdef AYMO_CPU_SUPPORT_ARM_NEON64 + mask |= AYMO_CPU_ARM_EXT_NEON64; +#endif + + aymo_cpu_arm_extensions = mask; +} + + +unsigned aymo_cpu_arm_get_extensions(void) +{ + return aymo_cpu_arm_extensions; +} + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_ARM) || defined(AYMO_CPU_FAMILY_AARCH64)) diff --git a/src/aymo_cpu_x86.c b/src/aymo_cpu_x86.c new file mode 100644 index 0000000..5456c9f --- /dev/null +++ b/src/aymo_cpu_x86.c @@ -0,0 +1,119 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) + +#ifdef AYMO_CPU_HAVE_CPUINFO + #if defined(AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID) + #include + #elif defined(AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUID) + #include + #endif +#endif // AYMO_CPU_HAVE_CPUINFO + +AYMO_CXX_EXTERN_C_BEGIN + +#define AYMO_CPU_X86_CPUID_SSE (1uL << 25u) // edx[25] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE2 (1uL << 26u) // edx[26] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE3 (1uL << 0u) // ecx[ 0] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSSE3 (1uL << 9u) // ecx[ 9] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE41 (1uL << 19u) // ecx[19] @ leaf 1 +#define AYMO_CPU_X86_CPUID_SSE42 (1uL << 20u) // ecx[20] @ leaf 1 +#define AYMO_CPU_X86_CPUID_AVX (1uL << 28u) // ecx[28] @ leaf 1 +#define AYMO_CPU_X86_CPUID_AVX2 (1uL << 5u) // ebx[ 5] @ leaf 7.0 +#define AYMO_CPU_X86_CPUID_FMA (1uL << 12u) // ecx[12] @ leaf 1 + + +static unsigned aymo_cpu_x86_extensions; + + +void aymo_cpu_x86_boot(void) +{ + unsigned mask = 0u; + +#ifdef AYMO_CPU_PRESUME_X86_SSE + mask |= AYMO_CPU_X86_EXT_SSE; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE2 + mask |= AYMO_CPU_X86_EXT_SSE2; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE3 + mask |= AYMO_CPU_X86_EXT_SSE3; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSSE3 + mask |= AYMO_CPU_X86_EXT_SSSE3; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE41 + mask |= AYMO_CPU_X86_EXT_SSE41; +#endif +#ifdef AYMO_CPU_PRESUME_X86_SSE42 + mask |= AYMO_CPU_X86_EXT_SSE42; +#endif +#ifdef AYMO_CPU_PRESUME_X86_AVX + mask |= AYMO_CPU_X86_EXT_AVX; +#endif +#ifdef AYMO_CPU_PRESUME_X86_AVX2 + mask |= AYMO_CPU_X86_EXT_AVX2; +#endif +#ifdef AYMO_CPU_PRESUME_X86_FMA3 + mask |= AYMO_CPU_X86_EXT_FMA3; +#endif + +#ifdef AYMO_CPU_HAVE_CPUINFO + unsigned e1[4] = { 0u, 0u, 0u, 0u }; + unsigned e7[4] = { 0u, 0u, 0u, 0u }; + + #if defined(AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID) + __cpuid(1u, 
e1[0], e1[1], e1[2], e1[3]); + #elif defined(AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUID) + __cpuid((int*)e1, 1); + #endif + + #if defined(AYMO_CPU_HAVE_CPUINFO_CPUID_H_CPUID_COUNT) + __cpuid_count(7u, 0u, e7[0], e7[1], e7[2], e7[3]); + #elif defined(AYMO_CPU_HAVE_CPUINFO_INTRIN_H_CPUIDEX) + __cpuidex((int*)e7, 7, 0); + #endif + + if (e1[3] & AYMO_CPU_X86_CPUID_SSE ) { mask |= AYMO_CPU_X86_EXT_SSE; } + if (e1[3] & AYMO_CPU_X86_CPUID_SSE2 ) { mask |= AYMO_CPU_X86_EXT_SSE2; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSE3 ) { mask |= AYMO_CPU_X86_EXT_SSE3; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSSE3) { mask |= AYMO_CPU_X86_EXT_SSSE3; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSE41) { mask |= AYMO_CPU_X86_EXT_SSE41; } + if (e1[2] & AYMO_CPU_X86_CPUID_SSE42) { mask |= AYMO_CPU_X86_EXT_SSE42; } + if (e1[2] & AYMO_CPU_X86_CPUID_AVX ) { mask |= AYMO_CPU_X86_EXT_AVX; } + if (e7[1] & AYMO_CPU_X86_CPUID_AVX2 ) { mask |= AYMO_CPU_X86_EXT_AVX2; } + if (e1[2] & AYMO_CPU_X86_CPUID_FMA ) { mask |= AYMO_CPU_X86_EXT_FMA3; } +#endif // AYMO_CPU_HAVE_CPUINFO + + aymo_cpu_x86_extensions = mask; +} + + +unsigned aymo_cpu_x86_get_extensions(void) +{ + return aymo_cpu_x86_extensions; +} + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64)) diff --git a/src/aymo_empty.c b/src/aymo_empty.c new file mode 100644 index 0000000..439e361 --- /dev/null +++ b/src/aymo_empty.c @@ -0,0 +1 @@ +/* Just an empty file to make Meson happy :-) */ diff --git a/src/aymo_file.c b/src/aymo_file.c new file mode 100644 index 0000000..a0e7b83 --- /dev/null +++ b/src/aymo_file.c @@ -0,0 +1,133 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
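A standalone sketch of how the cached extension mask can drive backend selection; AYMO's own conversion front end presumably does this internally, and the AVX2 and plain-C function names below are assumed by analogy with the aymo_convert_x86_sse41_* symbols used earlier in this patch:

#include <stddef.h>
#include <stdint.h>
#include "aymo_cpu.h"
#include "aymo_convert_none.h"
#include "aymo_convert_x86_sse41.h"
#include "aymo_convert_x86_avx2.h"

typedef void (*i16_f32_fn)(size_t n, const int16_t i16v[], float f32v[]);

static i16_f32_fn pick_i16_f32(void)
{
    aymo_cpu_boot();  /* fills the extension mask queried below */
#if (defined(AYMO_CPU_FAMILY_X86) || defined(AYMO_CPU_FAMILY_X86_64))
    unsigned ext = aymo_cpu_x86_get_extensions();
    if (ext & AYMO_CPU_X86_EXT_AVX2)  { return aymo_convert_x86_avx2_i16_f32; }   /* assumed name */
    if (ext & AYMO_CPU_X86_EXT_SSE41) { return aymo_convert_x86_sse41_i16_f32; }
#endif
    return aymo_convert_none_i16_f32;  /* assumed name */
}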
+*/
+
+#include "aymo_file.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+AYMO_CXX_EXTERN_C_BEGIN
+
+
+static unsigned char aymo_file_chunk[AYMO_FILE_CHUNK_SIZE];
+
+
+int aymo_file_save(const char* pathp, const void* datap, size_t size)
+{
+    const char* chunkp = (const char*)datap;
+    FILE* filep = (FILE*)NULL;
+    size_t total = 0u;
+    size_t subsize = 0u;
+
+    assert(pathp != NULL);
+    assert(*pathp != '\0');
+    if (datap == NULL) {
+        size = 0u;
+    }
+
+    filep = fopen(pathp, "wb");
+    if (filep == NULL) {
+        perror("fopen()");
+        goto error_;
+    }
+
+    while (total < size) {
+        subsize = fwrite(chunkp, 1u, (size - total), filep);
+        if (subsize == 0u) {
+            perror("fwrite()");
+            goto error_;
+        }
+
+        total += subsize;
+        chunkp += subsize;
+    }
+    (void)fclose(filep);
+    return 0;
+
+error_:
+    if (filep != NULL) {
+        (void)fclose(filep);
+    }
+    return 1;
+}
+
+
+int aymo_file_load(const char* pathp, void** datapp, size_t* sizep)
+{
+    FILE* filep = (FILE*)NULL;
+    void* datap = NULL;
+    void* newp = NULL;
+    size_t total = 0u;
+    size_t subsize = 0u;
+
+    assert(pathp != NULL);
+    assert(*pathp != '\0');
+    assert(datapp != NULL);
+    assert(sizep != NULL);
+
+    *datapp = NULL;
+    *sizep = 0U;
+
+    filep = fopen(pathp, "rb");
+    if (filep == NULL) {
+        perror("fopen()");
+        goto error_;
+    }
+
+    datap = malloc(1u);
+    if (datap == NULL) {
+        perror("malloc()");
+        goto error_;
+    }
+
+    while (!feof(filep)) {
+        subsize = fread(&aymo_file_chunk[0], 1u, AYMO_FILE_CHUNK_SIZE, filep);
+        if (subsize == 0u) {
+            if (ferror(filep)) {
+                perror("fread()");
+                goto error_;
+            }
+            break;  // regular end-of-file
+        }
+
+        newp = realloc(datap, (total + subsize));
+        if (newp == NULL) {
+            perror("realloc()");
+            goto error_;
+        }
+        datap = newp;
+
+        (void)memcpy((unsigned char*)datap + total, &aymo_file_chunk[0], subsize);
+
+        total += subsize;
+    }
+    (void)fclose(filep);
+    *datapp = datap;
+    *sizep = total;
+    return 0;
+
+error_:
+    if (filep != NULL) {
+        (void)fclose(filep);
+    }
+    aymo_file_unload(datap);
+    return 1;
+}
+
+
+void aymo_file_unload(void* datap)
+{
+    if (datap) {
+        free(datap);
+    }
+}
+
+
+AYMO_CXX_EXTERN_C_END
diff --git a/src/aymo_score.c b/src/aymo_score.c
new file mode 100644
index 0000000..019a3fb
--- /dev/null
+++ b/src/aymo_score.c
@@ -0,0 +1,153 @@
+/*
+AYMO - Accelerated YaMaha Operator
+Copyright (c) 2023-2024 Andrea Zoppi.
+
+This file is part of AYMO.
+
+AYMO is free software: you can redistribute it and/or modify it under the
+terms of the GNU Lesser General Public License as published by the Free
+Software Foundation, either version 2.1 of the License, or (at your option)
+any later version.
+
+AYMO is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with AYMO. If not, see <https://www.gnu.org/licenses/>.
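A minimal usage sketch for the file helpers above (the file path is hypothetical; error handling is reduced to the return code):

#include <stddef.h>
#include "aymo_file.h"

static int demo_load(void)
{
    void* data = NULL;
    size_t size = 0u;
    if (aymo_file_load("score.dro", &data, &size)) {
        return 1;  /* fopen/fread/realloc failures were already reported via perror() */
    }
    /* ... hand (data, size) to a score loader ... */
    aymo_file_unload(data);
    return 0;
}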
+*/ + +#include "aymo_score.h" +#include "aymo_score_avd.h" +#include "aymo_score_dro.h" +#include "aymo_score_imf.h" +#include "aymo_score_raw.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +int aymo_score_ctor( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + return score->vt->ctor(score); +} + + +void aymo_score_dtor( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + score->vt->dtor(score); +} + + +int aymo_score_load( + struct aymo_score_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + assert(score->vt); + return score->vt->load(score, data, size); +} + + +void aymo_score_unload( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + score->vt->unload(score); +} + + +struct aymo_score_status* aymo_score_get_status( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + return score->vt->get_status(score); +} + + +void aymo_score_restart( + struct aymo_score_instance* score +) +{ + assert(score); + assert(score->vt); + score->vt->restart(score); +} + + +uint32_t aymo_score_tick( + struct aymo_score_instance* score, + uint32_t count +) +{ + assert(score); + assert(score->vt); + return score->vt->tick(score, count); +} + + +enum aymo_score_type aymo_score_ext_to_type( + const char *tag +) +{ + if (tag != NULL) { + if (((tag[0] == 'A') || (tag[0] == 'a')) && + ((tag[1] == 'V') || (tag[1] == 'v')) && + ((tag[2] == 'D') || (tag[2] == 'd')) && + (tag[3] == '\0')) { + return aymo_score_type_avd; + } + if (((tag[0] == 'D') || (tag[0] == 'd')) && + ((tag[1] == 'R') || (tag[1] == 'r')) && + ((tag[2] == 'O') || (tag[2] == 'o')) && + (tag[3] == '\0')) { + return aymo_score_type_dro; + } + if (((tag[0] == 'I') || (tag[0] == 'i')) && + ((tag[1] == 'M') || (tag[1] == 'm')) && + ((tag[2] == 'F') || (tag[2] == 'f')) && + (tag[3] == '\0')) { + return aymo_score_type_imf; + } + if (((tag[0] == 'R') || (tag[0] == 'r')) && + ((tag[1] == 'A') || (tag[1] == 'a')) && + ((tag[2] == 'W') || (tag[2] == 'w')) && + (tag[3] == '\0')) { + return aymo_score_type_raw; + } + } + return aymo_score_type_unknown; +} + + +const struct aymo_score_vt* aymo_score_type_to_vt( + enum aymo_score_type score_type +) +{ + switch (score_type) { + case aymo_score_type_avd: return &aymo_score_avd_vt; + case aymo_score_type_dro: return &aymo_score_dro_vt; + case aymo_score_type_imf: return &aymo_score_imf_vt; + case aymo_score_type_raw: return &aymo_score_raw_vt; + default: return NULL; + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_avd.c b/src/aymo_score_avd.c new file mode 100644 index 0000000..d3606a6 --- /dev/null +++ b/src/aymo_score_avd.c @@ -0,0 +1,174 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
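For instance, resolving a player from a file name extension is a two-step lookup over the functions defined above:

#include <stddef.h>
#include "aymo_score.h"

/* "dro" in any letter case resolves to &aymo_score_dro_vt; unknown tags yield NULL. */
static const struct aymo_score_vt* vt_for_ext(const char* ext)
{
    enum aymo_score_type type = aymo_score_ext_to_type(ext);
    return aymo_score_type_to_vt(type);
}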
+*/ + +#include "aymo_score_avd.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_avd_vt = { + "aymo_score_avd", + (aymo_score_ctor_f)aymo_score_avd_ctor, + (aymo_score_dtor_f)aymo_score_avd_dtor, + (aymo_score_load_f)aymo_score_avd_load, + (aymo_score_unload_f)aymo_score_avd_unload, + (aymo_score_get_status_f)aymo_score_avd_get_status, + (aymo_score_restart_f)aymo_score_avd_restart, + (aymo_score_tick_f)aymo_score_avd_tick +}; + + +int aymo_score_avd_ctor( + struct aymo_score_avd_instance* score +) +{ + assert(score); + + score->vt = &aymo_score_avd_vt; + + score->events = NULL; + score->length = 0u; + aymo_score_avd_restart(score); + return 0; +} + + +void aymo_score_avd_dtor( + struct aymo_score_avd_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_avd_load( + struct aymo_score_avd_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + + uint32_t length = (size / sizeof(struct aymo_score_avd_event)); + assert(!length || data); + + score->events = (const struct aymo_score_avd_event*)data; + score->length = length; + aymo_score_avd_restart(score); + return 0; +} + + +void aymo_score_avd_unload( + struct aymo_score_avd_instance* score +) +{ + aymo_score_avd_ctor(score); +} + + +struct aymo_score_status* aymo_score_avd_get_status( + struct aymo_score_avd_instance* score +) +{ + assert(score); + return &score->status; +} + + +void aymo_score_avd_restart( + struct aymo_score_avd_instance* score +) +{ + assert(score); + + score->index = 0u; + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->index >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +uint32_t aymo_score_avd_tick( + struct aymo_score_avd_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->index < score->length) { + const struct aymo_score_avd_event* event = &score->events[score->index++]; + + if (event->address_hi & 0x80u) { // delay tag + uint32_t delay = (((uint32_t)(event->address_hi & 0x7Fu) << 16u) | + ((uint32_t)event->address_lo << 8u) | event->value); + if (delay) { + score->status.delay = delay; + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + } + else { + score->status.address = (((uint16_t)event->address_hi << 8u) | event->address_lo); + score->status.value = event->value; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_dro.c b/src/aymo_score_dro.c new file mode 100644 index 0000000..e9ab2db --- /dev/null +++ b/src/aymo_score_dro.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
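As a worked example of the delay encoding handled by aymo_score_avd_tick() above, a 1000-tick (0x0003E8) pause packs into a single event; the field names match those read by the decoder, while building an event literal like this is only illustrative:

#include <stdint.h>
#include "aymo_score_avd.h"

static const struct aymo_score_avd_event avd_pause_1000 = {
    .address_hi = (0x80u | ((1000u >> 16) & 0x7Fu)),  /* 0x80: the top bit tags a delay */
    .address_lo = ((1000u >> 8) & 0xFFu),             /* 0x03 */
    .value      = (1000u & 0xFFu)                     /* 0xE8 */
};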
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_score_dro.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_dro_vt = { + "aymo_score_dro", + (aymo_score_ctor_f)aymo_score_dro_ctor, + (aymo_score_dtor_f)aymo_score_dro_dtor, + (aymo_score_load_f)aymo_score_dro_load, + (aymo_score_unload_f)aymo_score_dro_unload, + (aymo_score_get_status_f)aymo_score_dro_get_status, + (aymo_score_restart_f)aymo_score_dro_restart, + (aymo_score_tick_f)aymo_score_dro_tick +}; + + +static inline uint16_t make_u16le(uint8_t lo, uint8_t hi) +{ + return (uint16_t)((unsigned)lo | ((unsigned)hi << 8u)); +} + + +int aymo_score_dro_ctor_specific( + struct aymo_score_dro_instance* score, + uint32_t opl_rate +) +{ + assert(score); + assert(opl_rate); + + uint32_t division = (opl_rate / 1000u); // TODO: improve resolution via fixed point 24.8 + division += (uint32_t)(division == 0u); + + score->vt = &aymo_score_dro_vt; + + score->header = NULL; + score->v1_header = NULL; + score->v2_header = NULL; + score->codemap = NULL; + score->events = NULL; + + score->opl_rate = opl_rate; + score->division = division; + score->length = 0u; + score->offset = 0u; + score->address_hi = 0u; + + aymo_score_dro_restart(score); + return 0; +} + + +int aymo_score_dro_ctor( + struct aymo_score_dro_instance* score +) +{ + return aymo_score_dro_ctor_specific(score, AYMO_SCORE_OPL_RATE_DEFAULT); +} + + +void aymo_score_dro_dtor( + struct aymo_score_dro_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_dro_load( + struct aymo_score_dro_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + assert(data); + assert(size); + + score->header = NULL; + score->v1_header = NULL; + score->v2_header = NULL; + score->codemap = NULL; + score->events = NULL; + score->length = 0u; + + aymo_score_dro_restart(score); + + if (size < sizeof(struct aymo_score_dro_header)) { + return 1; + } + const uint8_t* ptr = (const uint8_t*)data; + const struct aymo_score_dro_header* header = NULL; + header = (const struct aymo_score_dro_header*)(const void*)ptr; + ptr += sizeof(struct aymo_score_dro_header); + size -= sizeof(struct aymo_score_dro_header); + const struct aymo_score_dro_v1_header* v1_header = NULL; + const struct aymo_score_dro_v2_header* v2_header = NULL; + const uint8_t* codemap = NULL; + const uint8_t* events = NULL; + uint32_t length = 0u; + + for (unsigned i = 0u; i < 8u; ++i) { + if (header->signature[i] != AYMO_DRO_SIGNATURE[i]) { + return 1; + } + } + + if ((((header->version_major == 0u) && (header->version_minor == 1u)) || + ((header->version_major == 1u) && (header->version_minor == 0u)))) { + if (size < sizeof(struct aymo_score_dro_v1_header)) { + return 1; + } + v1_header = (const struct aymo_score_dro_v1_header*)(const void*)ptr; + ptr += sizeof(struct aymo_score_dro_v1_header); + size -= sizeof(struct aymo_score_dro_v1_header); + if ((v1_header->hardware_extra[0] || + v1_header->hardware_extra[1] 
|| + v1_header->hardware_extra[2])) { + ptr -= 3u; + } + events = ptr; + length = v1_header->length_bytes; + } + else if ((header->version_major == 2u) && (header->version_minor == 0u)) { + if (size < sizeof(struct aymo_score_dro_v1_header)) { + return 1; + } + v2_header = (const struct aymo_score_dro_v2_header*)(const void*)ptr; + ptr += sizeof(struct aymo_score_dro_v2_header); + size -= sizeof(struct aymo_score_dro_v2_header); + if (v2_header->format != (uint8_t)aymo_score_dro_v2_format_interleaved) { + return 1; + } + if (v2_header->codemap_length > 128u) { + return 1; + } + if (size < v2_header->codemap_length) { + return 1; + } + codemap = ptr; + ptr += v2_header->codemap_length; + size -= v2_header->codemap_length; + events = ptr; + length = (v2_header->length_pairs * sizeof(struct aymo_score_dro_pair)); + } + else { + return 1; + } + + score->header = header; + score->v1_header = v1_header; + score->v2_header = v2_header; + score->codemap = codemap; + score->events = events; + score->length = length; + + aymo_score_dro_restart(score); + return 0; +} + + +void aymo_score_dro_unload( + struct aymo_score_dro_instance* score +) +{ + aymo_score_dro_restart(score); +} + + +void aymo_score_dro_restart( + struct aymo_score_dro_instance* score +) +{ + assert(score); + + score->offset = 0u; + score->address_hi = 0u; + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->offset >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +static void aymo_score_dro_decode_v1( + struct aymo_score_dro_instance* score +) +{ + const uint8_t* ptr = &(score->events[score->offset]); + + switch ((enum aymo_score_dro_v1_code)ptr[0]) { + case aymo_score_dro_v1_code_delay_byte: { + if ((score->offset + 1u) <= score->length) { + score->status.delay = ((ptr[1] + 1uL) * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + score->offset += 2u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + score->offset = score->length; + } + break; + } + case aymo_score_dro_v1_code_delay_word: { + if ((score->offset + 2u) <= score->length) { + score->status.delay = ((make_u16le(ptr[1], ptr[2]) + 1uL) * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + score->offset += 3u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + } + break; + } + case aymo_score_dro_v1_code_switch_low: { + score->address_hi = 0u; + score->offset += 1u; + break; + } + case aymo_score_dro_v1_code_switch_high: { + score->address_hi = 1u; + score->offset += 1u; + break; + } + case aymo_score_dro_v1_code_escape: { + if ((score->offset + 2u) <= score->length) { + score->status.address = make_u16le(ptr[1], score->address_hi); + score->status.value = ptr[2]; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + score->offset += 3u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + } + break; + } + case aymo_score_dro_v1_code_invalid: + default: { + if ((score->offset + 2u) <= score->length) { + score->status.address = make_u16le(ptr[0], score->address_hi); + score->status.value = ptr[1]; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + score->offset += 2u; + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + } + break; + } + } +} + + +static void aymo_score_dro_decode_v2( + struct aymo_score_dro_instance* score +) +{ + const struct aymo_score_dro_v2_header *v2_header = score->v2_header; + const uint8_t* 
ptr = &(score->events[score->offset]); + + if (ptr[0] == v2_header->short_delay_code) { + score->status.delay = (ptr[1] + 1uL); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (ptr[0] == v2_header->long_delay_code) { + score->status.delay = ((ptr[1] + 1uL) * 256u); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if ((ptr[0] & 0xFFu) < v2_header->codemap_length) { + score->address_hi = ((ptr[0] & 0x80u) >> 7u); + uint8_t address_lo = score->codemap[ptr[0] & 0xFFu]; + score->status.address = make_u16le(address_lo, score->address_hi); + score->status.value = ptr[1]; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + } + score->offset += 2u; +} + + +struct aymo_score_status* aymo_score_dro_get_status( + struct aymo_score_dro_instance* score +) +{ + assert(score); + return &score->status; +} + + +uint32_t aymo_score_dro_tick( + struct aymo_score_dro_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->offset < score->length) { + if (score->v2_header) { + aymo_score_dro_decode_v2(score); + } + else if (score->v1_header) { + aymo_score_dro_decode_v1(score); + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + score->offset = score->length; + break; + } + + if (score->status.flags & AYMO_SCORE_FLAG_EVENT) { + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_imf.c b/src/aymo_score_imf.c new file mode 100644 index 0000000..ec0d1f3 --- /dev/null +++ b/src/aymo_score_imf.c @@ -0,0 +1,266 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_score_imf.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_imf_vt = { + "aymo_score_imf", + (aymo_score_ctor_f)aymo_score_imf_ctor, + (aymo_score_dtor_f)aymo_score_imf_dtor, + (aymo_score_load_f)aymo_score_imf_load, + (aymo_score_unload_f)aymo_score_imf_unload, + (aymo_score_get_status_f)aymo_score_imf_get_status, + (aymo_score_restart_f)aymo_score_imf_restart, + (aymo_score_tick_f)aymo_score_imf_tick +}; + + +// See: https://moddingwiki.shikadi.net/wiki/IMF_Format +uint8_t aymo_score_imf_guess_type( + const void* data, + uint32_t size +) +{ + assert(data); + + if (size < 2u) { + return 0u; + } + + const uint8_t* ptr = (const uint8_t *)data; + uint16_t word = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + ptr += 2u; + if (!word || (word & 3u)) { + return 0u; + } + + uint32_t sum1 = 0u; + uint32_t sum2 = 0u; + uint16_t i = 42u; + + while ((size >= 4u) && i--) + { + word = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + ptr += 2u; + sum1 += word; + + word = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + ptr += 2u; + sum2 += word; + } + return (uint8_t)(sum1 > sum2); +} + + +int aymo_score_imf_ctor_specific( + struct aymo_score_imf_instance* score, + uint32_t imf_rate, + uint32_t opl_rate +) +{ + assert(score); + assert(opl_rate); + assert(imf_rate); + + uint32_t division = (opl_rate / imf_rate); // TODO: improve resolution via fixed point 24.8 + division += (uint32_t)(division == 0u); + + score->vt = &aymo_score_imf_vt; + + score->events = NULL; + score->imf_rate = imf_rate; + score->opl_rate = opl_rate; + score->division = division; + score->length = 0u; + score->type = 0u; + score->address_hi = 0u; + + aymo_score_imf_restart(score); + return 0; +} + + +int aymo_score_imf_ctor( + struct aymo_score_imf_instance* score +) +{ + return aymo_score_imf_ctor_specific(score, aymo_score_imf_rate_default, AYMO_SCORE_OPL_RATE_DEFAULT); +} + + +void aymo_score_imf_dtor( + struct aymo_score_imf_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_imf_load_specific( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size, + uint8_t type +) +{ + assert(score); + assert(data); + assert(size); + + score->type = type; + + if (type) { + const uint8_t* ptr = (const uint8_t*)data; + uint32_t length_by_header = (ptr[0] | ((uint16_t)ptr[1] << 8u)); + length_by_header /= sizeof(struct aymo_score_imf_event); + score->length = length_by_header; + score->events = (const struct aymo_score_imf_event*)(const void*)&ptr[2]; + + uint32_t length_by_size = (uint32_t)(size - 2); + length_by_size /= sizeof(struct aymo_score_imf_event); + if (score->length > length_by_size) { + score->length = length_by_size; + } + } + else { + uint32_t length_by_size = (uint32_t)size; + length_by_size /= sizeof(struct aymo_score_imf_event); + score->length = length_by_size; + score->events = (const struct aymo_score_imf_event*)data; + } + + aymo_score_imf_restart(score); + return 0; +} + + +int aymo_score_imf_load( + struct aymo_score_imf_instance* score, + const void* data, + uint32_t size +) +{ + uint8_t type = aymo_score_imf_guess_type(data, size); + return aymo_score_imf_load_specific(score, data, size, type); +} + + +void aymo_score_imf_unload( + struct aymo_score_imf_instance* score +) +{ + aymo_score_imf_restart(score); +} + + +struct aymo_score_status* aymo_score_imf_get_status( + struct aymo_score_imf_instance* score +) +{ + assert(score); + return &score->status; +} + + +void aymo_score_imf_restart( + struct aymo_score_imf_instance* score +) +{ 
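+    // Rewind playback: reset the event index and the OPL high-address latch,
+    // and clear any pending delay/event status. An empty score raises
+    // AYMO_SCORE_FLAG_EOF right away, so the first tick() reports end-of-score.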
+ assert(score); + + score->index = 0u; + score->address_hi = 0u; + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->index >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +uint32_t aymo_score_imf_tick( + struct aymo_score_imf_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->index < score->length) { + const struct aymo_score_imf_event* event = &score->events[score->index++]; + + uint16_t delay = (((uint16_t)event->delay_hi << 8u) | event->delay_lo); + if (delay) { + score->status.delay = (delay * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + + // Override virtual register 0x05 to extend the address range for OPL3 + if AYMO_UNLIKELY(event->address_lo == 0x05u) { + score->address_hi = (event->value & 0x01u); + } + else { + score->status.address = ((uint16_t)(score->address_hi << 8u) | event->address_lo); + score->status.value = event->value; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_score_raw.c b/src/aymo_score_raw.c new file mode 100644 index 0000000..53bd148 --- /dev/null +++ b/src/aymo_score_raw.c @@ -0,0 +1,231 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_score_raw.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_score_vt aymo_score_raw_vt = { + "aymo_score_raw", + (aymo_score_ctor_f)aymo_score_raw_ctor, + (aymo_score_dtor_f)aymo_score_raw_dtor, + (aymo_score_load_f)aymo_score_raw_load, + (aymo_score_unload_f)aymo_score_raw_unload, + (aymo_score_get_status_f)aymo_score_raw_get_status, + (aymo_score_restart_f)aymo_score_raw_restart, + (aymo_score_tick_f)aymo_score_raw_tick +}; + + +static void aymo_score_raw_update_clock( + struct aymo_score_raw_instance* score +) +{ + score->clock += (uint16_t)(score->clock == 0u); + score->raw_rate = (AYMO_SCORE_RAW_REFCLK / score->clock); + score->division = (AYMO_SCORE_OPL_RATE_DEFAULT / score->raw_rate); // TODO: improve resolution via fixed point 24.8 + score->division += (uint32_t)(score->division == 0u); +} + + +int aymo_score_raw_ctor( + struct aymo_score_raw_instance* score +) +{ + assert(score); + + score->vt = &aymo_score_raw_vt; + + score->events = NULL; + score->raw_rate = AYMO_SCORE_RAW_REFCLK; + score->division = 1u; + score->length = 0u; + score->address_hi = 0u; + + aymo_score_raw_restart(score); + return 0; +} + + +void aymo_score_raw_dtor( + struct aymo_score_raw_instance* score +) +{ + AYMO_UNUSED_VAR(score); + assert(score); +} + + +int aymo_score_raw_load( + struct aymo_score_raw_instance* score, + const void* data, + uint32_t size +) +{ + assert(score); + assert(data); + assert(size); + + if (size < sizeof(struct aymo_score_raw_header)) { + return 1; + } + const uint8_t* ptr = (const uint8_t*)data; + + if (((ptr[0] != 'R') || + (ptr[1] != 'A') || + (ptr[2] != 'W') || + (ptr[3] != 'A') || + (ptr[4] != 'D') || + (ptr[5] != 'A') || + (ptr[6] != 'T') || + (ptr[7] != 'A'))) { + return 1; + } + score->clock_initial = *(const uint16_t*)(const void*)&ptr[8]; + score->events = (const struct aymo_score_raw_event*)(const void*)&ptr[10]; + + uint32_t length_by_size = (uint32_t)(size - sizeof(struct aymo_score_raw_header)); + length_by_size /= sizeof(struct aymo_score_raw_event); + if (score->length > length_by_size) { + score->length = length_by_size; + } + + aymo_score_raw_restart(score); + return 0; +} + + +void aymo_score_raw_unload( + struct aymo_score_raw_instance* score +) +{ + aymo_score_raw_restart(score); +} + + +struct aymo_score_status* aymo_score_raw_get_status( + struct aymo_score_raw_instance* score +) +{ + assert(score); + return &score->status; +} + + +void aymo_score_raw_restart( + struct aymo_score_raw_instance* score +) +{ + assert(score); + + score->index = 0u; + score->address_hi = 0u; + score->clock = score->clock_initial; + aymo_score_raw_update_clock(score); + + score->status.delay = 0u; + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->index >= score->length) { + score->status.flags |= AYMO_SCORE_FLAG_EOF; + } +} + + +uint32_t aymo_score_raw_tick( + struct aymo_score_raw_instance* score, + uint32_t count +) +{ + assert(score); + assert(!score->length || score->events); + + uint32_t pending = count; + + do { + if (pending >= score->status.delay) { + pending -= score->status.delay; + score->status.delay = 0u; + } + else { + score->status.delay -= pending; + pending = 0u; + } + + score->status.address = 0u; + score->status.value = 0u; + score->status.flags = 0u; + + if (score->status.delay) { + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + else if (score->index < score->length) { + const struct aymo_score_raw_event* event = &score->events[score->index++]; + + if (event->ctrl == 
0x00u) { + uint8_t delay = event->data; + if (delay) { + score->status.delay = (delay * score->division); + score->status.flags = AYMO_SCORE_FLAG_DELAY; + } + } + else if (event->ctrl == 0x02u) { + if (event->ctrl == 0x00u) { + if ((score->index + 1u) < score->length) { + score->index++; + score->clock = *(const uint16_t*)(void*)++event; + aymo_score_raw_update_clock(score); + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } + else if (event->ctrl == 0x01u) { + score->address_hi = 0u; + } + else if (event->ctrl == 0x02u) { + score->address_hi = 1u; + } + } + else { + score->status.address = ((uint16_t)(score->address_hi << 8u) | event->ctrl); + score->status.value = event->data; + score->status.flags = AYMO_SCORE_FLAG_EVENT; + count -= pending; // FIXME: what if another event follows immediately? --> count -= CONSUMED + break; + } + } + else { + score->status.flags = AYMO_SCORE_FLAG_EOF; + break; + } + } while (pending); + + return count; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_sys_linux.c b/src/aymo_sys_linux.c new file mode 100644 index 0000000..01d943d --- /dev/null +++ b/src/aymo_sys_linux.c @@ -0,0 +1,19 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ diff --git a/src/aymo_sys_windows.c b/src/aymo_sys_windows.c new file mode 100644 index 0000000..902a949 --- /dev/null +++ b/src/aymo_sys_windows.c @@ -0,0 +1,71 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cc.h" +#if (defined(AYMO_CC_HOST_WINDOWS) || defined(AYMO_CC_HOST_CYGWIN)) + +#define WIN32_LEAN_AND_MEAN 1 +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +BOOL WINAPI DllMain( + _In_ HINSTANCE hinstDLL, + _In_ DWORD fdwReason, + _In_ LPVOID lpvReserved +) +{ + // Perform actions based on the reason for calling. + switch (fdwReason) + { + case DLL_PROCESS_ATTACH: { + // Initialize once for each new process. + // Return FALSE to fail DLL load. + + // Thread optimization. + DisableThreadLibraryCalls(hinstDLL); + break; + } + case DLL_THREAD_ATTACH: { + // Do thread-specific initialization. + break; + } + case DLL_THREAD_DETACH: { + // Do thread-specific cleanup. + break; + } + case DLL_PROCESS_DETACH: { + if (lpvReserved) { + // Do not do cleanup if process termination scenario. 
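+            // lpvReserved is non-NULL when the whole process is terminating:
+            // the OS reclaims all resources and dependent DLLs may already be
+            // unloaded, so skipping explicit cleanup here is the safe choice.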
+ break; + } + // Perform any necessary cleanup. + break; + } + default: break; + } + return TRUE; // Successful DLL_PROCESS_ATTACH. +} + + +AYMO_CXX_EXTERN_C_END + +#endif // (defined(AYMO_CC_HOST_WINDOWS) || defined(AYMO_CC_HOST_CYGWIN)) diff --git a/src/aymo_tda8425.c b/src/aymo_tda8425.c new file mode 100644 index 0000000..41981ff --- /dev/null +++ b/src/aymo_tda8425.c @@ -0,0 +1,172 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#include "aymo_tda8425.h" +#include "aymo_tda8425_arm_neon.h" +#include "aymo_tda8425_none.h" +#include "aymo_tda8425_x86_avx2.h" +#include "aymo_tda8425_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_tda8425_math* aymo_tda8425_math; + +static const struct aymo_tda8425_vt* aymo_tda8425_best_vt; + + +void aymo_tda8425_boot(const struct aymo_tda8425_math* math) +{ + assert(math); + + aymo_tda8425_math = math; + + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + aymo_tda8425_best_vt = aymo_tda8425_x86_avx2_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_tda8425_best_vt = aymo_tda8425_x86_sse41_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_tda8425_best_vt = aymo_tda8425_arm_neon_get_vt(); + return; + } + #endif + + aymo_tda8425_best_vt = aymo_tda8425_none_get_vt(); +} + + +const struct aymo_tda8425_vt* aymo_tda8425_get_vt(const char* cpu_ext) +{ + if (cpu_ext == NULL) { + return NULL; + } + + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (!aymo_strcmp(cpu_ext, "x86_avx2")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + return aymo_tda8425_x86_avx2_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (!aymo_strcmp(cpu_ext, "x86_sse41")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + return aymo_tda8425_x86_sse41_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (!aymo_strcmp(cpu_ext, "arm_neon")) { + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + return aymo_tda8425_arm_neon_get_vt(); + } + } + #endif + + if (!aymo_strcmp(cpu_ext, "none")) { + return aymo_tda8425_none_get_vt(); + } + return NULL; +} + + +const struct aymo_tda8425_vt* aymo_tda8425_get_best_vt(void) +{ + return aymo_tda8425_best_vt; +} + + +uint32_t aymo_tda8425_get_sizeof(struct aymo_tda8425_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_sizeof); + + return chip->vt->get_sizeof(); +} + + +void aymo_tda8425_ctor(struct aymo_tda8425_chip* chip, float sample_rate) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->ctor); + + chip->vt->ctor(chip, sample_rate); +} + + +void aymo_tda8425_dtor(struct aymo_tda8425_chip* chip) +{ + assert(chip); + 
assert(chip->vt); + assert(chip->vt->dtor); + + chip->vt->dtor(chip); +} + + +uint8_t aymo_tda8425_read(struct aymo_tda8425_chip* chip, uint16_t address) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->read); + + return chip->vt->read(chip, address); +} + + +void aymo_tda8425_write(struct aymo_tda8425_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->write); + + chip->vt->write(chip, address, value); +} + + +void aymo_tda8425_process_f32(struct aymo_tda8425_chip* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->process_f32); + + chip->vt->process_f32(chip, count, x, y); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_tda8425_arm_neon.c b/src/aymo_tda8425_arm_neon.c new file mode 100644 index 0000000..c2a56b6 --- /dev/null +++ b/src/aymo_tda8425_arm_neon.c @@ -0,0 +1,504 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_arm_neon.h" +#include "aymo_tda8425.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +#undef cos +#undef fabs +#undef log10 +#undef pow +#undef sqrt +#undef tan + +#define cos (aymo_tda8425_math->cos) +#define fabs (aymo_tda8425_math->fabs) +#define log10 (aymo_tda8425_math->log10) +#define pow (aymo_tda8425_math->pow) +#define sqrt (aymo_tda8425_math->sqrt) +#define tan (aymo_tda8425_math->tan) + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > 0.f); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Setup default parameters + chip->sample_rate = sample_rate; + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[0]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[0]; + + // Setup default registers + aymo_(write)(chip, 0x00u, 0xFCu); // VL: 0 dB + aymo_(write)(chip, 0x01u, 0xFCu); // VR: 0 dB + aymo_(write)(chip, 0x02u, 0xF6u); // BA: 0 dB + aymo_(write)(chip, 0x03u, 0xF6u); // TR: 0 dB + aymo_(write)(chip, 0x07u, 0xFCu); // PP: light pseudo + aymo_(write)(chip, 0x08u, 0xCEu); // SF: linear stereo, channel 1, unmuted +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +static void aymo_(apply_vl)(struct aymo_(chip)* chip) +{ + double db = 
(double)aymo_tda8425_reg_v_to_db[chip->reg_vl & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = vset_lane_f32((float)g, chip->kv, 0); +} + + +static void aymo_(apply_vr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vr & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = vset_lane_f32((float)g, chip->kv, 1); +} + + +static void aymo_(apply_ba)(struct aymo_(chip)* chip) +{ + double dbb = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double gb = pow(10., (dbb * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcb = 300.; // [Hz] + double wb = ((2. * pi) * fcb); + double kb = (tan(wb * (.5 / fs)) / wb); + + double a0 = ((kb * wb) + gb); + double a1 = ((kb * wb) - gb); + double a2 = 0.; + + double b0 = (((kb * wb) * (gb * gb)) + gb); + double b1 = (((kb * wb) * (gb * gb)) - gb); + double b2 = 0.; + + double ra0 = (1. / a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 2); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 2); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 2); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 2); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 2); +} + + +static void aymo_(apply_tr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_tr_to_db[chip->reg_tr & 0x0Fu]; + double gt = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcd = 10.; // [Hz] + double wd = ((2. * pi) * fcd); + double kd = ((chip->reg_sf & 0x40u) ? 0. : (tan(wd * (.5 / fs)) / wd)); + double fct = 4500.; // [Hz] + double wt = ((2. * pi) * fct); + double kt = (tan(wt * (.5 / fs)) / wt); + + double a0 = (((gt * kt * wt) * (kd * wd)) + ((gt * kt * wt) + (kd * wd)) + 1.); + double a1 = (((gt * kt * wt) * (kd * wd) * 2.) - 2.); + double a2 = (((gt * kt * wt) * (kd * wd)) - ((gt * kt * wt) + (kd * wd)) + 1.); + + double b0 = ((gt * gt) + (gt * kt * wt)); + double b1 = ((gt * gt) * -2.); + double b2 = ((gt * gt) - (gt * kt * wt)); + + double ra0 = (1. 
/ a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 1); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 1); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 1); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 1); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 1); +} + + +static void aymo_(apply_source_mode)(struct aymo_(chip)* chip) +{ + // Default mute + vf32x2_t klr = vdup_n_f32(0.f); + vf32x2_t krl = vdup_n_f32(0.f); + + uint8_t source = (chip->reg_sf & 0x07u); + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Forced mono + if (mode == 0x00u) { // process + switch (source) { + // Channel 1 + case 0x02u: + case 0x04u: + case 0x06u: { + klr = vdup_n_f32(1.f); + krl = vdup_n_f32(1.f); + break; + } + } + } + else { // not forced mono + switch (source) { + // Channel 1 + case 0x02u: { // mono left + klr = vset_lane_f32(1.f, klr, 0); + krl = vset_lane_f32(1.f, krl, 1); + break; + } + case 0x04u: { // mono right + klr = vset_lane_f32(1.f, klr, 1); + krl = vset_lane_f32(1.f, krl, 0); + break; + } + case 0x06u: { // stereo + klr = vdup_n_f32(1.f); + krl = vdup_n_f32(0.f); + break; + } + default: { + if (mode == 0x03u) { // spatial stereo + mode = 0x02u; // force linear stereo (mute) + } + break; + } + } + + // Spatial stereo + if (mode == 0x03u) { // process + const float xt = .52f; // cross-talk + vf32x2_t kx = vdup_n_f32(xt); + klr = vadd_f32(klr, kx); + krl = vsub_f32(krl, kx); + } + } // not forced mono + + chip->klr = klr; + chip->krl = krl; +} + + +static void aymo_(apply_pseudo)(struct aymo_(chip)* chip) +{ + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Pseudo stereo + if (mode == 0x02u) { // enabled + double c1 = (double)chip->pseudo_c1; + double c2 = (double)chip->pseudo_c2; + double r1 = 15000.; // [ohm] + double r2 = 15000.; // [ohm] + double t1 = (c1 * r1); + double t2 = (c2 * r2); + + double fs = (double)chip->sample_rate; + double k = (.5 / fs); + double kk = (k * k); + double t1_t2 = (t1 * t2); + double t1_t2_k = ((t1 + t2) * k); + + double a0 = (kk + t1_t2 + t1_t2_k); + double a1 = ((kk - t1_t2) * 2.); + double a2 = (kk + t1_t2 - t1_t2_k); + + double b0 = a2; + double b1 = a1; + double b2 = a0; + + double ra0 = (1. / a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 0); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 0); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 0); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 0); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 0); + } + else { // pass-through + chip->kb0 = vsetq_lane_f32(1.f, chip->kb0, 0); + chip->kb1 = vsetq_lane_f32(.0f, chip->kb1, 0); + chip->kb2 = vsetq_lane_f32(.0f, chip->kb2, 0); + + chip->ka1 = vsetq_lane_f32(.0f, chip->ka1, 0); + chip->ka2 = vsetq_lane_f32(.0f, chip->ka2, 0); + } +} + + +static void aymo_(apply_tfilter)(struct aymo_(chip)* chip) +{ + // T-filter + if (chip->reg_sf & 0x80u) { // pass-through + chip->kb0 = vsetq_lane_f32(1.f, chip->kb0, 3); + chip->kb1 = vsetq_lane_f32(.0f, chip->kb1, 3); + chip->kb2 = vsetq_lane_f32(.0f, chip->kb2, 3); + + chip->ka1 = vsetq_lane_f32(.0f, chip->ka1, 3); + chip->ka2 = vsetq_lane_f32(.0f, chip->ka2, 3); + } + else { // enabled + double db = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double g = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fc = 180.; // [Hz] + double w = ((2. 
* pi) * fc); + double k = (tan(w * (.5 / fs)) / w); + + double log10_g = log10(g); + double ang = (log10_g * .85); + double abs_sqrt_log10_g = sqrt(fabs(log10_g)); + double abs2_sqrt_log10_g = abs_sqrt_log10_g * abs_sqrt_log10_g; + double kw = (k * w); + double m_k2w2 = ((kw * kw) * -.05); + double sqrt_5 = 2.23606797749978980505147774238139391; + double ph = (pi * .75); + double h_sqrt_5_kw_abs_sqrt_log10_g = ((sqrt_5 * .2) * kw * abs_sqrt_log10_g); + double cosm = cos(ang - ph); + double cosp = cos(ang + ph); + + double a0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + double a1 = (((m_k2w2 + abs2_sqrt_log10_g)) * 2.); + double a2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + + double b0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + double b1 = a1; + double b2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + + double ra0 = (1. / a0); + chip->kb0 = vsetq_lane_f32((float)(b0 * ra0), chip->kb0, 3); + chip->kb1 = vsetq_lane_f32((float)(b1 * ra0), chip->kb1, 3); + chip->kb2 = vsetq_lane_f32((float)(b2 * ra0), chip->kb2, 3); + ra0 = -ra0; + chip->ka1 = vsetq_lane_f32((float)(a1 * ra0), chip->ka1, 3); + chip->ka2 = vsetq_lane_f32((float)(a2 * ra0), chip->ka2, 3); + } +} + + +static void aymo_(apply_pp)(struct aymo_(chip)* chip) +{ + uint8_t pseudo_preset = (chip->reg_pp & 0x03u); + if (pseudo_preset >= 3u) { + pseudo_preset = 0u; + } + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[pseudo_preset]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[pseudo_preset]; +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + switch (address) { + case 0x00u: { + return chip->reg_vl; + } + case 0x01u: { + return chip->reg_vr; + } + case 0x02u: { + return chip->reg_ba; + } + case 0x03u: { + return chip->reg_tr; + } + case 0x07u: { + return chip->reg_pp; + } + case 0x08u: { + return chip->reg_sf; + } + default: { + return 0xFFu; + } + } +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + switch (address) { + case 0x00u: { // VL + value |= 0xC0u; + chip->reg_vl = value; + aymo_(apply_vl)(chip); + break; + } + case 0x01u: { // VR + value |= 0xC0u; + chip->reg_vr = value; + aymo_(apply_vr)(chip); + break; + } + case 0x02u: { // BA + value |= 0xF0u; + chip->reg_ba = value; + aymo_(apply_ba)(chip); + break; + } + case 0x03u: { // TR + value |= 0xF0u; + chip->reg_tr = value; + aymo_(apply_tr)(chip); + break; + } + case 0x07u: { // PP + value |= 0xFCu; + chip->reg_pp = value; + aymo_(apply_pp)(chip); + aymo_(apply_pseudo)(chip); + break; + } + case 0x08u: { // SF + chip->reg_sf = value; + aymo_(apply_source_mode)(chip); + aymo_(apply_pseudo)(chip); + aymo_(apply_tfilter)(chip); + aymo_(apply_vl)(chip); + aymo_(apply_vr)(chip); + aymo_(apply_tr)(chip); + break; + } + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + vf32x4_t b2l = chip->hb1l; + vf32x4_t b2r = chip->hb1r; + vf32x4_t a2l = chip->ha1l; + vf32x4_t a2r = chip->ha1r; + + const float* xe = &x[count * 2u]; + + while AYMO_LIKELY(x != xe) { + vf32x4_t y2l = vaddq_f32(vmulq_f32(b2l, chip->kb2), vmulq_f32(a2l, chip->ka2)); + vf32x4_t y2r = vaddq_f32(vmulq_f32(b2r, chip->kb2), vmulq_f32(a2r, chip->ka2)); + chip->hb2l = b2l; + chip->hb2r = b2r; + chip->ha2l = a2l; + chip->ha2r = a2r; + + vf32x4_t b1l = chip->hb0l; + vf32x4_t b1r = chip->hb0r; + 
vf32x4_t a1l = chip->ha0l; + vf32x4_t a1r = chip->ha0r; + vf32x4_t y1l = vaddq_f32(vmulq_f32(b1l, chip->kb1), vmulq_f32(a1l, chip->ka1)); + vf32x4_t y1r = vaddq_f32(vmulq_f32(b1r, chip->kb1), vmulq_f32(a1r, chip->ka1)); + chip->hb1l = b1l; + chip->hb1r = b1r; + chip->ha1l = a1l; + chip->ha1r = a1r; + + vf32x4_t yyl = vaddq_f32(y2l, y1l); + vf32x4_t yyr = vaddq_f32(y2r, y1r); + + vf32x2_t xlr = vld1_f32(x); x += 2u; + vf32x2_t xrl = vrev64_f32(xlr); + vf32x2_t wx = vadd_f32(vmul_f32(xlr, chip->klr), vmul_f32(xrl, chip->krl)); + vf32x4_t xx = vcombine_f32(wx, wx); + + vf32x4_t xl = vrev64q_f32(xx); + vf32x4_t b0l = vextq_f32(xl, a1l, 3); + vf32x4_t b0r = vextq_f32(xx, a1r, 3); + yyl = vaddq_f32(yyl, vmulq_f32(b0l, chip->kb0)); + yyr = vaddq_f32(yyr, vmulq_f32(b0r, chip->kb0)); + chip->hb0l = b0l; + chip->hb0r = b0r; + + chip->ha0l = yyl; + chip->ha0r = yyr; + + vf32x2_t ylh = vget_high_f32(yyl); + vf32x2_t yrh = vget_high_f32(yyr); + vf32x2_t yy = vext_f32(ylh, vrev64_f32(yrh), 1); + yy = vmul_f32(yy, chip->kv); + + b2l = chip->hb1l; + b2r = chip->hb1r; + a2l = chip->ha1l; + a2r = chip->ha1r; + + vst1_f32(y, yy); y += 2u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_tda8425_common.c b/src/aymo_tda8425_common.c new file mode 100644 index 0000000..0745270 --- /dev/null +++ b/src/aymo_tda8425_common.c @@ -0,0 +1,150 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_tda8425_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +const int8_t aymo_tda8425_reg_v_to_db[64] = +{ + -90, // 0 + -90, // 1 + -90, // 2 + -90, // 3 + -90, // 4 + -90, // 5 + -90, // 6 + -90, // 7 + -90, // 8 + -90, // 9 + -90, // 10 + -90, // 11 + -90, // 12 + -90, // 13 + -90, // 14 + -90, // 15 + -90, // 16 + -90, // 17 + -90, // 18 + -90, // 19 + -90, // 20 + -90, // 21 + -90, // 22 + -90, // 23 + -90, // 24 + -90, // 25 + -90, // 26 + -90, // 27 + -64, // 28 + -62, // 29 + -60, // 30 + -58, // 31 + -56, // 32 + -54, // 33 + -52, // 34 + -50, // 35 + -48, // 36 + -46, // 37 + -44, // 38 + -42, // 39 + -40, // 40 + -38, // 41 + -36, // 42 + -34, // 43 + -32, // 44 + -30, // 45 + -28, // 46 + -26, // 47 + -24, // 48 + -22, // 49 + -20, // 50 + -18, // 51 + -16, // 52 + -14, // 53 + -12, // 54 + -10, // 55 + - 8, // 56 + - 6, // 57 + - 4, // 58 + - 2, // 59 + + 0, // 60 + + 2, // 61 + + 4, // 62 + + 6 // 63 +}; + +const int8_t aymo_tda8425_reg_ba_to_db[16] = +{ + -12, // 0 + -12, // 1 + -12, // 2 + - 9, // 3 + - 6, // 4 + - 3, // 5 + + 0, // 6 + + 3, // 7 + + 6, // 8 + + 9, // 9 + +12, // 10 + +15, // 11 + +15, // 12 + +15, // 13 + +15, // 14 + +15 // 15 +}; + +const int8_t aymo_tda8425_reg_tr_to_db[16] = +{ + -12, // 0 + -12, // 1 + -12, // 2 + - 9, // 3 + - 6, // 4 + - 3, // 5 + + 0, // 6 + + 3, // 7 + + 6, // 8 + + 9, // 9 + +12, // 10 + +12, // 11 + +12, // 12 + +12, // 13 + +12, // 14 + +12 // 15 +}; + + +const float aymo_tda8425_pseudo_preset_c1[3] = +{ + 15.e-9f, + 5.6e-9f, + 5.6e-9f +}; + +const float aymo_tda8425_pseudo_preset_c2[3] = +{ + 15.e-9f, + 47.e-9f, + 68.e-9f +}; + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_tda8425_none.c b/src/aymo_tda8425_none.c new file mode 100644 index 0000000..44f4fd2 --- /dev/null +++ b/src/aymo_tda8425_none.c @@ -0,0 +1,148 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_cpu.h" + +#include "aymo_tda8425_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_none.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > .0f); + + for (int i = 0; i < AYMO_(DELAY); ++i) { + chip->yh[i][0] = .0f; + chip->yh[i][1] = .0f; + } + + TDA8425_Chip* emu = &chip->emu; + TDA8425_Chip_Ctor(emu); + + TDA8425_Chip_Setup( + emu, + (TDA8425_Float)sample_rate, + (TDA8425_Float)TDA8425_Pseudo_C1_Table[TDA8425_Pseudo_Preset_1], + (TDA8425_Float)TDA8425_Pseudo_C2_Table[TDA8425_Pseudo_Preset_1], + TDA8425_Tfilter_Mode_Disabled + ); + + TDA8425_Chip_Reset(emu); + + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_VL, 0xFCu); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_VR, 0xFCu); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_BA, 0xF6u); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_TR, 0xF6u); + TDA8425_Chip_Write(emu, (TDA8425_Address)TDA8425_Reg_SF, 0xCEu); + + TDA8425_Chip_Start(emu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address <= (uint16_t)TDA8425_Reg_SF) { + return TDA8425_Chip_Read(&chip->emu, (TDA8425_Address)address); + } + return 0xFFu; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)TDA8425_Reg_SF) { + TDA8425_Chip_Write(&chip->emu, (TDA8425_Address)address, value); + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + TDA8425_Chip* emu = &chip->emu; + TDA8425_Chip_Process_Data data; + data.inputs[TDA8425_Source_2][TDA8425_Stereo_L] = (TDA8425_Float)0.f; + data.inputs[TDA8425_Source_2][TDA8425_Stereo_R] = (TDA8425_Float)0.f; + + while (count--) { + data.inputs[TDA8425_Source_1][TDA8425_Stereo_L] = (TDA8425_Float)*x++; + data.inputs[TDA8425_Source_1][TDA8425_Stereo_R] = (TDA8425_Float)*x++; + + TDA8425_Chip_Process(emu, &data); + + for (int i = (AYMO_(DELAY) - 1); i > 0; --i) { + chip->yh[i][0] = chip->yh[i-1][0]; + chip->yh[i][1] = chip->yh[i-1][1]; + } + chip->yh[0][0] = (float)data.outputs[TDA8425_Stereo_L]; + chip->yh[0][1] = (float)data.outputs[TDA8425_Stereo_R]; + + *y++ = chip->yh[AYMO_(DELAY)-1][0]; + *y++ = chip->yh[AYMO_(DELAY)-1][1]; + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_tda8425_x86_avx2.c b/src/aymo_tda8425_x86_avx2.c new file mode 100644 index 0000000..769e63f --- /dev/null +++ b/src/aymo_tda8425_x86_avx2.c @@ -0,0 +1,499 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_tda8425.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_avx2.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +#undef cos +#undef fabs +#undef log10 +#undef pow +#undef sqrt +#undef tan + +#define cos (aymo_tda8425_math->cos) +#define fabs (aymo_tda8425_math->fabs) +#define log10 (aymo_tda8425_math->log10) +#define pow (aymo_tda8425_math->pow) +#define sqrt (aymo_tda8425_math->sqrt) +#define tan (aymo_tda8425_math->tan) + + +#undef mm256_alignr_ps +#define mm256_alignr_ps(a, b, imm8) \ + (_mm256_castsi256_ps(_mm256_alignr_epi8(_mm256_castps_si256(a), _mm256_castps_si256(b), ((imm8) * 4)))) + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > 0.f); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Setup default parameters + chip->sample_rate = sample_rate; + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[0]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[0]; + + // Setup default registers + aymo_(write)(chip, 0x00u, 0xFCu); // VL: 0 dB + aymo_(write)(chip, 0x01u, 0xFCu); // VR: 0 dB + aymo_(write)(chip, 0x02u, 0xF6u); // BA: 0 dB + aymo_(write)(chip, 0x03u, 0xF6u); // TR: 0 dB + aymo_(write)(chip, 0x07u, 0xFCu); // PP: light pseudo + aymo_(write)(chip, 0x08u, 0xCEu); // SF: linear stereo, channel 1, unmuted +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +static void aymo_(apply_vl)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vl & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + vf32x4_t kvlo = _mm_set_ps((float)g, .0f, .0f, .0f); + chip->kv = _mm256_insertf128_ps(chip->kv, kvlo, 0); +} + + +static void aymo_(apply_vr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vr & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + vf32x4_t kvhi = _mm_set_ps((float)g, .0f, .0f, .0f); + chip->kv = _mm256_insertf128_ps(chip->kv, kvhi, 1); +} + + +static void aymo_(apply_ba)(struct aymo_(chip)* chip) +{ + double dbb = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double gb = pow(10., (dbb * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + 
double fcb = 300.; // [Hz] + double wb = ((2. * pi) * fcb); + double kb = (tan(wb * (.5 / fs)) / wb); + + double a0 = ((kb * wb) + gb); + double a1 = ((kb * wb) - gb); + double a2 = 0.; + + double b0 = (((kb * wb) * (gb * gb)) + gb); + double b1 = (((kb * wb) * (gb * gb)) - gb); + double b2 = 0.; + + double ra0 = (1. / a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x44); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x44); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x44); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x44); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x44); +} + + +static void aymo_(apply_tr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_tr_to_db[chip->reg_tr & 0x0Fu]; + double gt = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcd = 10.; // [Hz] + double wd = ((2. * pi) * fcd); + double kd = ((chip->reg_sf & 0x40u) ? 0. : (tan(wd * (.5 / fs)) / wd)); + double fct = 4500.; // [Hz] + double wt = ((2. * pi) * fct); + double kt = (tan(wt * (.5 / fs)) / wt); + + double a0 = (((gt * kt * wt) * (kd * wd)) + ((gt * kt * wt) + (kd * wd)) + 1.); + double a1 = (((gt * kt * wt) * (kd * wd) * 2.) - 2.); + double a2 = (((gt * kt * wt) * (kd * wd)) - ((gt * kt * wt) + (kd * wd)) + 1.); + + double b0 = ((gt * gt) + (gt * kt * wt)); + double b1 = ((gt * gt) * -2.); + double b2 = ((gt * gt) - (gt * kt * wt)); + + double ra0 = (1. / a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x22); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x22); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x22); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x22); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x22); +} + + +static void aymo_(apply_source_mode)(struct aymo_(chip)* chip) +{ + // Default mute + vf32x8_t klr = _mm256_setzero_ps(); + vf32x8_t krl = _mm256_setzero_ps(); + + uint8_t source = (chip->reg_sf & 0x07u); + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Forced mono + if (mode == 0x00u) { // process + switch (source) { + // Channel 1 + case 0x02u: + case 0x04u: + case 0x06u: { + klr = _mm256_set_ps(1.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + krl = _mm256_set_ps(1.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + break; + } + } + } + else { // not forced mono + switch (source) { + // Channel 1 + case 0x02u: { // mono left + klr = _mm256_set_ps(0.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + krl = _mm256_set_ps(1.f, .0f, .0f, .0f, 0.f, .0f, .0f, .0f); + break; + } + case 0x04u: { // mono right + klr = _mm256_set_ps(1.f, .0f, .0f, .0f, 0.f, .0f, .0f, .0f); + krl = _mm256_set_ps(0.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + break; + } + case 0x06u: { // stereo + klr = _mm256_set_ps(1.f, .0f, .0f, .0f, 1.f, .0f, .0f, .0f); + krl = _mm256_set_ps(0.f, .0f, .0f, .0f, 0.f, .0f, .0f, .0f); + break; + } + default: { + if (mode == 0x03u) { // spatial stereo + mode = 0x02u; // force linear stereo (mute) + } + break; + } + } + + // Spatial stereo + if (mode == 0x03u) { // process + const float xt = .52f; // cross-talk + __m256 kx = _mm256_set_ps(xt, .0f, .0f, .0f, xt, .0f, .0f, .0f); + klr = _mm256_add_ps(klr, kx); + krl = _mm256_sub_ps(krl, kx); + } + } // not forced mono + + 
chip->klr = klr; + chip->krl = krl; +} + + +static void aymo_(apply_pseudo)(struct aymo_(chip)* chip) +{ + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Pseudo stereo + if (mode == 0x02u) { // enabled + double c1 = (double)chip->pseudo_c1; + double c2 = (double)chip->pseudo_c2; + double r1 = 15000.; // [ohm] + double r2 = 15000.; // [ohm] + double t1 = (c1 * r1); + double t2 = (c2 * r2); + + double fs = (double)chip->sample_rate; + double k = (.5 / fs); + double kk = (k * k); + double t1_t2 = (t1 * t2); + double t1_t2_k = ((t1 + t2) * k); + + double a0 = (kk + t1_t2 + t1_t2_k); + double a1 = ((kk - t1_t2) * 2.); + double a2 = (kk + t1_t2 - t1_t2_k); + + double b0 = a2; + double b1 = a1; + double b2 = a0; + + double ra0 = (1. / a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x11); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x11); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x11); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x11); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x11); + } + else { // pass-through + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps(1.f), 0x11); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps(.0f), 0x11); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps(.0f), 0x11); + + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps(.0f), 0x11); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps(.0f), 0x11); + } +} + + +static void aymo_(apply_tfilter)(struct aymo_(chip)* chip) +{ + // T-filter + if (chip->reg_sf & 0x80u) { // pass-through + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps(1.f), 0x88); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps(.0f), 0x88); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps(.0f), 0x88); + + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps(.0f), 0x88); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps(.0f), 0x88); + } + else { // enabled + double db = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double g = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fc = 180.; // [Hz] + double w = ((2. * pi) * fc); + double k = (tan(w * (.5 / fs)) / w); + + double log10_g = log10(g); + double ang = (log10_g * .85); + double abs_sqrt_log10_g = sqrt(fabs(log10_g)); + double abs2_sqrt_log10_g = abs_sqrt_log10_g * abs_sqrt_log10_g; + double kw = (k * w); + double m_k2w2 = ((kw * kw) * -.05); + double sqrt_5 = 2.23606797749978980505147774238139391; + double ph = (pi * .75); + double h_sqrt_5_kw_abs_sqrt_log10_g = ((sqrt_5 * .2) * kw * abs_sqrt_log10_g); + double cosm = cos(ang - ph); + double cosp = cos(ang + ph); + + double a0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + double a1 = (((m_k2w2 + abs2_sqrt_log10_g)) * 2.); + double a2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + + double b0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + double b1 = a1; + double b2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + + double ra0 = (1. 
/ a0); + chip->kb0 = _mm256_blend_ps(chip->kb0, _mm256_set1_ps((float)(b0 * ra0)), 0x88); + chip->kb1 = _mm256_blend_ps(chip->kb1, _mm256_set1_ps((float)(b1 * ra0)), 0x88); + chip->kb2 = _mm256_blend_ps(chip->kb2, _mm256_set1_ps((float)(b2 * ra0)), 0x88); + ra0 = -ra0; + chip->ka1 = _mm256_blend_ps(chip->ka1, _mm256_set1_ps((float)(a1 * ra0)), 0x88); + chip->ka2 = _mm256_blend_ps(chip->ka2, _mm256_set1_ps((float)(a2 * ra0)), 0x88); + } +} + + +static void aymo_(apply_pp)(struct aymo_(chip)* chip) +{ + uint8_t pseudo_preset = (chip->reg_pp & 0x03u); + if (pseudo_preset >= 3u) { + pseudo_preset = 0u; + } + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[pseudo_preset]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[pseudo_preset]; +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + switch (address) { + case 0x00u: { + return chip->reg_vl; + } + case 0x01u: { + return chip->reg_vr; + } + case 0x02u: { + return chip->reg_ba; + } + case 0x03u: { + return chip->reg_tr; + } + case 0x07u: { + return chip->reg_pp; + } + case 0x08u: { + return chip->reg_sf; + } + default: { + return 0xFFu; + } + } +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + switch (address) { + case 0x00u: { // VL + value |= 0xC0u; + chip->reg_vl = value; + aymo_(apply_vl)(chip); + break; + } + case 0x01u: { // VR + value |= 0xC0u; + chip->reg_vr = value; + aymo_(apply_vr)(chip); + break; + } + case 0x02u: { // BA + value |= 0xF0u; + chip->reg_ba = value; + aymo_(apply_ba)(chip); + break; + } + case 0x03u: { // TR + value |= 0xF0u; + chip->reg_tr = value; + aymo_(apply_tr)(chip); + break; + } + case 0x07u: { // PP + value |= 0xFCu; + chip->reg_pp = value; + aymo_(apply_pp)(chip); + aymo_(apply_pseudo)(chip); + break; + } + case 0x08u: { // SF + chip->reg_sf = value; + aymo_(apply_source_mode)(chip); + aymo_(apply_pseudo)(chip); + aymo_(apply_tfilter)(chip); + aymo_(apply_vl)(chip); + aymo_(apply_vr)(chip); + aymo_(apply_tr)(chip); + break; + } + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + float AYMO_ALIGN_V256 xlrv[8]; + float AYMO_ALIGN_V256 xrlv[8]; + float AYMO_ALIGN_V256 yyv[8]; + + vf32x8_t b2 = chip->hb1; + vf32x8_t a2 = chip->ha1; + + const float* xe = &x[count * 2u]; + + while AYMO_LIKELY(x != xe) { + vf32x8_t y2 = _mm256_add_ps(_mm256_mul_ps(b2, chip->kb2), _mm256_mul_ps(a2, chip->ka2)); + chip->hb2 = b2; + chip->ha2 = a2; + + vf32x8_t b1 = chip->hb0; + vf32x8_t a1 = chip->ha0; + vf32x8_t y1 = _mm256_add_ps(_mm256_mul_ps(b1, chip->kb1), _mm256_mul_ps(a1, chip->ka1)); + chip->hb1 = b1; + chip->ha1 = a1; + + vf32x8_t yy = _mm256_add_ps(y2, y1); + + xrlv[7] = xlrv[3] = *x++; + xrlv[3] = xlrv[7] = *x++; + _mm_sfence(); + vf32x8_t xlr = _mm256_load_ps(xlrv); + vf32x8_t xrl = _mm256_load_ps(xrlv); + vf32x8_t xx = _mm256_add_ps(_mm256_mul_ps(xlr, chip->klr), _mm256_mul_ps(xrl, chip->krl)); + + vf32x8_t b0 = mm256_alignr_ps(chip->ha0, xx, 3); + yy = _mm256_add_ps(yy, _mm256_mul_ps(b0, chip->kb0)); + chip->hb0 = b0; + + chip->ha0 = yy; + + yy = _mm256_mul_ps(yy, chip->kv); + _mm256_store_ps(yyv, yy); + + b2 = chip->hb1; + a2 = chip->ha1; + + _mm_sfence(); + *y++ = yyv[3]; + *y++ = yyv[7]; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/src/aymo_tda8425_x86_sse41.c b/src/aymo_tda8425_x86_sse41.c new file mode 100644 index 0000000..ebfb44e --- /dev/null +++ 
b/src/aymo_tda8425_x86_sse41.c @@ -0,0 +1,512 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_tda8425.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + +#undef cos +#undef fabs +#undef log10 +#undef pow +#undef sqrt +#undef tan + +#define cos (aymo_tda8425_math->cos) +#define fabs (aymo_tda8425_math->fabs) +#define log10 (aymo_tda8425_math->log10) +#define pow (aymo_tda8425_math->pow) +#define sqrt (aymo_tda8425_math->sqrt) +#define tan (aymo_tda8425_math->tan) + + +#undef mm_insert_ps +#define mm_insert_ps(a, b, imm8) \ + (_mm_blend_ps((a), _mm_set1_ps(b), (1 << (imm8)))) + + +#undef mm_alignr_ps +#define mm_alignr_ps(a, b, imm8) \ + (_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), ((imm8) * 4)))) + + +const struct aymo_tda8425_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_tda8425_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_tda8425_ctor_f)&(aymo_(ctor)), + (aymo_tda8425_dtor_f)&(aymo_(dtor)), + (aymo_tda8425_read_f)&(aymo_(read)), + (aymo_tda8425_write_f)&(aymo_(write)), + (aymo_tda8425_process_f32_f)&(aymo_(process_f32)) +}; + + +const struct aymo_tda8425_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip, float sample_rate) +{ + assert(chip); + assert(sample_rate > 0.f); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Setup default parameters + chip->sample_rate = sample_rate; + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[0]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[0]; + + // Setup default registers + aymo_(write)(chip, 0x00u, 0xFCu); // VL: 0 dB + aymo_(write)(chip, 0x01u, 0xFCu); // VR: 0 dB + aymo_(write)(chip, 0x02u, 0xF6u); // BA: 0 dB + aymo_(write)(chip, 0x03u, 0xF6u); // TR: 0 dB + aymo_(write)(chip, 0x07u, 0xFCu); // PP: light pseudo + aymo_(write)(chip, 0x08u, 0xCEu); // SF: linear stereo, channel 1, unmuted +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +static void aymo_(apply_vl)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vl & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = mm_insert_ps(chip->kv, (float)g, 2); +} + + +static void aymo_(apply_vr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_v_to_db[chip->reg_vr & 0x3Fu]; + + if (chip->reg_sf & 0x20u) { // mute + db = -90.; + } + + double g = pow(10., (db * .05)); + chip->kv = mm_insert_ps(chip->kv, (float)g, 3); +} + + +static void aymo_(apply_ba)(struct aymo_(chip)* chip) +{ + double dbb = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + 
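// Bass control: a first-order low shelf cornered around 300 Hz, obtained from
// the analog prototype through a prewarped bilinear transform (kb*wb equals
// tan(wb / (2*fs))).  The resulting response has gain gb*gb = 10^(dB/20) at DC
// and unity gain at Nyquist.  As in the other sections, the coefficients are
// stored pre-divided by a0, with ka1/ka2 negated, so the sample loop only
// needs multiply-add operations.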
double gb = pow(10., (dbb * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcb = 300.; // [Hz] + double wb = ((2. * pi) * fcb); + double kb = (tan(wb * (.5 / fs)) / wb); + + double a0 = ((kb * wb) + gb); + double a1 = ((kb * wb) - gb); + double a2 = 0.; + + double b0 = (((kb * wb) * (gb * gb)) + gb); + double b1 = (((kb * wb) * (gb * gb)) - gb); + double b2 = 0.; + + double ra0 = (1. / a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 2); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 2); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 2); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 2); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 2); +} + + +static void aymo_(apply_tr)(struct aymo_(chip)* chip) +{ + double db = (double)aymo_tda8425_reg_tr_to_db[chip->reg_tr & 0x0Fu]; + double gt = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fcd = 10.; // [Hz] + double wd = ((2. * pi) * fcd); + double kd = ((chip->reg_sf & 0x40u) ? 0. : (tan(wd * (.5 / fs)) / wd)); + double fct = 4500.; // [Hz] + double wt = ((2. * pi) * fct); + double kt = (tan(wt * (.5 / fs)) / wt); + + double a0 = (((gt * kt * wt) * (kd * wd)) + ((gt * kt * wt) + (kd * wd)) + 1.); + double a1 = (((gt * kt * wt) * (kd * wd) * 2.) - 2.); + double a2 = (((gt * kt * wt) * (kd * wd)) - ((gt * kt * wt) + (kd * wd)) + 1.); + + double b0 = ((gt * gt) + (gt * kt * wt)); + double b1 = ((gt * gt) * -2.); + double b2 = ((gt * gt) - (gt * kt * wt)); + + double ra0 = (1. / a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 1); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 1); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 1); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 1); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 1); +} + + +static void aymo_(apply_source_mode)(struct aymo_(chip)* chip) +{ + // Default mute + vf32x4_t klr = _mm_setzero_ps(); + vf32x4_t krl = _mm_setzero_ps(); + + uint8_t source = (chip->reg_sf & 0x07u); + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Forced mono + if (mode == 0x00u) { // process + switch (source) { + // Channel 1 + case 0x02u: + case 0x04u: + case 0x06u: { + klr = _mm_set_ps(1.f, 1.f, .0f, .0f); + krl = _mm_set_ps(1.f, 1.f, .0f, .0f); + break; + } + } + } + else { // not forced mono + switch (source) { + // Channel 1 + case 0x02u: { // mono left + klr = _mm_set_ps(0.f, 1.f, .0f, .0f); + krl = _mm_set_ps(1.f, 0.f, .0f, .0f); + break; + } + case 0x04u: { // mono right + klr = _mm_set_ps(1.f, 0.f, .0f, .0f); + krl = _mm_set_ps(0.f, 1.f, .0f, .0f); + break; + } + case 0x06u: { // stereo + klr = _mm_set_ps(1.f, 1.f, .0f, .0f); + krl = _mm_set_ps(0.f, 0.f, .0f, .0f); + break; + } + default: { + if (mode == 0x03u) { // spatial stereo + mode = 0x02u; // force linear stereo (mute) + } + break; + } + } + + // Spatial stereo + if (mode == 0x03u) { // process + const float xt = .52f; // cross-talk + vf32x4_t kx = _mm_set_ps(xt, xt, .0f, .0f); + klr = _mm_add_ps(klr, kx); + krl = _mm_sub_ps(krl, kx); + } + } // not forced mono + + chip->klr = klr; + chip->krl = krl; +} + + +static void aymo_(apply_pseudo)(struct aymo_(chip)* chip) +{ + uint8_t mode = ((chip->reg_sf >> 3u) & 0x03u); + + // Pseudo stereo + if (mode == 0x02u) { // enabled + double c1 = (double)chip->pseudo_c1; + double c2 = (double)chip->pseudo_c2; + double r1 
= 15000.; // [ohm] + double r2 = 15000.; // [ohm] + double t1 = (c1 * r1); + double t2 = (c2 * r2); + + double fs = (double)chip->sample_rate; + double k = (.5 / fs); + double kk = (k * k); + double t1_t2 = (t1 * t2); + double t1_t2_k = ((t1 + t2) * k); + + double a0 = (kk + t1_t2 + t1_t2_k); + double a1 = ((kk - t1_t2) * 2.); + double a2 = (kk + t1_t2 - t1_t2_k); + + double b0 = a2; + double b1 = a1; + double b2 = a0; + + double ra0 = (1. / a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 0); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 0); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 0); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 0); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 0); + } + else { // pass-through + chip->kb0 = mm_insert_ps(chip->kb0, 1.f, 0); + chip->kb1 = mm_insert_ps(chip->kb1, .0f, 0); + chip->kb2 = mm_insert_ps(chip->kb2, .0f, 0); + + chip->ka1 = mm_insert_ps(chip->ka1, .0f, 0); + chip->ka2 = mm_insert_ps(chip->ka2, .0f, 0); + } +} + + +static void aymo_(apply_tfilter)(struct aymo_(chip)* chip) +{ + // T-filter + if (chip->reg_sf & 0x80u) { // pass-through + chip->kb0 = mm_insert_ps(chip->kb0, 1.f, 3); + chip->kb1 = mm_insert_ps(chip->kb1, .0f, 3); + chip->kb2 = mm_insert_ps(chip->kb2, .0f, 3); + + chip->ka1 = mm_insert_ps(chip->ka1, .0f, 3); + chip->ka2 = mm_insert_ps(chip->ka2, .0f, 3); + } + else { // enabled + double db = (double)aymo_tda8425_reg_ba_to_db[chip->reg_ba & 0x0Fu]; + double g = pow(10., (db * (.05 * .5))); + double fs = (double)chip->sample_rate; + double pi = 3.14159265358979323846264338327950288; + double fc = 180.; // [Hz] + double w = ((2. * pi) * fc); + double k = (tan(w * (.5 / fs)) / w); + + double log10_g = log10(g); + double ang = (log10_g * .85); + double abs_sqrt_log10_g = sqrt(fabs(log10_g)); + double abs2_sqrt_log10_g = abs_sqrt_log10_g * abs_sqrt_log10_g; + double kw = (k * w); + double m_k2w2 = ((kw * kw) * -.05); + double sqrt_5 = 2.23606797749978980505147774238139391; + double ph = (pi * .75); + double h_sqrt_5_kw_abs_sqrt_log10_g = ((sqrt_5 * .2) * kw * abs_sqrt_log10_g); + double cosm = cos(ang - ph); + double cosp = cos(ang + ph); + + double a0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + double a1 = (((m_k2w2 + abs2_sqrt_log10_g)) * 2.); + double a2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosm))); + + double b0 = (((m_k2w2 - abs2_sqrt_log10_g) + (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + double b1 = a1; + double b2 = (((m_k2w2 - abs2_sqrt_log10_g) - (h_sqrt_5_kw_abs_sqrt_log10_g * cosp))); + + double ra0 = (1. 
/ a0); + chip->kb0 = mm_insert_ps(chip->kb0, (float)(b0 * ra0), 3); + chip->kb1 = mm_insert_ps(chip->kb1, (float)(b1 * ra0), 3); + chip->kb2 = mm_insert_ps(chip->kb2, (float)(b2 * ra0), 3); + ra0 = -ra0; + chip->ka1 = mm_insert_ps(chip->ka1, (float)(a1 * ra0), 3); + chip->ka2 = mm_insert_ps(chip->ka2, (float)(a2 * ra0), 3); + } +} + + +static void aymo_(apply_pp)(struct aymo_(chip)* chip) +{ + uint8_t pseudo_preset = (chip->reg_pp & 0x03u); + if (pseudo_preset >= 3u) { + pseudo_preset = 0u; + } + chip->pseudo_c1 = aymo_tda8425_pseudo_preset_c1[pseudo_preset]; + chip->pseudo_c2 = aymo_tda8425_pseudo_preset_c2[pseudo_preset]; +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + switch (address) { + case 0x00u: { + return chip->reg_vl; + } + case 0x01u: { + return chip->reg_vr; + } + case 0x02u: { + return chip->reg_ba; + } + case 0x03u: { + return chip->reg_tr; + } + case 0x07u: { + return chip->reg_pp; + } + case 0x08u: { + return chip->reg_sf; + } + default: { + return 0xFFu; + } + } +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + switch (address) { + case 0x00u: { // VL + value |= 0xC0u; + chip->reg_vl = value; + aymo_(apply_vl)(chip); + break; + } + case 0x01u: { // VR + value |= 0xC0u; + chip->reg_vr = value; + aymo_(apply_vr)(chip); + break; + } + case 0x02u: { // BA + value |= 0xF0u; + chip->reg_ba = value; + aymo_(apply_ba)(chip); + break; + } + case 0x03u: { // TR + value |= 0xF0u; + chip->reg_tr = value; + aymo_(apply_tr)(chip); + break; + } + case 0x07u: { // PP + value |= 0xFCu; + chip->reg_pp = value; + aymo_(apply_pp)(chip); + aymo_(apply_pseudo)(chip); + break; + } + case 0x08u: { // SF + chip->reg_sf = value; + aymo_(apply_source_mode)(chip); + aymo_(apply_pseudo)(chip); + aymo_(apply_tfilter)(chip); + aymo_(apply_vl)(chip); + aymo_(apply_vr)(chip); + aymo_(apply_tr)(chip); + break; + } + } +} + + +void aymo_(process_f32)(struct aymo_(chip)* chip, uint32_t count, const float x[], float y[]) +{ + assert(chip); + assert(x); + assert(y); + + vf32x4_t b2l = chip->hb1l; + vf32x4_t b2r = chip->hb1r; + vf32x4_t a2l = chip->ha1l; + vf32x4_t a2r = chip->ha1r; + + const float* xe = &x[count * 2u]; + + while AYMO_LIKELY(x != xe) { + vf32x4_t y2l = _mm_add_ps(_mm_mul_ps(b2l, chip->kb2), _mm_mul_ps(a2l, chip->ka2)); + vf32x4_t y2r = _mm_add_ps(_mm_mul_ps(b2r, chip->kb2), _mm_mul_ps(a2r, chip->ka2)); + chip->hb2l = b2l; + chip->hb2r = b2r; + chip->ha2l = a2l; + chip->ha2r = a2r; + + vf32x4_t b1l = chip->hb0l; + vf32x4_t b1r = chip->hb0r; + vf32x4_t a1l = chip->ha0l; + vf32x4_t a1r = chip->ha0r; + vf32x4_t y1l = _mm_add_ps(_mm_mul_ps(b1l, chip->kb1), _mm_mul_ps(a1l, chip->ka1)); + vf32x4_t y1r = _mm_add_ps(_mm_mul_ps(b1r, chip->kb1), _mm_mul_ps(a1r, chip->ka1)); + chip->hb1l = b1l; + chip->hb1r = b1r; + chip->ha1l = a1l; + chip->ha1r = a1r; + + vf32x4_t yyl = _mm_add_ps(y2l, y1l); + vf32x4_t yyr = _mm_add_ps(y2r, y1r); + + vf32x4_t xlr = _mm_loadh_pi(_mm_undefined_ps(), (const void*)x); x += 2u; + vf32x4_t xrl = _mm_shuffle_ps(xlr, xlr, _MM_SHUFFLE(2, 3, 0, 1)); // "23.." + vf32x4_t xx = _mm_add_ps(_mm_mul_ps(xlr, chip->klr), _mm_mul_ps(xrl, chip->krl)); + + vf32x4_t xl = _mm_shuffle_ps(xx, xx, _MM_SHUFFLE(2, 3, 0, 1)); // "2..." 
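// Each SSE lane runs one filter section in Direct Form I:
//   y[n] = kb0*x[n] + kb1*x[n-1] + kb2*x[n-2] + ka1*y[n-1] + ka2*y[n-2]
// with every coefficient pre-divided by a0 and ka1/ka2 already negated.
// Lane 0 holds the pseudo-stereo all-pass (or a pass-through), lane 1 the
// treble/DC-blocking section, lane 2 the bass shelf, and lane 3 the T-filter.
// The alignr below shifts each section's previous output into the next
// section's input, so the four sections behave as a cascade with one sample
// of delay per stage, and the final stereo output is taken from lane 3.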
+ vf32x4_t b0l = mm_alignr_ps(a1l, xl, 3); + vf32x4_t b0r = mm_alignr_ps(a1r, xx, 3); + yyl = _mm_add_ps(yyl, _mm_mul_ps(b0l, chip->kb0)); + yyr = _mm_add_ps(yyr, _mm_mul_ps(b0r, chip->kb0)); + chip->hb0l = b0l; + chip->hb0r = b0r; + + chip->ha0l = yyl; + chip->ha0r = yyr; + + yyl = _mm_shuffle_ps(yyl, yyl, _MM_SHUFFLE(2, 3, 0, 1)); // ".3.." + vf32x4_t yy = _mm_blend_ps(yyl, yyr, 0x8); // "1000" + yy = _mm_mul_ps(yy, chip->kv); + + b2l = chip->hb1l; + b2r = chip->hb1r; + a2l = chip->ha1l; + a2r = chip->ha1r; + + _mm_storeh_pi((void*)y, yy); y += 2u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/src/aymo_wave.c b/src/aymo_wave.c new file mode 100644 index 0000000..4cb1606 --- /dev/null +++ b/src/aymo_wave.c @@ -0,0 +1,79 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_wave.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +AYMO_PUBLIC void aymo_wave_heading_setup( + struct aymo_wave_heading* heading, + uint16_t wave_fmt_type, + uint16_t channel_count, + uint16_t sample_bits, + uint32_t sample_rate, + uint32_t sample_count +) +{ + assert(heading); + assert(channel_count > 0u); + assert(sample_bits > 0u); + assert(sample_rate > 0u); + + uint16_t sample_byte_size = (sample_bits / 8u); + uint32_t sample_data_size = (sample_count * channel_count * sample_byte_size); + assert(sample_data_size < (UINT32_MAX - 32u)); + + heading->riff_fourcc[0] = 'R'; + heading->riff_fourcc[1] = 'I'; + heading->riff_fourcc[2] = 'F'; + heading->riff_fourcc[3] = 'F'; + heading->riff_size = (32u + sample_data_size); + + heading->wave_fourcc[0] = 'W'; + heading->wave_fourcc[1] = 'A'; + heading->wave_fourcc[2] = 'V'; + heading->wave_fourcc[3] = 'E'; + + heading->wave_fmt_fourcc[0] = 'f'; + heading->wave_fmt_fourcc[1] = 'm'; + heading->wave_fmt_fourcc[2] = 't'; + heading->wave_fmt_fourcc[3] = ' '; + heading->wave_fmt_size = 16u; + + heading->wave_fmt_type = wave_fmt_type; + heading->wave_fmt_channel_count = channel_count; + heading->wave_fmt_sample_rate = sample_rate; + heading->wave_fmt_byte_rate = (sample_byte_size * sample_rate); + heading->wave_fmt_block_align = (sample_byte_size * channel_count); + heading->wave_fmt_sample_bits = sample_bits; + + heading->wave_data_fourcc[0] = 'd'; + heading->wave_data_fourcc[1] = 'a'; + heading->wave_data_fourcc[2] = 't'; + heading->wave_data_fourcc[3] = 'a'; + heading->wave_data_size = sample_data_size; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128.c b/src/aymo_ym7128.c new file mode 100644 index 0000000..da2c169 --- /dev/null +++ b/src/aymo_ym7128.c @@ -0,0 +1,148 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
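For reference, the header writer above (src/aymo_wave.c) fills a canonical RIFF/WAVE header in one call; a hypothetical caller could dump a rendered buffer as in the sketch below. The file name, the PCM format tag value 1, and the direct fwrite of the heading struct (assuming it is packed and little-endian) are illustrative assumptions, not part of AYMO.

#include <stdio.h>
#include <stdint.h>
#include "aymo_wave.h"

// Illustrative only: save `frames` interleaved stereo 16-bit samples at 49716 Hz.
static int save_wav(const char* path, const int16_t* samples, uint32_t frames)
{
    struct aymo_wave_heading heading;
    aymo_wave_heading_setup(&heading, 1u /* PCM */, 2u, 16u, 49716u, frames);

    FILE* file = fopen(path, "wb");
    if (file == NULL) {
        return -1;
    }
    fwrite(&heading, sizeof(heading), 1u, file);
    fwrite(samples, sizeof(int16_t), (size_t)frames * 2u, file);
    fclose(file);
    return 0;
}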
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#include "aymo_ym7128.h" +#include "aymo_ym7128_arm_neon.h" +#include "aymo_ym7128_none.h" +#include "aymo_ym7128_x86_sse41.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static const struct aymo_ym7128_vt* aymo_ym7128_best_vt; + + +void aymo_ym7128_boot(void) +{ + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_ym7128_best_vt = aymo_ym7128_x86_sse41_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_ym7128_best_vt = aymo_ym7128_arm_neon_get_vt(); + return; + } + #endif + + aymo_ym7128_best_vt = aymo_ym7128_none_get_vt(); +} + + +const struct aymo_ym7128_vt* aymo_ym7128_get_vt(const char* cpu_ext) +{ + if (cpu_ext == NULL) { + return NULL; + } + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (!aymo_strcmp(cpu_ext, "x86_sse41")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + return aymo_ym7128_x86_sse41_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (!aymo_strcmp(cpu_ext, "arm_neon")) { + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + return aymo_ym7128_arm_neon_get_vt(); + } + } + #endif + + if (!aymo_strcmp(cpu_ext, "none")) { + return aymo_ym7128_none_get_vt(); + } + return NULL; +} + + +const struct aymo_ym7128_vt* aymo_ym7128_get_best_vt(void) +{ + return aymo_ym7128_best_vt; +} + + +uint32_t aymo_ym7128_get_sizeof(struct aymo_ym7128_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_sizeof); + + return chip->vt->get_sizeof(); +} + + +void aymo_ym7128_ctor(struct aymo_ym7128_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->ctor); + + chip->vt->ctor(chip); +} + + +void aymo_ym7128_dtor(struct aymo_ym7128_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->dtor); + + chip->vt->dtor(chip); +} + + +uint8_t aymo_ym7128_read(struct aymo_ym7128_chip* chip, uint16_t address) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->read); + + return chip->vt->read(chip, address); +} + + +void aymo_ym7128_write(struct aymo_ym7128_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->write); + + chip->vt->write(chip, address, value); +} + + +void aymo_ym7128_process_i16(struct aymo_ym7128_chip* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->process_i16); + + chip->vt->process_i16(chip, count, x, y); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128_arm_neon.c b/src/aymo_ym7128_arm_neon.c new file mode 100644 index 0000000..00b717c --- /dev/null +++ b/src/aymo_ym7128_arm_neon.c @@ -0,0 +1,270 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
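To put the dispatcher in src/aymo_ym7128.c above in context, a hypothetical caller might drive it as sketched here. The explicit chip->vt assignment before the constructor and the count*4 output sizing (two channels times the 2x oversampling of process_i16) are inferred from the code above, so treat both as assumptions; error handling is minimal.

#include <stdlib.h>
#include "aymo_ym7128.h"

// Illustrative sketch: run `count` mono samples through the surround processor.
static int16_t* ym7128_render(const int16_t* x, uint32_t count)
{
    aymo_ym7128_boot();  // assumes CPU feature detection is already initialized
    const struct aymo_ym7128_vt* vt = aymo_ym7128_get_best_vt();

    struct aymo_ym7128_chip* chip = malloc(vt->get_sizeof());
    if (chip == NULL) {
        return NULL;
    }
    chip->vt = vt;  // assumed to be the caller's responsibility
    aymo_ym7128_ctor(chip);

    // Each input sample yields two oversampled stereo frames: count * 4 outputs.
    int16_t* y = malloc((size_t)count * 4u * sizeof(int16_t));
    if (y != NULL) {
        aymo_ym7128_process_i16(chip, count, x, y);
    }

    aymo_ym7128_dtor(chip);
    free(chip);
    return y;
}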
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include "aymo_cpu_arm_neon_inline.h" +#include "aymo_ym7128_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_arm_neon.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ym7128_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ym7128_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ym7128_ctor_f)&(aymo_(ctor)), + (aymo_ym7128_dtor_f)&(aymo_(dtor)), + (aymo_ym7128_read_f)&(aymo_(read)), + (aymo_ym7128_write_f)&(aymo_(write)), + (aymo_ym7128_process_i16_f)&(aymo_(process_i16)) +}; + + +const struct aymo_ym7128_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Initialize input stage coefficients (-1 as a placeholder for computed values) + chip->xxv[2] = 1; + chip->xxv[3] = 1; + + chip->kk1 = vseta(0, -1, -1, -1, -0x8000, -0x8000, -0x8000, -0x8000); + chip->kk2 = vseta(0, -1, -0x8000, 0, 0, 0, 0x8000, -0x8000); + chip->kkm = vseta(0, 0x7FFF, 0x7FFF, 0, 0, 0, AYMO_YM7128_DELAY_LENGTH, AYMO_YM7128_DELAY_LENGTH); + + // Initialize oversampler coefficients + const int16_t* k = aymo_ym7128_kernel_linear; + chip->ka = vseta(k[ 6], k[ 6], k[ 4], k[ 4], k[ 2], k[ 2], k[ 0], k[ 0]); + chip->kb = vseta(k[ 7], k[ 7], k[ 5], k[ 5], k[ 3], k[ 3], k[ 1], k[ 1]); + chip->kc = vseta(k[14], k[14], k[12], k[12], k[10], k[10], k[ 8], k[ 8]); + chip->kd = vseta(k[15], k[15], k[13], k[13], k[11], k[11], k[ 9], k[ 9]); + chip->ke = vseta( 0, 0, 0, 0, k[18], k[18], k[16], k[16]); + chip->kf = vseta( 0, 0, 0, 0, 0, 0, k[17], k[17]); + + // Initialize as pass-through + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gl1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gr1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vm, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vl, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vr, 0x3Fu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + return chip->regs[address]; + } + return 0x00u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)aymo_ym7128_reg_gl8) { + value &= 0x3Fu; + int16_t gl = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gl1); + chip->kgl = vinsertn(chip->kgl, gl, i); + } + else if (address <= (uint16_t)aymo_ym7128_reg_gr8) { + value &= 0x3Fu; + int16_t gr = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gr1); + chip->kgr = vinsertn(chip->kgr, gr, i); + } + else if (address <= (uint16_t)aymo_ym7128_reg_vr) 
{ + value &= 0x3Fu; + int16_t v = aymo_ym7128_gain[value]; + if (address == (uint16_t)aymo_ym7128_reg_vm) { + chip->kk1 = vinsert(chip->kk1, -v, 5); + } + else if (address == (uint16_t)aymo_ym7128_reg_vc) { + chip->kk2 = vinsert(chip->kk2, v, 6); + } + else if (address == (uint16_t)aymo_ym7128_reg_vl) { + chip->kv = vinsert(chip->kv, v, 6); + } + else { + chip->kv = vinsert(chip->kv, v, 7); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_c1) { + value &= 0x3Fu; + int16_t v = ((int16_t)value << (16 - AYMO_YM7128_COEFF_BITS)); + if (address == (uint16_t)aymo_ym7128_reg_c0) { + chip->kk1 = vinsert(chip->kk1, v, 4); + } + else { + chip->kk1 = vinsert(chip->kk1, v, 6); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_t8) { + value &= 0x1Fu; + int16_t t = aymo_ym7128_tap[value]; + int16_t hi = chip->xxv[1]; // hi + t = (hi - t); + if (t < 0) { + t += AYMO_YM7128_DELAY_LENGTH; + } + if (address == (uint16_t)aymo_ym7128_reg_t0) { + chip->xxv[0] = t; // ti0 + } + else { + uint16_t i = (address - (uint16_t)aymo_ym7128_reg_t1); + chip->tiv[i] = t; + } + } + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + chip->regs[address] = value; + } +} + + +void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(x); + assert(y); + if AYMO_UNLIKELY(!count) return; + + int16_t AYMO_ALIGN_V128 vv[8] = {0}; + + int16_t ti0 = chip->xxv[0]; + int16_t t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + + const int16_t* xe = &x[count]; + + while AYMO_LIKELY(x != xe) { + chip->xxv[5] = (*x++ & AYMO_YM7128_SIGNAL_MASK); + + vsfence(); + vi16x8_t xx = vload(chip->xxv); + chip->xxv[6] = t0; // t0d = t0 + xx = vmulhrs(xx, chip->kk1); + xx = vaddsi(xx, vrevv(xx)); + xx = vmulhrs(xx, chip->kk2); + xx = vand(xx, vcmpgt(chip->kkm, xx)); + xx = vaddsi(xx, vrev64q_s16(xx)); + vstore(vv, xx); + vi16x8_t ti = vload(chip->tiv); + vi16x8_t tj = vsub(ti, vset1(-1)); + vi16x8_t tm = vcmpgt(vset1(AYMO_YM7128_DELAY_LENGTH - 1), ti); // tj < DL + vstore(chip->tiv, vand(tj, tm)); + vsfence(); + + chip->xxv[0] = vv[0]; // ti0' + int16_t hj = vv[1]; + chip->xxv[1] = hj; // hi' + int16_t u = vv[5]; + chip->uh[hj] = u; + int16_t AYMO_ALIGN_V128 tuv[8]; + for (unsigned i = 0u; i < 8u; ++i) { + tuv[i] = chip->uh[chip->tiv[i]]; + } + vsfence(); + vi16x8_t tu = vload(tuv); + + vi16x8_t gl = vmulhrs(tu, chip->kgl); + vi16x8_t gr = vmulhrs(tu, chip->kgr); + vi32x4_t ggl = vpaddlq_s16(gl); + vi32x4_t ggr = vpaddlq_s16(gr); + ggl = vvadd(ggl, vvext(ggl, ggl, 2)); + ggr = vvadd(ggr, vvext(ggr, ggr, 2)); + ggl = vvadd(ggl, vvrev(ggl)); + ggr = vvadd(ggr, vvrev(ggr)); + vi16x8_t ggrl = vvpacks(ggr, ggl); + vi16x8_t gglr = vext(ggrl, ggrl, 1); + vi16x8_t vlr = vmulhrs(gglr, chip->kv); + + vi16x8_t zc = chip->zc; + vi16x8_t zb = chip->zb; + zc = vext(zb, zc, 6); // '543210..' + chip->zc = zc; + + vi16x8_t y1 = vmulhrs(zc, chip->kf); + vi16x8_t y0 = vmulhrs(zc, chip->ke); + + vi16x8_t za = chip->za; + zb = vext(za, zb, 6); // '543210..' + chip->zb = zb; + + y1 = vaddsi(y1, vmulhrs(zb, chip->kd)); + y0 = vaddsi(y0, vmulhrs(zb, chip->kc)); + + za = vext(vlr, za, 6); // '543210..' 
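// Output oversampler: ka..kf hold the 19-tap interpolation kernel split into
// even/odd polyphase branches, so y0 and y1 accumulate the two 2x-oversampled
// output frames produced for every input sample.  za/zb/zc form a sliding
// history of recent wet stereo samples, advanced by one (L, R) pair per tick.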
+ chip->za = za; + + y1 = vaddsi(y1, vmulhrs(za, chip->kb)); + y0 = vaddsi(y0, vmulhrs(za, chip->ka)); + + vi16x4_t yy0 = vqadd_s16(vgetlo(y0), vgethi(y0)); + vi16x4_t yy1 = vqadd_s16(vgetlo(y1), vgethi(y1)); + yy0 = vqadd_s16(yy0, vext_s16(yy0, yy0, 2)); + yy1 = vqadd_s16(yy1, vext_s16(yy1, yy1, 2)); + + vi16x4_t yy = vext_s16(yy0, yy1, 2); + yy = vand_s16(yy, vdup_n_s16((int16_t)AYMO_YM7128_SIGNAL_MASK)); + vst1_s16(y, yy); y += 4u; + + ti0 = chip->xxv[0]; + t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_ym7128_common.c b/src/aymo_ym7128_common.c new file mode 100644 index 0000000..e5482b4 --- /dev/null +++ b/src/aymo_ym7128_common.c @@ -0,0 +1,192 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_ym7128_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +#define PGAIN(x) ((int16_t)((double)(x) * (double)AYMO_YM7128_GAIN_UNIT) \ + & (int16_t)AYMO_YM7128_GAIN_MASK) + +#define NGAIN(x) ((int16_t)(~(int32_t)PGAIN(x) & (int32_t)0xFFFF)) // pseudo-negative + +const int16_t aymo_ym7128_gain[64u] = +{ + NGAIN(0.000000000000000000), // -oo dB- + NGAIN(0.001000000000000000), // -60 dB- + NGAIN(0.001258925411794167), // -58 dB- + NGAIN(0.001584893192461114), // -56 dB- + NGAIN(0.001995262314968879), // -54 dB- + NGAIN(0.002511886431509579), // -52 dB- + NGAIN(0.003162277660168379), // -50 dB- + NGAIN(0.003981071705534973), // -48 dB- + NGAIN(0.005011872336272725), // -46 dB- + NGAIN(0.006309573444801930), // -44 dB- + NGAIN(0.007943282347242814), // -42 dB- + NGAIN(0.010000000000000000), // -40 dB- + NGAIN(0.012589254117941675), // -38 dB- + NGAIN(0.015848931924611134), // -36 dB- + NGAIN(0.019952623149688799), // -34 dB- + NGAIN(0.025118864315095794), // -32 dB- + NGAIN(0.031622776601683791), // -30 dB- + NGAIN(0.039810717055349734), // -28 dB- + NGAIN(0.050118723362727220), // -26 dB- + NGAIN(0.063095734448019331), // -24 dB- + NGAIN(0.079432823472428138), // -22 dB- + NGAIN(0.100000000000000006), // -20 dB- + NGAIN(0.125892541179416728), // -18 dB- + NGAIN(0.158489319246111343), // -16 dB- + NGAIN(0.199526231496887974), // -14 dB- + NGAIN(0.251188643150958013), // -12 dB- + NGAIN(0.316227766016837941), // -10 dB- + NGAIN(0.398107170553497203), // - 8 dB- + NGAIN(0.501187233627272244), // - 6 dB- + NGAIN(0.630957344480193250), // - 4 dB- + NGAIN(0.794328234724281490), // - 2 dB- + NGAIN(1.000000000000000000), // - 0 dB- + + PGAIN(0.000000000000000000), // -oo dB+ + PGAIN(0.001000000000000000), // -60 dB+ + PGAIN(0.001258925411794167), // -58 dB+ + PGAIN(0.001584893192461114), // -56 dB+ + PGAIN(0.001995262314968879), // -54 dB+ + PGAIN(0.002511886431509579), // -52 dB+ + PGAIN(0.003162277660168379), // -50 dB+ + PGAIN(0.003981071705534973), // -48 dB+ + PGAIN(0.005011872336272725), // -46 dB+ + PGAIN(0.006309573444801930), // -44 dB+ + 
PGAIN(0.007943282347242814), // -42 dB+ + PGAIN(0.010000000000000000), // -40 dB+ + PGAIN(0.012589254117941675), // -38 dB+ + PGAIN(0.015848931924611134), // -36 dB+ + PGAIN(0.019952623149688799), // -34 dB+ + PGAIN(0.025118864315095794), // -32 dB+ + PGAIN(0.031622776601683791), // -30 dB+ + PGAIN(0.039810717055349734), // -28 dB+ + PGAIN(0.050118723362727220), // -26 dB+ + PGAIN(0.063095734448019331), // -24 dB+ + PGAIN(0.079432823472428138), // -22 dB+ + PGAIN(0.100000000000000006), // -20 dB+ + PGAIN(0.125892541179416728), // -18 dB+ + PGAIN(0.158489319246111343), // -16 dB+ + PGAIN(0.199526231496887974), // -14 dB+ + PGAIN(0.251188643150958013), // -12 dB+ + PGAIN(0.316227766016837941), // -10 dB+ + PGAIN(0.398107170553497203), // - 8 dB+ + PGAIN(0.501187233627272244), // - 6 dB+ + PGAIN(0.630957344480193250), // - 4 dB+ + PGAIN(0.794328234724281490), // - 2 dB+ + PGAIN(1.000000000000000000) // - 0 dB+ +}; + + +#define TAP(i) ((int16_t)(((i) * (AYMO_YM7128_DELAY_LENGTH - 1)) / (AYMO_YM7128_TAP_COUNT - 1))) + +const int16_t aymo_ym7128_tap[32u] = +{ + TAP( 0), // 0.0 ms + TAP( 1), // 3.2 ms + TAP( 2), // 6.5 ms + TAP( 3), // 9.7 ms + TAP( 4), // 12.9 ms + TAP( 5), // 16.1 ms + TAP( 6), // 19.3 ms + TAP( 7), // 22.6 ms + TAP( 8), // 25.8 ms + TAP( 9), // 29.0 ms + TAP(10), // 32.3 ms + TAP(11), // 35.5 ms + TAP(12), // 38.7 ms + TAP(13), // 41.9 ms + TAP(14), // 45.2 ms + TAP(15), // 48.4 ms + TAP(16), // 51.6 ms + TAP(17), // 54.9 ms + TAP(18), // 58.1 ms + TAP(19), // 61.3 ms + TAP(20), // 64.5 ms + TAP(21), // 67.8 ms + TAP(22), // 71.0 ms + TAP(23), // 74.2 ms + TAP(24), // 77.4 ms + TAP(25), // 80.7 ms + TAP(26), // 83.9 ms + TAP(27), // 87.1 ms + TAP(28), // 90.4 ms + TAP(29), // 93.6 ms + TAP(30), // 96.8 ms + TAP(31) // 100.0 ms +}; + + +#undef KERNEL +#define KERNEL(x) ((int16_t)((double)(x) * (double)AYMO_YM7128_GAIN_UNIT) \ + & (int16_t)AYMO_YM7128_GAIN_MASK) + +const int16_t aymo_ym7128_kernel_linear[19u] = +{ + KERNEL(+0.005969087803865891), + KERNEL(-0.003826518613910499), + KERNEL(-0.016623943725986926), + KERNEL(+0.007053928712894589), + KERNEL(+0.038895802111020034), + KERNEL(-0.010501507751597486), + KERNEL(-0.089238395139830201), + KERNEL(+0.013171814880420758), + KERNEL(+0.312314472963171053), + KERNEL(+0.485820312497107776), + KERNEL(+0.312314472963171053), + KERNEL(+0.013171814880420758), + KERNEL(-0.089238395139830201), + KERNEL(-0.010501507751597486), + KERNEL(+0.038895802111020034), + KERNEL(+0.007053928712894589), + KERNEL(-0.016623943725986926), + KERNEL(-0.003826518613910499), + KERNEL(+0.005969087803865891) +}; + + +const int16_t aymo_ym7128_kernel_minphase[19u] = +{ + KERNEL(+0.073585247514714749), + KERNEL(+0.269340051166713890), + KERNEL(+0.442535202999738531), + KERNEL(+0.350129745841520346), + KERNEL(+0.026195691646307945), + KERNEL(-0.178423532471468610), + KERNEL(-0.081176763571493171), + KERNEL(+0.083194010466739091), + KERNEL(+0.067960765530891545), + KERNEL(-0.035840063980478287), + KERNEL(-0.044393769145659796), + KERNEL(+0.013156688603347873), + KERNEL(+0.023451305043275420), + KERNEL(-0.004374029821991059), + KERNEL(-0.009480786001493536), + KERNEL(+0.002700502551912207), + KERNEL(+0.003347671274177581), + KERNEL(-0.002391896275498628), + KERNEL(+0.000483958628744376) +}; + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128_none.c b/src/aymo_ym7128_none.c new file mode 100644 index 0000000..7d7dc48 --- /dev/null +++ b/src/aymo_ym7128_none.c @@ -0,0 +1,130 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. 
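// Note on the tables above: aymo_ym7128_gain[] is indexed by the 6-bit gain
// registers, with bit 5 selecting the phase and bits 0..4 the level (entries 0
// and 32 are mute, then -60 dB .. 0 dB in 2 dB steps).  Negative-phase entries
// are stored as the bitwise complement of the positive ones ("pseudo-negative"),
// which presumably lets the SIMD code apply the sign without a separate mask.
// aymo_ym7128_tap[] spreads the 32 tap positions linearly across the delay
// line, i.e. roughly 0 .. 100 ms of delay.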
+ +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_ym7128_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_none.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ym7128_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ym7128_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ym7128_ctor_f)&(aymo_(ctor)), + (aymo_ym7128_dtor_f)&(aymo_(dtor)), + (aymo_ym7128_read_f)&(aymo_(read)), + (aymo_ym7128_write_f)&(aymo_(write)), + (aymo_ym7128_process_i16_f)&(aymo_(process_i16)) +}; + + +const struct aymo_ym7128_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + YM7128B_ChipFixed* emu = &chip->emu; + YM7128B_ChipFixed_Ctor(emu); + YM7128B_ChipFixed_Reset(emu); + + // Initialize as pass-through + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_GL1, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_GR1, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_VM, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_VL, 0x3Fu); + YM7128B_ChipFixed_Write(emu, (YM7128B_Address)YM7128B_Reg_VR, 0x3Fu); + + YM7128B_ChipFixed_Start(emu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + assert(chip); + + YM7128B_ChipFixed* emu = &chip->emu; + YM7128B_ChipFixed_Stop(emu); + YM7128B_ChipFixed_Dtor(emu); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address <= (uint16_t)YM7128B_Address_Max) { + return YM7128B_ChipFixed_Read(&chip->emu, (YM7128B_Address)address); + } + return 0x00u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)YM7128B_Address_Max) { + YM7128B_ChipFixed_Write(&chip->emu, (YM7128B_Address)address, value); + } +} + + +void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(x); + assert(y); + if AYMO_UNLIKELY(!count) return; + + YM7128B_ChipFixed* emu = &chip->emu; + YM7128B_ChipFixed_Process_Data data; + + const int16_t* xe = &x[count]; + + while AYMO_LIKELY(x != xe) { + data.inputs[YM7128B_InputChannel_Mono] = *x++; + + YM7128B_ChipFixed_Process(emu, &data); + + for (int k = 0; k < YM7128B_Oversampler_Factor; ++k) { + for (int c = 0; c < YM7128B_OutputChannel_Count; ++c) { + *y++ = data.outputs[c][k]; + } + } + } +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ym7128_x86_sse41.c b/src/aymo_ym7128_x86_sse41.c new file mode 100644 index 0000000..67f7a0a --- /dev/null +++ b/src/aymo_ym7128_x86_sse41.c @@ -0,0 +1,270 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo_cpu.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_cpu_x86_sse41_inline.h" +#include "aymo_ym7128_common.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_x86_sse41.h" + +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ym7128_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ym7128_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ym7128_ctor_f)&(aymo_(ctor)), + (aymo_ym7128_dtor_f)&(aymo_(dtor)), + (aymo_ym7128_read_f)&(aymo_(read)), + (aymo_ym7128_write_f)&(aymo_(write)), + (aymo_ym7128_process_i16_f)&(aymo_(process_i16)) +}; + + +const struct aymo_ym7128_vt* aymo_(get_vt)(void) +{ + return &aymo_(vt); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything + aymo_memset(chip, 0, sizeof(struct aymo_(chip))); + + // Initialize input stage coefficients (-1 as a placeholder for computed values) + chip->xxv[2] = 1; + chip->xxv[3] = 1; + + chip->kk1 = vseta(0, -1, -1, -1, -0x8000, -0x8000, -0x8000, -0x8000); + chip->kk2 = vseta(0, -1, -0x8000, 0, 0, 0, 0x8000, -0x8000); + chip->kkm = vseta(0, 0x7FFF, 0x7FFF, 0, 0, 0, AYMO_YM7128_DELAY_LENGTH, AYMO_YM7128_DELAY_LENGTH); + + // Initialize oversampler coefficients + const int16_t* k = aymo_ym7128_kernel_linear; + chip->ka = vseta(k[ 6], k[ 6], k[ 4], k[ 4], k[ 2], k[ 2], k[ 0], k[ 0]); + chip->kb = vseta(k[ 7], k[ 7], k[ 5], k[ 5], k[ 3], k[ 3], k[ 1], k[ 1]); + chip->kc = vseta(k[14], k[14], k[12], k[12], k[10], k[10], k[ 8], k[ 8]); + chip->kd = vseta(k[15], k[15], k[13], k[13], k[11], k[11], k[ 9], k[ 9]); + chip->ke = vseta( 0, 0, 0, 0, k[18], k[18], k[16], k[16]); + chip->kf = vseta( 0, 0, 0, 0, 0, 0, k[17], k[17]); + + // Initialize as pass-through + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gl1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_gr1, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vm, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vl, 0x3Fu); + aymo_(write)(chip, (uint16_t)aymo_ym7128_reg_vr, 0x3Fu); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + assert(chip); + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + return chip->regs[address]; + } + return 0x00u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address <= (uint16_t)aymo_ym7128_reg_gl8) { + value &= 0x3Fu; + int16_t gl = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gl1); + chip->kgl = vinsertn(chip->kgl, gl, i); + } + else if (address <= (uint16_t)aymo_ym7128_reg_gr8) { + value &= 0x3Fu; + int16_t gr = aymo_ym7128_gain[value]; + int i = (int)(address - (uint16_t)aymo_ym7128_reg_gr1); + chip->kgr = vinsertn(chip->kgr, gr, i); + } + else if (address <= 
(uint16_t)aymo_ym7128_reg_vr) { + value &= 0x3Fu; + int16_t v = aymo_ym7128_gain[value]; + if (address == (uint16_t)aymo_ym7128_reg_vm) { + chip->kk1 = vinsert(chip->kk1, -v, 5); + } + else if (address == (uint16_t)aymo_ym7128_reg_vc) { + chip->kk2 = vinsert(chip->kk2, v, 6); + } + else if (address == (uint16_t)aymo_ym7128_reg_vl) { + chip->kv = vinsert(chip->kv, v, 6); + } + else { + chip->kv = vinsert(chip->kv, v, 7); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_c1) { + value &= 0x3Fu; + int16_t v = ((int16_t)value << (16 - AYMO_YM7128_COEFF_BITS)); + if (address == (uint16_t)aymo_ym7128_reg_c0) { + chip->kk1 = vinsert(chip->kk1, v, 4); + } + else { + chip->kk1 = vinsert(chip->kk1, v, 6); + } + } + else if (address <= (uint16_t)aymo_ym7128_reg_t8) { + value &= 0x1Fu; + int16_t t = aymo_ym7128_tap[value]; + int16_t hi = chip->xxv[1]; // hi + t = (hi - t); + if (t < 0) { + t += AYMO_YM7128_DELAY_LENGTH; + } + if (address == (uint16_t)aymo_ym7128_reg_t0) { + chip->xxv[0] = t; // ti0 + } + else { + uint16_t i = (address - (uint16_t)aymo_ym7128_reg_t1); + chip->tiv[i] = t; + } + } + + if (address < (uint16_t)AYMO_YM7128_REG_COUNT) { + chip->regs[address] = value; + } +} + + +void aymo_(process_i16)(struct aymo_(chip)* chip, uint32_t count, const int16_t x[], int16_t y[]) +{ + assert(chip); + assert(x); + assert(y); + if AYMO_UNLIKELY(!count) return; + + int16_t AYMO_ALIGN_V128 vv[8] = {0}; + + int16_t ti0 = chip->xxv[0]; + int16_t t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + + const int16_t* xe = &x[count]; + + while AYMO_LIKELY(x != xe) { + chip->xxv[5] = (*x++ & AYMO_YM7128_SIGNAL_MASK); + + vsfence(); + vi16x8_t xx = vload((void*)chip->xxv); + chip->xxv[6] = t0; // t0d = t0 + xx = vmulhrs(xx, chip->kk1); + xx = vaddsi(xx, vvshuffle(xx, KSHUFFLE(2, 3, 0, 1))); // "2301" + xx = vmulhrs(xx, chip->kk2); + xx = vand(xx, vcmpgt(chip->kkm, xx)); + xx = vaddsi(xx, valignr(xx, xx, 2)); + vstore((void*)vv, xx); + vi16x8_t ti = vload((void*)chip->tiv); + vi16x8_t tj = vsub(ti, vset1(-1)); + vi16x8_t tm = vcmpgt(vset1(AYMO_YM7128_DELAY_LENGTH - 1), ti); // tj < DL + vstore((void*)chip->tiv, vand(tj, tm)); + vsfence(); + + chip->xxv[0] = vv[7]; // ti0' + int16_t hj = vv[1]; + chip->xxv[1] = hj; // hi' + int16_t u = vv[5]; + chip->uh[hj] = u; + int16_t AYMO_ALIGN_V128 tuv[8]; + for (unsigned i = 0u; i < 8u; ++i) { + tuv[i] = chip->uh[chip->tiv[i]]; + } + vsfence(); + vi16x8_t tu = vload((void*)tuv); + + vi16x8_t gl = vmulhrs(tu, chip->kgl); + vi16x8_t gr = vmulhrs(tu, chip->kgr); + vi32x4_t ggl = vmadd(gl, vset1(1)); + vi32x4_t ggr = vmadd(gr, vset1(1)); + ggl = vvadd(ggl, vvshuffle(ggl, KSHUFFLE(1, 0, 3, 2))); // "1032" + ggr = vvadd(ggr, vvshuffle(ggr, KSHUFFLE(1, 0, 3, 2))); // "1032" + ggl = vvadd(ggl, vvshuffle(ggl, KSHUFFLE(2, 3, 0, 1))); // "2301" + ggr = vvadd(ggr, vvshuffle(ggr, KSHUFFLE(2, 3, 0, 1))); // "2301" + vi16x8_t ggrl = vvpacks(ggr, ggl); + vi16x8_t gglr = valignr(ggrl, ggrl, 2); + vi16x8_t vlr = vmulhrs(gglr, chip->kv); + + vi16x8_t zc = chip->zc; + vi16x8_t zb = chip->zb; + zc = valignr(zc, zb, 12); // '543210..' + chip->zc = zc; + + vi16x8_t y1 = vmulhrs(zc, chip->kf); + vi16x8_t y0 = vmulhrs(zc, chip->ke); + + vi16x8_t za = chip->za; + zb = valignr(zb, za, 12); // '543210..' + chip->zb = zb; + + y1 = vaddsi(y1, vmulhrs(zb, chip->kd)); + y0 = vaddsi(y0, vmulhrs(zb, chip->kc)); + + za = valignr(za, vlr, 12); // '543210..' 
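// The gl/gr products above scale the eight delay-line taps by the GL/GR gain
// registers; the madd-by-one and the two shuffles then reduce them to a single
// left and a single right sum, which kv scales into the wet stereo pair vlr.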
+ chip->za = za; + + y1 = vaddsi(y1, vmulhrs(za, chip->kb)); + y0 = vaddsi(y0, vmulhrs(za, chip->ka)); + + y0 = vaddsi(y0, vvshuffle(y0, KSHUFFLE(1, 0, 3, 2))); // "1032" + y1 = vaddsi(y1, vvshuffle(y1, KSHUFFLE(1, 0, 3, 2))); // "1032" + y0 = vaddsi(y0, vvshuffle(y0, KSHUFFLE(2, 3, 0, 1))); // "2301" + y1 = vaddsi(y1, vvshuffle(y1, KSHUFFLE(2, 3, 0, 1))); // "2301" + + vi16x8_t yy = vblendi(y0, y1, 0xCC); // '1100''1100' + yy = vand(yy, vset1((int16_t)AYMO_YM7128_SIGNAL_MASK)); + vstorelo((void*)y, yy); y += 4u; + + ti0 = chip->xxv[0]; + t0 = chip->uh[ti0]; + chip->xxv[4] = t0; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/src/aymo_ymf262.c b/src/aymo_ymf262.c new file mode 100644 index 0000000..c2f707c --- /dev/null +++ b/src/aymo_ymf262.c @@ -0,0 +1,250 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include +#include "aymo_cpu.h" +#include "aymo_ymf262.h" +#include "aymo_ymf262_arm_neon.h" +#include "aymo_ymf262_none.h" +#include "aymo_ymf262_x86_sse41.h" +#include "aymo_ymf262_x86_avx.h" +#include "aymo_ymf262_x86_avx2.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +static const struct aymo_ymf262_vt* aymo_ymf262_best_vt; + + +void aymo_ymf262_boot(void) +{ + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + aymo_ymf262_best_vt = aymo_ymf262_x86_avx2_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_AVX + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX) { + aymo_ymf262_best_vt = aymo_ymf262_x86_avx_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + aymo_ymf262_best_vt = aymo_ymf262_x86_sse41_get_vt(); + return; + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + aymo_ymf262_best_vt = aymo_ymf262_arm_neon_get_vt(); + return; + } + #endif + + aymo_ymf262_best_vt = aymo_ymf262_none_get_vt(); +} + + +const struct aymo_ymf262_vt* aymo_ymf262_get_vt(const char* cpu_ext) +{ + if (cpu_ext == NULL) { + return NULL; + } + + #ifdef AYMO_CPU_SUPPORT_X86_AVX2 + if (!aymo_strcmp(cpu_ext, "x86_avx2")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX2) { + return aymo_ymf262_x86_avx2_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_AVX + if (!aymo_strcmp(cpu_ext, "x86_avx")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_AVX) { + return aymo_ymf262_x86_avx_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_X86_SSE41 + if (!aymo_strcmp(cpu_ext, "x86_sse41")) { + if (aymo_cpu_x86_get_extensions() & AYMO_CPU_X86_EXT_SSE41) { + return aymo_ymf262_x86_sse41_get_vt(); + } + } + #endif + + #ifdef AYMO_CPU_SUPPORT_ARM_NEON + if (!aymo_strcmp(cpu_ext, "arm_neon")) { + if (aymo_cpu_arm_get_extensions() & AYMO_CPU_ARM_EXT_NEON) { + return aymo_ymf262_arm_neon_get_vt(); + } + } + #endif + 
if (!aymo_strcmp(cpu_ext, "none")) { + return aymo_ymf262_none_get_vt(); + } + return NULL; +} + + +const struct aymo_ymf262_vt* aymo_ymf262_get_best_vt(void) +{ + return aymo_ymf262_best_vt; +} + + +uint32_t aymo_ymf262_get_sizeof(struct aymo_ymf262_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_sizeof); + + return chip->vt->get_sizeof(); +} + + +void aymo_ymf262_ctor(struct aymo_ymf262_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->ctor); + + chip->vt->ctor(chip); +} + + +void aymo_ymf262_dtor(struct aymo_ymf262_chip* chip) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->dtor); + + chip->vt->dtor(chip); +} + + +uint8_t aymo_ymf262_read(struct aymo_ymf262_chip* chip, uint16_t address) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->read); + + return chip->vt->read(chip, address); +} + + +void aymo_ymf262_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->write); + + chip->vt->write(chip, address, value); +} + + +int aymo_ymf262_enqueue_write(struct aymo_ymf262_chip* chip, uint16_t address, uint8_t value) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->enqueue_write); + + return chip->vt->enqueue_write(chip, address, value); +} + + +int aymo_ymf262_enqueue_delay(struct aymo_ymf262_chip* chip, uint32_t count) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->enqueue_delay); + + return chip->vt->enqueue_delay(chip, count); +} + + +int16_t aymo_ymf262_get_output(struct aymo_ymf262_chip* chip, uint8_t channel) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->get_output); + + return chip->vt->get_output(chip, channel); +} + + +void aymo_ymf262_tick(struct aymo_ymf262_chip* chip, uint32_t count) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->tick); + + chip->vt->tick(chip, count); +} + + +void aymo_ymf262_generate_i16x2(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_i16x2); + + chip->vt->generate_i16x2(chip, count, y); +} + + +void aymo_ymf262_generate_i16x4(struct aymo_ymf262_chip* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_i16x4); + + chip->vt->generate_i16x4(chip, count, y); +} + + +void aymo_ymf262_generate_f32x2(struct aymo_ymf262_chip* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_f32x2); + + chip->vt->generate_f32x2(chip, count, y); +} + + +void aymo_ymf262_generate_f32x4(struct aymo_ymf262_chip* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(chip->vt); + assert(chip->vt->generate_f32x4); + + chip->vt->generate_f32x4(chip, count, y); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ymf262_arm_neon.c b/src/aymo_ymf262_arm_neon.c new file mode 100644 index 0000000..52a8f57 --- /dev/null +++ b/src/aymo_ymf262_arm_neon.c @@ -0,0 +1,1688 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
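The YMF262 dispatcher above (src/aymo_ymf262.c) follows the same vtable pattern; a hypothetical rendering snippet is sketched below. The direct chip->vt assignment, the example register write (0x105 is the OPL3 "NEW" bit on the real chip), and the assumption that generate_i16x2 emits two interleaved int16 samples per tick are illustrative, not guaranteed by this patch.

#include <stdlib.h>
#include "aymo_ymf262.h"

// Illustrative sketch: render `frames` stereo samples from a freshly built chip.
static int16_t* opl3_render(uint32_t frames)
{
    aymo_ymf262_boot();  // assumes CPU feature detection is already initialized
    const struct aymo_ymf262_vt* vt = aymo_ymf262_get_best_vt();

    struct aymo_ymf262_chip* chip = malloc(vt->get_sizeof());
    if (chip == NULL) {
        return NULL;
    }
    chip->vt = vt;  // assumed to be the caller's responsibility
    aymo_ymf262_ctor(chip);

    aymo_ymf262_write(chip, 0x105u, 0x01u);  // e.g. enable OPL3 ("NEW") mode

    int16_t* y = malloc((size_t)frames * 2u * sizeof(int16_t));
    if (y != NULL) {
        aymo_ymf262_generate_i16x2(chip, frames, y);
    }

    aymo_ymf262_dtor(chip);
    free(chip);
    return y;
}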
See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include +#include "aymo_cpu_arm_neon_inline.h" +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_arm_neon.h" + +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[8] = +{ + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[8] = +{ + 0, 1, 2, 3, 0, 1, 2, 3 +}; + + +const int16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 3) | (1 << 2) | (1 << 1) | (0 << 0)), + ((1 << 3) | (0 << 2) | (0 << 1) | (0 << 0)), + ((1 << 3) | (1 << 2) | (0 << 1) | (0 << 0)), + ((0 << 3) | (0 << 2) | (0 << 1) | (0 << 0)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 0, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 0, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 0, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 0, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 0, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 3, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } + }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8, + 0xF8, + 0xF8, + 0xFF, + 0xF8, + 0xFF, + 0xF8, + 0xFF +}; + + +// Slot mask output delay for outputs B and D +const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0x88, + 0xF8, + 0x88, + 0xF8, + 0x88, + 0xFF, + 0x88, + 0xFF +}; + + +// Updates phase generator 
+static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + vi16_t range = vand(fnum, vset1(7 << 7)); + range = vand(sg->pg_vib, vsllv(range, chip->pg_vib_shs)); + range = vmullo(range, chip->pg_vib_sign); + fnum = vadd(fnum, range); + + vi32_t fnum_lo = vunpacklo(fnum); + vi32_t fnum_hi = vunpackhi(fnum); + vi32_t block_sll_lo = vunpacklo(cg->pg_block); + vi32_t block_sll_hi = vunpackhi(cg->pg_block); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 1); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 1); + + sg->pg_phase_out = phase; + } +} + + +// Updates rhythm manager, slot group 3 +static inline +void aymo_(rm_update_sg3)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[3]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) 
| + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + vi16_t phase = sg->pg_phase_out; + + // Update SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 1); + + // Update TC + uint32_t phase17 = vextract(phase, 2); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 2); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpack(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_shl = vblendv(vset1(AYMO_(EG_GEN_SHL_ATTACK)), sg->eg_gen_shl, notreset); + vi16_t reg_rate = vsllv(sg->eg_adsr, eg_gen_shl); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset); + vi32_t notreset_hi = vunpackhi(notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vu2i(vsubsu(vi2u(vset1(15)), vi2u(eg_shift)))); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vadd(sg->wg_out, sg->wg_prout); + vi16_t fbsum_sh = vsllv(fbsum, sg->wg_fb_shs); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vi16_t incstep_ge12 = vand(vsrlv(chip->eg_incstep, rate_lo), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = vsllv(phase, sg->wg_phase_shl); + vi16_t phase_gate = vcmpz(vand(phase_sped, 
sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg->eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg->eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg->eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_shl = vslli(eg_gen, 2); + +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = 
modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi32x4_t sum_a = vpaddlq_s16(chip->og_acc_a); + vi32x4_t sum_b = vpaddlq_s16(chip->og_acc_b); + vi32x4_t sum_c = vpaddlq_s16(chip->og_acc_c); + vi32x4_t sum_d = vpaddlq_s16(chip->og_acc_d); + + sum_a = vaddq_s32(sum_a, vrev64q_s32(sum_a)); + sum_b = vaddq_s32(sum_b, vrev64q_s32(sum_b)); + sum_c = vaddq_s32(sum_c, vrev64q_s32(sum_c)); + sum_d = vaddq_s32(sum_d, vrev64q_s32(sum_d)); + + vi32x2_t tot_a = vadd_s32(vget_low_s32(sum_a), vget_high_s32(sum_a)); + vi32x2_t tot_b = vadd_s32(vget_low_s32(sum_b), vget_high_s32(sum_b)); + vi32x2_t tot_c = vadd_s32(vget_low_s32(sum_c), vget_high_s32(sum_c)); + vi32x2_t tot_d = vadd_s32(vget_low_s32(sum_d), vget_high_s32(sum_d)); + + vi32x2_t tot_ab = vext_s32(tot_a, tot_b, 1); + vi32x2_t tot_cd = vext_s32(tot_c, tot_d, 1); + vi16x4_t sat_abcd = vqmovn_s32(vcombine_s32(tot_ab, tot_cd)); + + vu16x4_t sel_old = vcreate_u16(0x0000FFFF0000FFFFuLL); + vi16x4_t out_abcd = vbsl_s16(sel_old, chip->og_old, sat_abcd); + + chip->og_out = out_abcd; + chip->og_old = sat_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_shs = -7; + int16_t pg_vib_sign = +1; + + if (!(vibpos & 3)) { + pg_vib_shs = +16; + } + else if (vibpos & 1) { + pg_vib_shs -= 1; + } + pg_vib_shs -= (int16_t)(uint16_t)chip->eg_vibshift; + + if (vibpos & 4) { + pg_vib_sign = -1; + } + chip->pg_vib_shs = vset1(pg_vib_shs); + chip->pg_vib_sign = vset1(pg_vib_sign); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + int16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vset1(eg_incstep); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 
0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 4 + sgi = 4; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 6 + sgi = 6; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg1)(chip); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg3)(chip); + + if AYMO_UNLIKELY(chip->process_all_slots) { + // Process slot group 5 + sgi = 5; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 7 + sgi = 7; + aymo_(sg_update)(chip, &chip->sg[sgi]); + } + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) { + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct 
aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = 
(chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, (vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void 
aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? (reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, 
aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = ®_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct 
aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, ®_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 
0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_shl, wave->wg_phase_shl, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, ®_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 0x30) 
& 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_shs = (reg_C0h->fb ? 
-(int16_t)(9u - reg_C0h->fb) : +16); + vinsertv(sg0->wg_fb_shs, fb_shs, sgo); + vinsertv(sg1->wg_fb_shs, fb_shs, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->wg_fb_shs = vset1(16); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_shl = vset1(AYMO_(EG_GEN_SHL_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_shl = vset1(wave->wg_phase_shl); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, 
address, value); + break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return vget_lane_s16(chip->og_out, 0); + case 1u: return vget_lane_s16(chip->og_out, 1); + case 2u: return vget_lane_s16(chip->og_out, 2); + case 3u: return vget_lane_s16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)(void*)y = vget_lane_s32(vreinterpret_s32_s16(chip->og_out), 0); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vst1_s16(y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t s32 = vmovl_s16(chip->og_out); + vf32x2_t f32 = vcvt_f32_s32(vget_low_s32(s32)); + vst1_f32(y, f32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t s32 = vmovl_s16(chip->og_out); + vf32x4_t f32 = vcvtq_f32_s32(s32); + vst1q_f32(y, f32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/src/aymo_ymf262_common.c b/src/aymo_ymf262_common.c new file mode 100644 index 0000000..5fbdc66 --- /dev/null +++ b/src/aymo_ymf262_common.c @@ -0,0 +1,263 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_ymf262_common.h" + +AYMO_CXX_EXTERN_C_BEGIN + + +// TODO: common tables + + +// Exponential look-up table +// Values are pre-multiplied by 2 +const int16_t AYMO_ALIGN(4) aymo_ymf262_exp_x2_table[256 + 4] = +{ + 0x0FF4, 0x0FEA, 0x0FDE, 0x0FD4, 0x0FC8, 0x0FBE, 0x0FB4, 0x0FA8, + 0x0F9E, 0x0F92, 0x0F88, 0x0F7E, 0x0F72, 0x0F68, 0x0F5C, 0x0F52, + 0x0F48, 0x0F3E, 0x0F32, 0x0F28, 0x0F1E, 0x0F14, 0x0F08, 0x0EFE, + 0x0EF4, 0x0EEA, 0x0EE0, 0x0ED4, 0x0ECA, 0x0EC0, 0x0EB6, 0x0EAC, + 0x0EA2, 0x0E98, 0x0E8E, 0x0E84, 0x0E7A, 0x0E70, 0x0E66, 0x0E5C, + 0x0E52, 0x0E48, 0x0E3E, 0x0E34, 0x0E2A, 0x0E20, 0x0E16, 0x0E0C, + 0x0E04, 0x0DFA, 0x0DF0, 0x0DE6, 0x0DDC, 0x0DD2, 0x0DCA, 0x0DC0, + 0x0DB6, 0x0DAC, 0x0DA4, 0x0D9A, 0x0D90, 0x0D88, 0x0D7E, 0x0D74, + 0x0D6A, 0x0D62, 0x0D58, 0x0D50, 0x0D46, 0x0D3C, 0x0D34, 0x0D2A, + 0x0D22, 0x0D18, 0x0D10, 0x0D06, 0x0CFE, 0x0CF4, 0x0CEC, 0x0CE2, + 0x0CDA, 0x0CD0, 0x0CC8, 0x0CBE, 0x0CB6, 0x0CAE, 0x0CA4, 0x0C9C, + 0x0C92, 0x0C8A, 0x0C82, 0x0C78, 0x0C70, 0x0C68, 0x0C60, 0x0C56, + 0x0C4E, 0x0C46, 0x0C3C, 0x0C34, 0x0C2C, 0x0C24, 0x0C1C, 0x0C12, + 0x0C0A, 0x0C02, 0x0BFA, 0x0BF2, 0x0BEA, 0x0BE0, 0x0BD8, 0x0BD0, + 0x0BC8, 0x0BC0, 0x0BB8, 0x0BB0, 0x0BA8, 0x0BA0, 0x0B98, 0x0B90, + 0x0B88, 0x0B80, 0x0B78, 0x0B70, 0x0B68, 0x0B60, 0x0B58, 0x0B50, + 0x0B48, 0x0B40, 0x0B38, 0x0B32, 0x0B2A, 0x0B22, 0x0B1A, 0x0B12, + 0x0B0A, 0x0B02, 0x0AFC, 0x0AF4, 0x0AEC, 0x0AE4, 0x0ADE, 0x0AD6, + 0x0ACE, 0x0AC6, 0x0AC0, 0x0AB8, 0x0AB0, 0x0AA8, 0x0AA2, 0x0A9A, + 0x0A92, 0x0A8C, 0x0A84, 0x0A7C, 0x0A76, 0x0A6E, 0x0A68, 0x0A60, + 0x0A58, 0x0A52, 0x0A4A, 0x0A44, 0x0A3C, 0x0A36, 0x0A2E, 0x0A28, + 0x0A20, 0x0A18, 0x0A12, 0x0A0C, 0x0A04, 0x09FE, 0x09F6, 0x09F0, + 0x09E8, 0x09E2, 0x09DA, 0x09D4, 0x09CE, 0x09C6, 0x09C0, 0x09B8, + 0x09B2, 0x09AC, 0x09A4, 0x099E, 0x0998, 0x0990, 0x098A, 0x0984, + 0x097C, 0x0976, 0x0970, 0x096A, 0x0962, 0x095C, 0x0956, 0x0950, + 0x0948, 0x0942, 0x093C, 0x0936, 0x0930, 0x0928, 0x0922, 0x091C, + 0x0916, 0x0910, 0x090A, 0x0904, 0x08FC, 0x08F6, 0x08F0, 0x08EA, + 0x08E4, 0x08DE, 0x08D8, 0x08D2, 0x08CC, 0x08C6, 0x08C0, 0x08BA, + 0x08B4, 0x08AE, 0x08A8, 0x08A2, 0x089C, 0x0896, 0x0890, 0x088A, + 0x0884, 0x087E, 0x0878, 0x0872, 0x086C, 0x0866, 0x0860, 0x085A, + 0x0854, 0x0850, 0x084A, 0x0844, 0x083E, 0x0838, 0x0832, 0x082C, + 0x0828, 0x0822, 0x081C, 0x0816, 0x0810, 0x080C, 0x0806, 0x0800, + 0x0800, 0x0800, 0x0800, 0x0800 +}; + + +// Logsin look-up table +const int16_t AYMO_ALIGN(4) aymo_ymf262_logsin_table[256 + 4] = +{ + 0x0859, 0x06C3, 0x0607, 0x058B, 0x052E, 0x04E4, 0x04A6, 0x0471, + 0x0443, 0x041A, 0x03F5, 0x03D3, 0x03B5, 0x0398, 0x037E, 0x0365, + 0x034E, 0x0339, 0x0324, 0x0311, 0x02FF, 0x02ED, 0x02DC, 0x02CD, + 0x02BD, 0x02AF, 0x02A0, 0x0293, 0x0286, 0x0279, 0x026D, 0x0261, + 0x0256, 0x024B, 0x0240, 0x0236, 0x022C, 0x0222, 0x0218, 0x020F, + 0x0206, 0x01FD, 0x01F5, 0x01EC, 0x01E4, 0x01DC, 0x01D4, 0x01CD, + 0x01C5, 0x01BE, 0x01B7, 0x01B0, 0x01A9, 0x01A2, 0x019B, 0x0195, + 0x018F, 0x0188, 0x0182, 0x017C, 0x0177, 0x0171, 0x016B, 0x0166, + 0x0160, 0x015B, 0x0155, 0x0150, 0x014B, 0x0146, 0x0141, 0x013C, + 0x0137, 0x0133, 0x012E, 0x0129, 0x0125, 0x0121, 0x011C, 0x0118, + 0x0114, 0x010F, 0x010B, 0x0107, 0x0103, 0x00FF, 0x00FB, 0x00F8, + 0x00F4, 0x00F0, 0x00EC, 0x00E9, 0x00E5, 0x00E2, 0x00DE, 0x00DB, + 0x00D7, 0x00D4, 0x00D1, 0x00CD, 0x00CA, 0x00C7, 0x00C4, 0x00C1, + 0x00BE, 0x00BB, 0x00B8, 0x00B5, 0x00B2, 0x00AF, 0x00AC, 0x00A9, + 0x00A7, 0x00A4, 0x00A1, 0x009F, 0x009C, 0x0099, 0x0097, 0x0094, + 0x0092, 0x008F, 0x008D, 0x008A, 0x0088, 0x0086, 0x0083, 0x0081, + 0x007F, 0x007D, 
0x007A, 0x0078, 0x0076, 0x0074, 0x0072, 0x0070, + 0x006E, 0x006C, 0x006A, 0x0068, 0x0066, 0x0064, 0x0062, 0x0060, + 0x005E, 0x005C, 0x005B, 0x0059, 0x0057, 0x0055, 0x0053, 0x0052, + 0x0050, 0x004E, 0x004D, 0x004B, 0x004A, 0x0048, 0x0046, 0x0045, + 0x0043, 0x0042, 0x0040, 0x003F, 0x003E, 0x003C, 0x003B, 0x0039, + 0x0038, 0x0037, 0x0035, 0x0034, 0x0033, 0x0031, 0x0030, 0x002F, + 0x002E, 0x002D, 0x002B, 0x002A, 0x0029, 0x0028, 0x0027, 0x0026, + 0x0025, 0x0024, 0x0023, 0x0022, 0x0021, 0x0020, 0x001F, 0x001E, + 0x001D, 0x001C, 0x001B, 0x001A, 0x0019, 0x0018, 0x0017, 0x0017, + 0x0016, 0x0015, 0x0014, 0x0014, 0x0013, 0x0012, 0x0011, 0x0011, + 0x0010, 0x000F, 0x000F, 0x000E, 0x000D, 0x000D, 0x000C, 0x000C, + 0x000B, 0x000A, 0x000A, 0x0009, 0x0009, 0x0008, 0x0008, 0x0007, + 0x0007, 0x0007, 0x0006, 0x0006, 0x0005, 0x0005, 0x0005, 0x0004, + 0x0004, 0x0004, 0x0003, 0x0003, 0x0003, 0x0002, 0x0002, 0x0002, + 0x0002, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000 +}; + + +// Word index to Slot index +const int8_t aymo_ymf262_word_to_slot[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 48, 18, 19, 20, 52, + 12, 13, 14, 56, 30, 31, 32, 60, + 3, 4, 5, 49, 21, 22, 23, 53, + 15, 16, 17, 57, 33, 34, 35, 61, + 6, 7, 8, 50, 24, 25, 26, 54, + 42, 43, 44, 58, 36, 37, 38, 62, + 9, 10, 11, 51, 27, 28, 29, 55, + 45, 46, 47, 59, 39, 40, 41, 63 +}; + +// Slot index to Word index +const int8_t aymo_ymf262_slot_to_word[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 16, 17, 18, 32, 33, + 34, 48, 49, 50, 8, 9, 10, 24, + 25, 26, 4, 5, 6, 20, 21, 22, + 36, 37, 38, 52, 53, 54, 12, 13, + 14, 28, 29, 30, 44, 45, 46, 60, + 61, 62, 40, 41, 42, 56, 57, 58, + 3, 19, 35, 51, 7, 23, 39, 55, + 11, 27, 43, 59, 15, 31, 47, 63 +}; + + +// Word index to Channel_2xOP index +const int8_t aymo_ymf262_word_to_ch2x[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 24, 9, 10, 11, 26, + 6, 7, 8, 28, 15, 16, 17, 30, + 0, 1, 2, 24, 9, 10, 11, 26, + 6, 7, 8, 28, 15, 16, 17, 30, + 3, 4, 5, 25, 12, 13, 14, 27, + 21, 22, 23, 29, 18, 19, 20, 31, + 3, 4, 5, 25, 12, 13, 14, 27, + 21, 22, 23, 29, 18, 19, 20, 31 +}; + +// Channel_2xOP index to Word index +const int8_t aymo_ymf262_ch2x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 2][2/* slot */] = +{ + { 0, 16 }, { 1, 17 }, { 2, 18 }, { 32, 48 }, + { 33, 49 }, { 34, 50 }, { 8, 24 }, { 9, 25 }, + { 10, 26 }, { 4, 20 }, { 5, 21 }, { 6, 22 }, + { 36, 52 }, { 37, 53 }, { 38, 54 }, { 12, 28 }, + { 13, 29 }, { 14, 30 }, { 44, 60 }, { 45, 61 }, + { 46, 62 }, { 40, 56 }, { 41, 57 }, { 42, 58 }, + { 3, 19 }, { 35, 51 }, { 7, 23 }, { 39, 55 }, + { 11, 27 }, { 43, 59 }, { 15, 31 }, { 47, 63 } +}; + + +// Word index to Channel_4xOP index +const int8_t aymo_ymf262_word_to_ch4x[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15, + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15, + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15, + 0, 1, 2, 12, 3, 4, 5, 13, + 6, 7, 8, 14, 9, 10, 11, 15 +}; + +// Channel_4xOP index to Word index +const int8_t aymo_ymf262_ch4x_to_word[AYMO_YMF262_SLOT_NUM_MAX / 4][4/* slot */] = +{ + { 0, 16, 32, 48 }, { 1, 17, 33, 49 }, + { 2, 18, 34, 50 }, { 4, 20, 36, 52 }, + { 5, 21, 37, 53 }, { 6, 22, 38, 54 }, + { 8, 24, 40, 56 }, { 9, 25, 41, 57 }, + { 10, 26, 42, 58 }, { 12, 28, 44, 60 }, + { 13, 29, 45, 61 }, { 14, 30, 46, 62 }, + { 3, 19, 35, 51 }, { 7, 23, 39, 55 }, + { 11, 27, 43, 59 }, { 15, 31, 47, 63 } +}; + +// Channel_4xOP index to Channel_2xOP index pairs +const 
int8_t aymo_ymf262_ch4x_to_pair[AYMO_YMF262_CHANNEL_NUM_MAX / 2][2/* slot */] = +{ + { 0, 3 }, { 1, 4 }, { 2, 5 }, + { 9, 12 }, { 10, 13 }, { 11, 14 }, + { 6, 21 }, { 7, 22 }, { 8, 23 }, + { 15, 18 }, { 16, 19 }, { 17, 20 }, + { 24, 25 }, { 26, 27 }, { 28, 29 }, { 30, 31 } +}; + +// Paired Channel_2xOP index +const int8_t aymo_ymf262_ch2x_paired[AYMO_YMF262_CHANNEL_NUM_MAX] = +{ + 3, 4, 5, + 0, 1, 2, + 21, 22, 23, + 12, 13, 14, + 9, 10, 11, + 18, 19, 20, + 15, 16, 17, + 6, 7, 8, + 25, 24, 27, 26, + 29, 28, 31, 30 +}; + + +// Sub-address to Slot index +const int8_t aymo_ymf262_subaddr_to_slot[AYMO_YMF262_SLOT_NUM_MAX] = +{ + 0, 1, 2, 3, 4, 5, 48, 49, + 6, 7, 8, 9, 10, 11, 50, 51, + 12, 13, 14, 15, 16, 17, 52, 53, + 36, 37, 38, 39, 40, 41, 54, 55, + + 18, 19, 20, 21, 22, 23, 56, 57, + 24, 25, 26, 27, 28, 29, 58, 59, + 30, 31, 32, 33, 34, 35, 60, 61, + 42, 43, 44, 45, 46, 47, 62, 63 +}; + + +// TODO: slot_to_addr[] + + +// Sub-addres to Channel_2xOP index +const int8_t aymo_ymf262_subaddr_to_ch2x[AYMO_YMF262_CHANNEL_NUM_MAX] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 18, 19, 20, 21, 22, 23, 24, + + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 25, 26, 27, 28, 29, 30, 31 +}; + + +// TODO: ch2x_to_addr[] + + +const int8_t aymo_ymf262_pg_mult_x2_table[16] = +{ + 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 20, 24, 24, 30, 30 +}; + + +const int8_t aymo_ymf262_eg_ksl_table[16] = +{ + 0, 32, 40, 45, 48, 51, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64 +}; + +const int8_t aymo_ymf262_eg_kslsh_table[4] = +{ + 8, 1, 2, 0 +}; + + +AYMO_CXX_EXTERN_C_END diff --git a/src/aymo_ymf262_none.c b/src/aymo_ymf262_none.c new file mode 100644 index 0000000..ce3c5e7 --- /dev/null +++ b/src/aymo_ymf262_none.c @@ -0,0 +1,200 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_none.h" + +#include "opl3.h" + +#include <assert.h> + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + OPL3_Reset(&chip->opl3, (uint32_t)AYMO_YMF262_SAMPLE_RATE); +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + OPL3_WriteReg(&chip->opl3, address, value); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + // not checked + OPL3_WriteRegBuffered(&chip->opl3, address, value); + return 1; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + // not supported + (void)chip; + (void)count; + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + if (channel < 4u) { + return chip->outs[channel]; + } + return 0; +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = chip->outs[0]; + y[1] = chip->outs[1]; + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = chip->outs[0]; + y[1] = chip->outs[1]; + y[2] = chip->outs[2]; + y[3] = chip->outs[3]; + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = (float)chip->outs[0]; + y[1] = (float)chip->outs[1]; + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + + while (count--) { + OPL3_Generate4Ch(&chip->opl3, chip->outs); + y[0] = (float)chip->outs[0]; + y[1] = (float)chip->outs[1]; + y[2] = (float)chip->outs[2]; + y[3] = (float)chip->outs[3]; + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END diff --git 
a/src/aymo_ymf262_x86_avx.c b/src/aymo_ymf262_x86_avx.c new file mode 100644 index 0000000..0bdcd88 --- /dev/null +++ b/src/aymo_ymf262_x86_avx.c @@ -0,0 +1,1691 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include +#include "aymo_cpu_x86_sse41_inline.h" // actually using SSE4.1 +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_avx.h" + +#ifdef AYMO_CPU_SUPPORT_X86_AVX + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[8] = +{ + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[8] = +{ + 0, 1, 2, 3, 0, 1, 2, 3 +}; + + +const uint16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (1 << 13)), + ((0 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (0 << 13)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 1, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 8, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } 
+ }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8, + 0xF8, + 0xF8, + 0xFF, + 0xF8, + 0xFF, + 0xF8, + 0xFF +}; + + +// Slot mask output delay for outputs B and D +const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0x88, + 0xF8, + 0x88, + 0xF8, + 0x88, + 0xFF, + 0x88, + 0xFF +}; + + +// Updates phase generator +static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + vi16_t range = vand(fnum, vset1(7 << 7)); + range = vmulihi(range, vand(sg->pg_vib, chip->pg_vib_mulhi)); + range = vsub(vxor(range, chip->pg_vib_neg), chip->pg_vib_neg); // flip sign + fnum = vadd(fnum, range); + + vi32_t zero = vsetz(); + vi32_t fnum_lo = vunpacklo(fnum, zero); + vi32_t fnum_hi = vunpackhi(fnum, zero); + vi32_t block_sll_lo = vunpacklo(cg->pg_block, zero); + vi32_t block_sll_hi = vunpackhi(cg->pg_block, zero); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2, zero); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2, zero); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 1); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 1); + + sg->pg_phase_out = phase; + } +} + + +// 
Updates rhythm manager, slot group 3 +static inline +void aymo_(rm_update_sg3)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[3]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + vi16_t phase = sg->pg_phase_out; + + // Update SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 1); + + // Update TC + uint32_t phase17 = vextract(phase, 2); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 2); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpackus(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_mullo = vblendv(vset1(AYMO_(EG_GEN_MULLO_ATTACK)), sg->eg_gen_mullo, notreset); + vi16_t reg_rate = vu2i(vmululo(vi2u(sg->eg_adsr), vi2u(eg_gen_mullo))); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset, notreset); + vi32_t notreset_hi = vunpackhi(notreset, notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vsubsu(vset1(15), eg_shift)); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vslli(vadd(sg->wg_out, sg->wg_prout), 1); + vi16_t fbsum_sh = vmulihi(fbsum, sg->wg_fb_mulhi); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = 
vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vu16_t rate_lo_muluhi = vi2u(vslli(vpow2m1lt4(rate_lo), 1)); + vi16_t incstep_ge12 = vand(vu2i(vmuluhi(chip->eg_incstep, rate_lo_muluhi)), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = vu2i(vmululo(vi2u(phase), sg->wg_phase_mullo)); + vi16_t phase_gate = vcmpz(vand(phase_sped, sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg_eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg_eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg_eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = 
vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_mullo = vsllv(vset1(1), vslli(eg_gen, 2)); + +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi16x8_t one = _mm_set1_epi16(1); + vi32x4_t tot_a = _mm_madd_epi16(chip->og_acc_a, one); + vi32x4_t tot_b = _mm_madd_epi16(chip->og_acc_b, one); + vi32x4_t tot_c = _mm_madd_epi16(chip->og_acc_c, one); + vi32x4_t tot_d = _mm_madd_epi16(chip->og_acc_d, one); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(2, 3, 0, 1))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(2, 3, 0, 1))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(2, 3, 0, 1))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(2, 3, 0, 1))); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(1, 0, 3, 2))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(1, 0, 3, 2))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(1, 0, 3, 2))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(1, 0, 3, 2))); + + vi32x4_t tot_ab = _mm_blend_epi16(tot_a, tot_b, 0xCC); + vi32x4_t tot_cd = _mm_blend_epi16(tot_c, tot_d, 0x33); + vi32x4_t tot_abcd = _mm_blend_epi16(tot_ab, tot_cd, 0xF0); + vi16x8_t sat_abcd = _mm_packs_epi32(tot_abcd, tot_abcd); + + vi16x8_t old_abcd = _mm_shuffle_epi32(chip->og_out, _MM_SHUFFLE(1, 0, 3, 2)); + vi16x8_t out_abcd = _mm_blend_epi16(old_abcd, sat_abcd, 0xF5); + + chip->og_out = out_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_mulhi = (0x10000 >> 7); + int16_t pg_vib_neg = 0; + + if (!(vibpos & 3)) { + pg_vib_mulhi = 0; + } + else if (vibpos & 
1) { + pg_vib_mulhi >>= 1; + } + pg_vib_mulhi >>= chip->eg_vibshift; + pg_vib_mulhi &= 0x7F80; + + if (vibpos & 4) { + pg_vib_neg = -1; + } + chip->pg_vib_mulhi = vset1(pg_vib_mulhi); + chip->pg_vib_neg = vset1(pg_vib_neg); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + uint16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vi2u(vset1((int16_t)eg_incstep)); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 4 + sgi = 4; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 6 + sgi = 6; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg1)(chip); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg3)(chip); + + if AYMO_UNLIKELY(chip->process_all_slots) { + // Process slot group 5 + sgi = 5; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 7 + sgi = 7; + aymo_(sg_update)(chip, &chip->sg[sgi]); + } + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = 
aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) { + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int 
word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % 
AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, (vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? 
(reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, 
aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = &reg_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, &reg_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + 
vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_mullo, wave->wg_phase_mullo, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, &reg_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 
0x30) & 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_mulhi = (reg_C0h->fb ? 
(0x0040 << reg_C0h->fb) : 0); + vinsertv(sg0->wg_fb_mulhi, fb_mulhi, sgo); + vinsertv(sg1->wg_fb_mulhi, fb_mulhi, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_mullo = vset1(AYMO_(EG_GEN_MULLO_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_mullo = vset1(wave->wg_phase_mullo); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, address, value); + 
break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return _mm_extract_epi16(chip->og_out, 0); + case 1u: return _mm_extract_epi16(chip->og_out, 1); + case 2u: return _mm_extract_epi16(chip->og_out, 2); + case 3u: return _mm_extract_epi16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)y = _mm_cvtsi128_si32(chip->og_out); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + _mm_storel_epi64((void*)y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_storel_pi((void*)y, vf32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_store_ps(y, vf32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX diff --git a/src/aymo_ymf262_x86_avx2.c b/src/aymo_ymf262_x86_avx2.c new file mode 100644 index 0000000..30e19e0 --- /dev/null +++ b/src/aymo_ymf262_x86_avx2.c @@ -0,0 +1,1683 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include +#include "aymo_cpu_x86_avx2_inline.h" +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_avx2.h" + +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[16] = +{ + 0, 0, 0, 0, 1, 1, 1, 1, + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[16] = +{ + 0, 1, 2, 3, 0, 1, 2, 3, + 4, 5, 6, 7, 4, 5, 6, 7 +}; + + +const uint16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (1 << 13)), + ((0 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (0 << 13)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 1, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 8, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } + }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint16_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8F8, + 0xFFF8, + 0xFFF8, + 0xFFF8 +}; + + +// Slot mask output delay for outputs B and D +const uint16_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF888, + 0xF888, + 0xFF88, + 0xFF88 +}; + + +// Updates phase generator +static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + 
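+    // Vibrato: FNUM bits 7..9 are scaled by the global vibrato depth
+    // (pg_vib_mulhi), gated per slot by pg_vib, and sign-flipped on the
+    // negative half of the vibrato cycle (pg_vib_neg) before being added
+    // back to FNUM. The result then goes through the block shift and the
+    // MULT scaling to form the per-slot phase increment (lo/hi halves).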
vi16_t range = vand(fnum, vset1(7 << 7)); + range = vmulihi(range, vand(sg->pg_vib, chip->pg_vib_mulhi)); + range = vsub(vxor(range, chip->pg_vib_neg), chip->pg_vib_neg); // flip sign + fnum = vadd(fnum, range); + + vi32_t zero = vsetz(); + vi32_t fnum_lo = vunpacklo(fnum, zero); + vi32_t fnum_hi = vunpackhi(fnum, zero); + vi32_t block_sll_lo = vunpacklo(cg->pg_block, zero); + vi32_t block_sll_hi = vunpackhi(cg->pg_block, zero); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2, zero); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2, zero); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 0 +static inline +void aymo_(rm_update_sg0)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[0]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 9); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 9); + + sg->pg_phase_out = phase; + } +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); 
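+        // The snare drum phase below is rebuilt from hi-hat bit 8 and the
+        // noise LFSR, while the top cymbal phase is rebuilt from the shared
+        // rm_xor term computed above.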
+ vi16_t phase = sg->pg_phase_out; + + // Update SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 9); + + // Update TC + uint32_t phase17 = vextract(phase, 10); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 10); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpackus(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_mullo = vblendv(vset1(AYMO_(EG_GEN_MULLO_ATTACK)), sg->eg_gen_mullo, notreset); + vi16_t reg_rate = vu2i(vmululo(vi2u(sg->eg_adsr), vi2u(eg_gen_mullo))); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset, notreset); + vi32_t notreset_hi = vunpackhi(notreset, notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vsubsu(vset1(15), eg_shift)); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vslli(vadd(sg->wg_out, sg->wg_prout), 1); + vi16_t fbsum_sh = vmulihi(fbsum, sg->wg_fb_mulhi); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vu16_t rate_lo_muluhi = vi2u(vslli(vpow2m1lt4(rate_lo), 1)); + vi16_t incstep_ge12 = vand(vu2i(vmuluhi(chip->eg_incstep, rate_lo_muluhi)), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = 
vu2i(vmululo(vi2u(phase), sg->wg_phase_mullo)); + vi16_t phase_gate = vcmpz(vand(phase_sped, sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg_eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg_eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg_eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_mullo = vsllv(vset1(1), vslli(eg_gen, 2)); 
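+    // The cached multiplier above equals (1 << (eg_gen * 4)): on the next
+    // tick it moves the ADSR rate nibble of the active envelope stage into
+    // the top nibble of reg_rate.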
+ +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi16x16_t one = _mm256_set1_epi16(1); + vi32x8_t sum_a = _mm256_madd_epi16(chip->og_acc_a, one); + vi32x8_t sum_b = _mm256_madd_epi16(chip->og_acc_b, one); + vi32x8_t sum_c = _mm256_madd_epi16(chip->og_acc_c, one); + vi32x8_t sum_d = _mm256_madd_epi16(chip->og_acc_d, one); + + vi32x4_t sum_a_lo = _mm256_castsi256_si128(sum_a); + vi32x4_t sum_a_hi = _mm256_extracti128_si256(sum_a, 1); + vi32x4_t tot_a = _mm_add_epi32(sum_a_lo, sum_a_hi); + + vi32x4_t sum_b_lo = _mm256_castsi256_si128(sum_b); + vi32x4_t sum_b_hi = _mm256_extracti128_si256(sum_b, 1); + vi32x4_t tot_b = _mm_add_epi32(sum_b_lo, sum_b_hi); + + vi32x4_t sum_c_lo = _mm256_castsi256_si128(sum_c); + vi32x4_t sum_c_hi = _mm256_extracti128_si256(sum_c, 1); + vi32x4_t tot_c = _mm_add_epi32(sum_c_lo, sum_c_hi); + + vi32x4_t sum_d_lo = _mm256_castsi256_si128(sum_d); + vi32x4_t sum_d_hi = _mm256_extracti128_si256(sum_d, 1); + vi32x4_t tot_d = _mm_add_epi32(sum_d_lo, sum_d_hi); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(2, 3, 0, 1))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(2, 3, 0, 1))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(2, 3, 0, 1))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(2, 3, 0, 1))); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(1, 0, 3, 2))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(1, 0, 3, 2))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(1, 0, 3, 2))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(1, 0, 3, 2))); + + vi32x4_t tot_ab = _mm_blend_epi32(tot_a, tot_b, 0xA); + vi32x4_t tot_cd = _mm_blend_epi32(tot_c, tot_d, 0x5); + vi32x4_t tot_abcd = _mm_blend_epi32(tot_ab, tot_cd, 0xC); + vi16x8_t sat_abcd = _mm_packs_epi32(tot_abcd, tot_abcd); + + vi16x8_t old_abcd = _mm_shuffle_epi32(chip->og_out, _MM_SHUFFLE(1, 0, 3, 2)); + vi16x8_t out_abcd = _mm_blend_epi16(old_abcd, sat_abcd, 0xF5); + + chip->og_out = out_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_mulhi = (0x10000 >> 7); + int16_t pg_vib_neg = 0; + + if (!(vibpos & 3)) { + pg_vib_mulhi = 0; + } + else if (vibpos & 1) { + pg_vib_mulhi >>= 1; + } + pg_vib_mulhi >>= chip->eg_vibshift; + pg_vib_mulhi &= 0x7F80; + + if (vibpos & 4) { + pg_vib_neg = -1; + } + chip->pg_vib_mulhi = vset1(pg_vib_mulhi); + 
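+        // Broadcast the sign mask for the negative half of the vibrato
+        // cycle, then recompute every slot group's phase increment with
+        // the new vibrato settings.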
chip->pg_vib_neg = vset1(pg_vib_neg); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + uint16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vi2u(vset1((int16_t)eg_incstep)); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg0)(chip); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg1)(chip); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) 
{ + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + 
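+    // EG_KEY_NORMAL and EG_KEY_DRUM are independent key-on bits; a slot
+    // enters the release state only once both of them have been cleared.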
vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, 
(vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? (reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm 
mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = ®_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + 
*(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, ®_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 
0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_mullo, wave->wg_phase_mullo, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, ®_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 
0x30) & 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_mulhi = (reg_C0h->fb ? 
(0x0040 << reg_C0h->fb) : 0); + vinsertv(sg0->wg_fb_mulhi, fb_mulhi, sgo); + vinsertv(sg1->wg_fb_mulhi, fb_mulhi, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_mullo = vset1(AYMO_(EG_GEN_MULLO_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_mullo = vset1(wave->wg_phase_mullo); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, address, value); + 
break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return _mm_extract_epi16(chip->og_out, 0); + case 1u: return _mm_extract_epi16(chip->og_out, 1); + case 2u: return _mm_extract_epi16(chip->og_out, 2); + case 3u: return _mm_extract_epi16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)y = _mm_cvtsi128_si32(chip->og_out); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + _mm_storel_epi64((void*)y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_storel_pi((void*)y, vf32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_store_ps(y, vf32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/src/aymo_ymf262_x86_sse41.c b/src/aymo_ymf262_x86_sse41.c new file mode 100644 index 0000000..f3eba29 --- /dev/null +++ b/src/aymo_ymf262_x86_sse41.c @@ -0,0 +1,1691 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include +#include "aymo_cpu_x86_sse41_inline.h" +#include "aymo_ymf262.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_sse41.h" + +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +AYMO_CXX_EXTERN_C_BEGIN + + +const struct aymo_ymf262_vt aymo_(vt) = +{ + AYMO_STRINGIFY2(aymo_(vt)), + (aymo_ymf262_get_sizeof_f)&(aymo_(get_sizeof)), + (aymo_ymf262_ctor_f)&(aymo_(ctor)), + (aymo_ymf262_dtor_f)&(aymo_(dtor)), + (aymo_ymf262_read_f)&(aymo_(read)), + (aymo_ymf262_write_f)&(aymo_(write)), + (aymo_ymf262_enqueue_write_f)&(aymo_(enqueue_write)), + (aymo_ymf262_enqueue_delay_f)&(aymo_(enqueue_delay)), + (aymo_ymf262_get_output_f)&(aymo_(get_output)), + (aymo_ymf262_tick_f)&(aymo_(tick)), + (aymo_ymf262_generate_i16x2_f)&(aymo_(generate_i16x2)), + (aymo_ymf262_generate_i16x4_f)&(aymo_(generate_i16x4)), + (aymo_ymf262_generate_f32x2_f)&(aymo_(generate_f32x2)), + (aymo_ymf262_generate_f32x4_f)&(aymo_(generate_f32x4)) +}; + + +// 32-bit Slot Group side (lo/hi) +const int8_t aymo_(sgo_side)[8] = +{ + 0, 0, 0, 0, 1, 1, 1, 1 +}; + +// 32-bit Slot Group cell +const int8_t aymo_(sgo_cell)[8] = +{ + 0, 1, 2, 3, 0, 1, 2, 3 +}; + + +const uint16_t aymo_(eg_incstep_table)[4] = +{ + ((1 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (1 << 13)), + ((0 << 15) | (1 << 14) | (1 << 13)), + ((0 << 15) | (0 << 14) | (0 << 13)) +}; + + +// Wave descriptors +const struct aymo_(wave) aymo_(wave_table)[8] = // TODO: share bits; select vit shifts +{ + { 1, 0x0000, 0x0200, 0x0100, 0x00FF, -1 }, + { 1, 0x0200, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0100, 0x0000, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0200, 0x0100, 0x00FF, -1 }, + { 2, 0x0400, 0x0000, 0x0100, 0x00FF, -1 }, + { 1, 0x0000, 0x0200, 0x0200, 0x0001, 0 }, + { 8, 0x0000, 0x1000, 0x1000, 0x1FFF, 0 } +}; + + +// 2-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch2x_table)[2/* cnt */][2/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, -1 } + }, +}; + +// 4-channel connection descriptors +const struct aymo_(conn) aymo_(conn_ch4x_table)[4/* cnt */][4/* slot */] = +{ + { + { -1, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, 0 }, + { 0, -1, -1 } + }, + { + { -1, 0, -1 }, + { 0, 0, 0 }, + { 0, -1, -1 }, + { 0, 0, -1 } + }, +}; + +// Rhythm connection descriptors +const struct aymo_(conn) aymo_(conn_ryt_table)[4][2/* slot */] = +{ + // Channel 6: BD, FM + { + { -1, 0, 0 }, + { 0, -1, -1 } + }, + // Channel 6: BD, AM + { + { -1, 0, 0 }, + { 0, 0, -1 } + }, + // Channel 7: HH + SD + { + { 0, 0, -1 }, + { 0, 0, -1 } + }, + // Channel 8: TT + TC + { + { 0, 0, -1 }, + { 0, 0, -1 } + } +}; + + +// Slot mask output delay for outputs A and C +const uint8_t aymo_(og_prout_ac)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0xF8, + 0xF8, + 0xF8, + 0xFF, + 0xF8, + 0xFF, + 0xF8, + 0xFF +}; + + +// Slot mask output delay for outputs B and D +const uint8_t aymo_(og_prout_bd)[AYMO_(SLOT_GROUP_NUM)] = // TODO: TBV: use a shared mask; use bit 7 as mask flag; <<=1 for the next flag +{ + 0x88, + 0xF8, + 0x88, + 0xF8, + 0x88, + 0xFF, + 0x88, + 0xFF +}; + + +// Updates phase generator +static inline +void aymo_(pg_update_deltafreq)( + struct aymo_(chip)* chip, + struct aymo_(ch2x_group)* cg, + struct aymo_(slot_group)* sg +) +{ + // Update phase + vi16_t fnum = cg->pg_fnum; + 
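+    // Vibrato: FNUM bits 7..9 are scaled by the global vibrato depth
+    // (pg_vib_mulhi), gated per slot by pg_vib, and sign-flipped on the
+    // negative half of the vibrato cycle (pg_vib_neg) before being added
+    // back to FNUM. The result then goes through the block shift and the
+    // MULT scaling to form the per-slot phase increment (lo/hi halves).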
vi16_t range = vand(fnum, vset1(7 << 7)); + range = vmulihi(range, vand(sg->pg_vib, chip->pg_vib_mulhi)); + range = vsub(vxor(range, chip->pg_vib_neg), chip->pg_vib_neg); // flip sign + fnum = vadd(fnum, range); + + vi32_t zero = vsetz(); + vi32_t fnum_lo = vunpacklo(fnum, zero); + vi32_t fnum_hi = vunpackhi(fnum, zero); + vi32_t block_sll_lo = vunpacklo(cg->pg_block, zero); + vi32_t block_sll_hi = vunpackhi(cg->pg_block, zero); + vi32_t basefreq_lo = vvsrli(vvsllv(fnum_lo, block_sll_lo), 1); + vi32_t basefreq_hi = vvsrli(vvsllv(fnum_hi, block_sll_hi), 1); + vi32_t pg_mult_x2_lo = vunpacklo(sg->pg_mult_x2, zero); + vi32_t pg_mult_x2_hi = vunpackhi(sg->pg_mult_x2, zero); + vi32_t deltafreq_lo = vvsrli(vvmullo(basefreq_lo, pg_mult_x2_lo), 1); + vi32_t deltafreq_hi = vvsrli(vvmullo(basefreq_hi, pg_mult_x2_hi), 1); + sg->pg_deltafreq_lo = deltafreq_lo; + sg->pg_deltafreq_hi = deltafreq_hi; +} + + +// Updates noise generator +static inline +void aymo_(ng_update)(struct aymo_(chip)* chip, unsigned times) +{ + // Update noise + uint32_t noise = chip->ng_noise; + while (times--) { + uint32_t n_bit = (((noise >> 14) ^ noise) & 1); + noise = ((noise >> 1) | (n_bit << 22)); + } + chip->ng_noise = noise; +} + + +// Updates rhythm manager, slot group 1 +static inline +void aymo_(rm_update_sg1)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[1]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + } + + vi16_t phase = sg->pg_phase_out; + uint16_t phase13 = (uint16_t)vextract(phase, 1); + + // Update noise bits + chip->rm_hh_bit2 = ((phase13 >> 2) & 1); + chip->rm_hh_bit3 = ((phase13 >> 3) & 1); + chip->rm_hh_bit7 = ((phase13 >> 7) & 1); + chip->rm_hh_bit8 = ((phase13 >> 8) & 1); + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + + // Update HH + uint16_t noise = (uint16_t)chip->ng_noise; + phase13 = (rm_xor << 9); + if (rm_xor ^ (noise & 1)) { + phase13 |= 0xD0; + } else { + phase13 |= 0x34; + } + phase = vinsert(phase, (int16_t)phase13, 1); + + sg->pg_phase_out = phase; + } +} + + +// Updates rhythm manager, slot group 3 +static inline +void aymo_(rm_update_sg3)(struct aymo_(chip)* chip) +{ + struct aymo_(slot_group)* sg = &chip->sg[3]; + + if AYMO_UNLIKELY(chip->chip_regs.reg_BDh.ryt) { + // Double rhythm outputs + vi16_t ryt_slot_mask = vsetr(-1, -1, -1, 0, 0, 0, 0, 0); + vi16_t wave_out = vand(sg->wg_out, ryt_slot_mask); + chip->og_acc_a = vadd(chip->og_acc_a, vand(wave_out, sg->og_out_ch_gate_a)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(wave_out, sg->og_out_ch_gate_b)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(wave_out, sg->og_out_ch_gate_c)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(wave_out, sg->og_out_ch_gate_d)); + + // Calculate noise bit + uint16_t rm_xor = ( + (chip->rm_hh_bit2 ^ chip->rm_hh_bit7) | + (chip->rm_hh_bit3 ^ chip->rm_tc_bit5) | + (chip->rm_tc_bit3 ^ chip->rm_tc_bit5) + ); + vi16_t phase = sg->pg_phase_out; + + // Update 
SD + uint16_t noise = (uint16_t)chip->ng_noise; + uint16_t phase16 = ( + ((uint16_t)chip->rm_hh_bit8 << 9) | + ((uint16_t)(chip->rm_hh_bit8 ^ (noise & 1)) << 8) + ); + phase = vinsert(phase, (int16_t)phase16, 1); + + // Update TC + uint32_t phase17 = vextract(phase, 2); + chip->rm_tc_bit3 = ((phase17 >> 3) & 1); + chip->rm_tc_bit5 = ((phase17 >> 5) & 1); + phase17 = ((rm_xor << 9) | 0x80); + phase = vinsert(phase, (int16_t)phase17, 2); + + sg->pg_phase_out = phase; + } +} + + +// Updates slot generators +static +void aymo_(sg_update)( + struct aymo_(chip)* chip, + struct aymo_(slot_group)* sg +) +{ + // EG: Compute envelope output + vi16_t sg_eg_rout = sg->eg_rout; + sg->eg_out = vadd(vadd(sg_eg_rout, sg->eg_tremolo_am), sg->eg_ksl_sh_tl_x4); + + // PG: Compute phase output + vi32_t phase_out_mask = vvset1(0xFFFF); + vi32_t phase_out_lo = vvand(vvsrli(sg->pg_phase_lo, 9), phase_out_mask); + vi32_t phase_out_hi = vvand(vvsrli(sg->pg_phase_hi, 9), phase_out_mask); + vi16_t phase_out = vvpackus(phase_out_lo, phase_out_hi); + sg->pg_phase_out = phase_out; + + // EG: Compute rate + vi16_t eg_prgen = sg->eg_gen; + vi16_t eg_gen_rel = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_RELEASE))); + vi16_t notreset = vcmpz(vand(sg->eg_key, eg_gen_rel)); + vi16_t eg_gen_mullo = vblendv(vset1(AYMO_(EG_GEN_MULLO_ATTACK)), sg->eg_gen_mullo, notreset); + vi16_t reg_rate = vu2i(vmululo(vi2u(sg->eg_adsr), vi2u(eg_gen_mullo))); // move to top nibble + vi16_t rate_temp = vand(reg_rate, vset1((int16_t)0xF000)); // keep top nibble + rate_temp = vsrli(rate_temp, AYMO_(EG_GEN_SRLHI)); + vi16_t rate = vadd(sg->eg_ks, rate_temp); + vi16_t rate_lo = vand(rate, vset1(3)); + vi16_t rate_hi = vsrli(rate, 2); + rate_hi = vmini(rate_hi, vset1(15)); + + // PG: Update phase + vi32_t notreset_lo = vunpacklo(notreset, notreset); + vi32_t notreset_hi = vunpackhi(notreset, notreset); + vi32_t pg_phase_lo = vvand(notreset_lo, sg->pg_phase_lo); + vi32_t pg_phase_hi = vvand(notreset_hi, sg->pg_phase_hi); + sg->pg_phase_lo = vvadd(pg_phase_lo, sg->pg_deltafreq_lo); + sg->pg_phase_hi = vvadd(pg_phase_hi, sg->pg_deltafreq_hi); + + // EG: Compute shift (< 12) + vi16_t eg_shift = vadd(rate_hi, chip->eg_add); + vi16_t rate_pre_lt12 = vor(vslli(rate_lo, 1), vset1(8)); + vi16_t shift_lt12 = vsrlv(rate_pre_lt12, vsubsu(vset1(15), eg_shift)); + vi16_t eg_state = vset1((int16_t)chip->eg_state); + shift_lt12 = vand(shift_lt12, eg_state); + + // WG: Compute feedback and modulation inputs + vi16_t fbsum = vslli(vadd(sg->wg_out, sg->wg_prout), 1); + vi16_t fbsum_sh = vmulihi(fbsum, sg->wg_fb_mulhi); + vi16_t prmod = vand(chip->wg_mod, sg->wg_prmod_gate); + vi16_t fbmod = vand(fbsum_sh, sg->wg_fbmod_gate); + sg->wg_prout = sg->wg_out; + + // WG: Compute operator phase input + vi16_t modsum = vadd(fbmod, prmod); + vi16_t phase = vadd(phase_out, modsum); + + // EG: Compute shift (>= 12) + vu16_t rate_lo_muluhi = vi2u(vslli(vpow2m1lt4(rate_lo), 1)); + vi16_t incstep_ge12 = vand(vu2i(vmuluhi(chip->eg_incstep, rate_lo_muluhi)), vset1(1)); + vi16_t shift_ge12 = vadd(vand(rate_hi, vset1(3)), incstep_ge12); + shift_ge12 = vmini(shift_ge12, vset1(3)); + shift_ge12 = vblendv(shift_ge12, eg_state, vcmpz(shift_ge12)); + + vi16_t shift = vblendv(shift_lt12, shift_ge12, vcmpgt(rate_hi, vset1(11))); + shift = vandnot(vcmpz(rate_temp), shift); + + // EG: Instant attack + vi16_t eg_rout = sg_eg_rout; + eg_rout = vandnot(vandnot(notreset, vcmpeq(rate_hi, vset1(15))), eg_rout); + + // WG: Process phase + vi16_t phase_sped = vu2i(vmululo(vi2u(phase), sg->wg_phase_mullo)); + 
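// WG: the wave-shape gates used below (wg_phase_zero / wg_phase_flip / wg_phase_mask / wg_phase_neg and wg_sine_gate) come from aymo_(wave_table), selected per slot by register E0h; they reshape the phase index so the shared logsin/exp lookup tables reproduce all eight OPL3 waveforms. + 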
vi16_t phase_gate = vcmpz(vand(phase_sped, sg->wg_phase_zero)); + vi16_t phase_flip = vcmpp(vand(phase_sped, sg->wg_phase_flip)); + vi16_t phase_mask = sg->wg_phase_mask; + vi16_t phase_xor = vand(phase_flip, phase_mask); + vi16_t phase_idx = vxor(phase_sped, phase_xor); + phase_out = vand(vand(phase_gate, phase_mask), phase_idx); + + // EG: Envelope off + vi16_t eg_off = vcmpgt(sg_eg_rout, vset1(0x01F7)); + vi16_t eg_gen_natk_and_nrst = vand(vcmpp(eg_prgen), notreset); + eg_rout = vblendv(eg_rout, vset1(0x01FF), vand(eg_gen_natk_and_nrst, eg_off)); + + // WG: Compute logsin variant + vi16_t phase_lo = phase_out; // vgather() masks to low byte + vi16_t logsin_val = vgather(aymo_ymf262_logsin_table, phase_lo); + logsin_val = vblendv(vset1(0x1000), logsin_val, phase_gate); + + // EG: Compute common increment not in attack state + vi16_t eg_inc_natk_cond = vand(vand(notreset, vcmpz(eg_off)), vcmpp(shift)); + vi16_t eg_inc_natk = vand(eg_inc_natk_cond, vpow2m1lt4(shift)); + vi16_t eg_gen = eg_prgen; + + // WG: Compute exponential output + vi16_t exp_in = vblendv(phase_out, logsin_val, sg->wg_sine_gate); + vi16_t exp_level = vadd(exp_in, vslli(sg->eg_out, 3)); + exp_level = vmini(exp_level, vset1(0x1FFF)); + vi16_t exp_level_lo = exp_level; // vgather() masks to low byte + vi16_t exp_level_hi = vsrli(exp_level, 8); + vi16_t exp_value = vgather(aymo_ymf262_exp_x2_table, exp_level_lo); + vi16_t exp_out = vsrlv(exp_value, exp_level_hi); + + // EG: Move attack to decay state + vi16_t eg_inc_atk_cond = vand(vand(vcmpp(sg->eg_key), vcmpp(shift)), + vand(vcmpz(eg_prgen), vcmpgt(vset1(15), rate_hi))); + vi16_t eg_inc_atk_ninc = vsrlv(sg_eg_rout, vsub(vset1(4), shift)); + vi16_t eg_inc = vandnot(eg_inc_atk_ninc, eg_inc_atk_cond); + vi16_t eg_gen_atk_to_dec = vcmpz(vor(eg_prgen, sg_eg_rout)); + eg_gen = vsub(eg_gen, eg_gen_atk_to_dec); // 0 --> 1 + eg_inc = vblendv(eg_inc_natk, eg_inc, vcmpz(eg_prgen)); + eg_inc = vandnot(eg_gen_atk_to_dec, eg_inc); + + // WG: Compute operator wave output + vi16_t wave_pos = vcmpz(vand(phase_sped, sg->wg_phase_neg)); + vi16_t wave_neg = vandnot(wave_pos, phase_gate); + vi16_t wave_out = vxor(exp_out, wave_neg); + sg->wg_out = wave_out; + chip->wg_mod = wave_out; + + // EG: Move decay to sustain state + vi16_t eg_gen_dec = vcmpeq(eg_prgen, vset1(AYMO_(EG_GEN_DECAY))); + vi16_t sl_hit = vcmpeq(vsrli(sg_eg_rout, 4), sg->eg_sl); + vi16_t eg_gen_dec_to_sus = vand(eg_gen_dec, sl_hit); + eg_gen = vsub(eg_gen, eg_gen_dec_to_sus); // 1 --> 2 + eg_inc = vandnot(eg_gen_dec_to_sus, eg_inc); + + // WG: Update chip output accumulators, with quirky slot output delay + vi16_t og_out_ac = vblendv(wave_out, sg->og_prout, sg->og_prout_ac); + vi16_t og_out_bd = vblendv(wave_out, sg->og_prout, sg->og_prout_bd); + sg->og_prout = wave_out; + chip->og_acc_a = vadd(chip->og_acc_a, vand(og_out_ac, sg->og_out_ch_gate_a)); + chip->og_acc_c = vadd(chip->og_acc_c, vand(og_out_ac, sg->og_out_ch_gate_c)); + chip->og_acc_b = vadd(chip->og_acc_b, vand(og_out_bd, sg->og_out_ch_gate_b)); + chip->og_acc_d = vadd(chip->og_acc_d, vand(og_out_bd, sg->og_out_ch_gate_d)); + + // EG: Move back to attack state + eg_gen = vand(notreset, eg_gen); // * --> 0 + + // EG: Move to release state + eg_gen = vor(eg_gen, vsrli(vcmpz(sg->eg_key), 14)); // * --> 3 + + // EG: Update envelope generator + eg_rout = vadd(eg_rout, eg_inc); + eg_rout = vand(eg_rout, vset1(0x01FF)); + sg->eg_rout = eg_rout; + sg->eg_gen = eg_gen; + sg->eg_gen_mullo = vsllv(vset1(1), vslli(eg_gen, 2)); + +#ifdef AYMO_DEBUG + sg->eg_rate = rate; + 
sg->eg_inc = eg_inc; + sg->wg_fbmod = fbsum_sh; + sg->wg_mod = modsum; +#endif +} + + +// Clear output accumulators +static inline +void aymo_(og_clear)(struct aymo_(chip)* chip) +{ + chip->og_acc_a = vsetz(); + chip->og_acc_b = vsetz(); + chip->og_acc_c = vsetz(); + chip->og_acc_d = vsetz(); +} + + +// Updates output mixdown +static inline +void aymo_(og_update)(struct aymo_(chip)* chip) +{ + vi16x8_t one = _mm_set1_epi16(1); + vi32x4_t tot_a = _mm_madd_epi16(chip->og_acc_a, one); + vi32x4_t tot_b = _mm_madd_epi16(chip->og_acc_b, one); + vi32x4_t tot_c = _mm_madd_epi16(chip->og_acc_c, one); + vi32x4_t tot_d = _mm_madd_epi16(chip->og_acc_d, one); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(2, 3, 0, 1))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(2, 3, 0, 1))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(2, 3, 0, 1))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(2, 3, 0, 1))); + + tot_a = _mm_add_epi32(tot_a, _mm_shuffle_epi32(tot_a, _MM_SHUFFLE(1, 0, 3, 2))); + tot_b = _mm_add_epi32(tot_b, _mm_shuffle_epi32(tot_b, _MM_SHUFFLE(1, 0, 3, 2))); + tot_c = _mm_add_epi32(tot_c, _mm_shuffle_epi32(tot_c, _MM_SHUFFLE(1, 0, 3, 2))); + tot_d = _mm_add_epi32(tot_d, _mm_shuffle_epi32(tot_d, _MM_SHUFFLE(1, 0, 3, 2))); + + vi32x4_t tot_ab = _mm_blend_epi16(tot_a, tot_b, 0xCC); + vi32x4_t tot_cd = _mm_blend_epi16(tot_c, tot_d, 0x33); + vi32x4_t tot_abcd = _mm_blend_epi16(tot_ab, tot_cd, 0xF0); + vi16x8_t sat_abcd = _mm_packs_epi32(tot_abcd, tot_abcd); + + vi16x8_t old_abcd = _mm_shuffle_epi32(chip->og_out, _MM_SHUFFLE(1, 0, 3, 2)); + vi16x8_t out_abcd = _mm_blend_epi16(old_abcd, sat_abcd, 0xF5); + + chip->og_out = out_abcd; +} + + +// Updates timer management +static inline +void aymo_(tm_update)(struct aymo_(chip)* chip) +{ + // Update tremolo + if AYMO_UNLIKELY((chip->tm_timer & 0x3F) == 0x3F) { + chip->eg_tremolopos = ((chip->eg_tremolopos + 1) % 210); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + } + + // Update vibrato + if AYMO_UNLIKELY((chip->tm_timer & 0x3FF) == 0x3FF) { + chip->pg_vibpos = ((chip->pg_vibpos + 1) & 7); + uint8_t vibpos = chip->pg_vibpos; + int16_t pg_vib_mulhi = (0x10000 >> 7); + int16_t pg_vib_neg = 0; + + if (!(vibpos & 3)) { + pg_vib_mulhi = 0; + } + else if (vibpos & 1) { + pg_vib_mulhi >>= 1; + } + pg_vib_mulhi >>= chip->eg_vibshift; + pg_vib_mulhi &= 0x7F80; + + if (vibpos & 4) { + pg_vib_neg = -1; + } + chip->pg_vib_mulhi = vset1(pg_vib_mulhi); + chip->pg_vib_neg = vset1(pg_vib_neg); + + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } + + chip->tm_timer++; + uint16_t eg_incstep = aymo_(eg_incstep_table)[chip->tm_timer & 3]; + chip->eg_incstep = vi2u(vset1((int16_t)eg_incstep)); + + // Update timed envelope patterns + int16_t eg_shift = (int16_t)uffsll(chip->eg_timer); + int16_t eg_add = ((eg_shift > 13) ? 
0 : eg_shift); + chip->eg_add = vset1(eg_add); + + // Update envelope timer and flip state + if (chip->eg_state | chip->eg_timerrem) { + if (chip->eg_timer < ((1ULL << AYMO_YMF262_SLOT_NUM) - 1ULL)) { + chip->eg_timer++; + chip->eg_timerrem = 0; + } + else { + chip->eg_timer = 0; + chip->eg_timerrem = 1; + } + } + chip->eg_state ^= 1; +} + + +// Updates the register queue +static inline +void aymo_(rq_update)(struct aymo_(chip)* chip) +{ + if (chip->rq_delay) { + if (--chip->rq_delay) { + return; + } + } + if (chip->rq_head != chip->rq_tail) { + struct aymo_(reg_queue_item)* item = &chip->rq_buffer[chip->rq_head]; + + if (item->address & 0x8000u) { + chip->rq_delay = AYMO_(REG_QUEUE_LATENCY); + chip->rq_delay += (((uint32_t)(item->address & 0x7FFFu) << 16) | item->value); + } + else { + aymo_(write)(chip, item->address, item->value); + } + + if (++chip->rq_head >= AYMO_(REG_QUEUE_LENGTH)) { + chip->rq_head = 0; + } + } +} + + +static +void aymo_(tick_once)(struct aymo_(chip)* chip) +{ + int sgi; + + // Clear output accumulators + aymo_(og_clear)(chip); + + // Process slot group 0 + sgi = 0; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 2 + sgi = 2; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 4 + sgi = 4; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 6 + sgi = 6; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 1 + sgi = 1; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, (36 - 3)); // slot 16 --> slot 13 + aymo_(rm_update_sg1)(chip); + + // Process slot group 3 + sgi = 3; + aymo_(sg_update)(chip, &chip->sg[sgi]); + aymo_(ng_update)(chip, 3); // slot 13 --> slot 16 + aymo_(rm_update_sg3)(chip); + + if AYMO_UNLIKELY(chip->process_all_slots) { + // Process slot group 5 + sgi = 5; + aymo_(sg_update)(chip, &chip->sg[sgi]); + + // Process slot group 7 + sgi = 7; + aymo_(sg_update)(chip, &chip->sg[sgi]); + } + + // Update outputs + aymo_(og_update)(chip); + + // Update timers + aymo_(tm_update)(chip); + + // Dequeue registers + aymo_(rq_update)(chip); +} + + +static +void aymo_(eg_update_ksl)(struct aymo_(chip)* chip, int word) +{ + int slot = aymo_ymf262_word_to_slot[word]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + + int16_t pg_fnum = vextractv(cg->pg_fnum, sgo); + int16_t pg_fnum_hn = ((pg_fnum >> 6) & 15); + + int ch2x = aymo_ymf262_word_to_ch2x[aymo_ymf262_slot_to_word[slot]]; + int16_t eg_block = (int16_t)(chip->ch2x_regs[ch2x].reg_B0h.block); + int16_t eg_ksl = aymo_ymf262_eg_ksl_table[pg_fnum_hn]; + eg_ksl = ((eg_ksl << 2) - ((8 - eg_block) << 5)); + if (eg_ksl < 0) { + eg_ksl = 0; + } + int16_t eg_kslsh = aymo_ymf262_eg_kslsh_table[reg_40h->ksl]; + int16_t eg_ksl_sh = (eg_ksl >> eg_kslsh); + + int16_t eg_tl_x4 = ((int16_t)reg_40h->tl << 2); + + int16_t eg_ksl_sh_tl_x4 = (eg_ksl_sh + eg_tl_x4); + vinsertv(sg->eg_ksl_sh_tl_x4, eg_ksl_sh_tl_x4, sgo); + +#ifdef AYMO_DEBUG + vinsertv(sg->eg_ksl, eg_ksl, sgo); +#endif +} + + +static +void aymo_(chip_pg_update_nts)(struct aymo_(chip)* chip) +{ + for (int slot = 0; slot < AYMO_(SLOT_NUM_MAX); ++slot) { + int word = aymo_ymf262_slot_to_word[slot]; + int ch2x = aymo_ymf262_word_to_ch2x[word]; + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct 
aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t eg_ksv = ((reg_B0h->block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + int16_t ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + + vinsertv(cg->eg_ksv, eg_ksv, sgo); + vinsertv(sg->eg_ks, ks, sgo); + } +} + + +static +void aymo_(pg_update_fnum)( + struct aymo_(chip)* chip, int ch2x, + int16_t pg_fnum, int16_t eg_ksv, int16_t pg_block +) +{ + int word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi0 = (word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word0 % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + + vinsertv(cg->pg_block, pg_block, sgo); + vinsertv(cg->pg_fnum, pg_fnum, sgo); + vinsertv(cg->eg_ksv, eg_ksv, sgo); + + struct aymo_(slot_group)* sg0 = &(chip->sg[sgi0]); + int slot0 = aymo_ymf262_word_to_slot[word0]; + struct aymo_ymf262_reg_20h* reg_20h0 = &(chip->slot_regs[slot0].reg_20h); + int16_t ks0 = (eg_ksv >> ((reg_20h0->ksr ^ 1) << 1)); + vinsertv(sg0->eg_ks, ks0, sgo); + aymo_(eg_update_ksl)(chip, word0); + aymo_(pg_update_deltafreq)(chip, cg, sg0); + + int word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgi1 = (word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg1 = &(chip->sg[sgi1]); + int slot1 = aymo_ymf262_word_to_slot[word1]; + struct aymo_ymf262_reg_20h* reg_20h1 = &(chip->slot_regs[slot1].reg_20h); + int16_t ks1 = (eg_ksv >> ((reg_20h1->ksr ^ 1) << 1)); + vinsertv(sg1->eg_ks, ks1, sgo); + aymo_(eg_update_ksl)(chip, word1); + aymo_(pg_update_deltafreq)(chip, cg, sg1); +} + + +static +void aymo_(ch2x_update_fnum)(struct aymo_(chip)* chip, int ch2x, int8_t ch2p) +{ + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_08h* reg_08h = &(chip->chip_regs.reg_08h); + int16_t pg_fnum = (int16_t)(reg_A0h->fnum_lo | ((uint16_t)reg_B0h->fnum_hi << 8)); + int16_t pg_block = (int16_t)reg_B0h->block; + int16_t eg_ksv = ((pg_block << 1) | ((pg_fnum >> (9 - reg_08h->nts)) & 1)); + + aymo_(pg_update_fnum)(chip, ch2x, pg_fnum, eg_ksv, pg_block); + + if (ch2p >= 0) { + aymo_(pg_update_fnum)(chip, ch2p, pg_fnum, eg_ksv, pg_block); + } +} + + +static inline +void aymo_(eg_key_on)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key |= mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static inline +void aymo_(eg_key_off)(struct aymo_(chip)* chip, int word, int16_t mode) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + int16_t eg_key = vextractv(sg->eg_key, sgo); + eg_key &= (int16_t)~mode; + vinsertv(sg->eg_key, eg_key, sgo); +} + + +static +void aymo_(ch2x_key_on)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = 
(chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_on)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_on)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(ch2x_key_off)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm) { + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + unsigned ch2x_is_drum = (chip->og_ch2x_drum & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + + if (ch2x_is_pairing && !ch2x_is_secondary) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int ch2p_word0 = aymo_ymf262_ch2x_to_word[ch2p][0]; + int ch2p_word1 = aymo_ymf262_ch2x_to_word[ch2p][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2p_word1, AYMO_(EG_KEY_NORMAL)); + } + else if (!ch2x_is_pairing || ch2x_is_drum) { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } + } + else { + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + aymo_(eg_key_off)(chip, ch2x_word0, AYMO_(EG_KEY_NORMAL)); + aymo_(eg_key_off)(chip, ch2x_word1, AYMO_(EG_KEY_NORMAL)); + } +} + + +static +void aymo_(cm_rewire_slot)(struct aymo_(chip)* chip, int word, const struct aymo_(conn)* conn) +{ + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &chip->sg[sgi]; + vinsertv(sg->wg_fbmod_gate, conn->wg_fbmod_gate, sgo); + vinsertv(sg->wg_prmod_gate, conn->wg_prmod_gate, sgo); + int16_t og_out_gate = conn->og_out_gate; + vinsertv(sg->og_out_gate, og_out_gate, sgo); + + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + vinsertv(sg->og_out_ch_gate_a, (vextractv(cg->og_ch_gate_a, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_b, (vextractv(cg->og_ch_gate_b, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_c, (vextractv(cg->og_ch_gate_c, sgo) & og_out_gate), sgo); + vinsertv(sg->og_out_ch_gate_d, (vextractv(cg->og_ch_gate_d, sgo) & og_out_gate), sgo); +} + + +static +void 
aymo_(cm_rewire_ch2x)(struct aymo_(chip)* chip, int ch2x) +{ + if (chip->chip_regs.reg_105h.newm && (chip->og_ch2x_pairing & (1UL << ch2x))) { + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (ch2x_is_secondary) { + int t = ch2x; + ch2x = ch2p; + ch2p = t; + } + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + } +} + + +static +void aymo_(cm_rewire_conn)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_104h* reg_104h_prev +) +{ + struct aymo_ymf262_reg_104h* reg_104h = &chip->chip_regs.reg_104h; + unsigned diff = (reg_104h_prev ? (reg_104h_prev->conn ^ reg_104h->conn) : 0xFF); + + for (int ch4x = 0; ch4x < (AYMO_(CHANNEL_NUM_MAX) / 2); ++ch4x) { + if (diff & (1 << ch4x)) { + int ch2x = aymo_ymf262_ch4x_to_pair[ch4x][0]; + int ch2p = aymo_ymf262_ch4x_to_pair[ch4x][1]; + + if (reg_104h->conn & (1 << ch4x)) { + chip->og_ch2x_pairing |= ((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + unsigned ch4x_cnt = ((ch2x_cnt << 1) | ch2p_cnt); + const struct aymo_(conn)* ch4x_conn = aymo_(conn_ch4x_table)[ch4x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch4x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch4x_conn[1]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch4x_conn[2]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch4x_conn[3]); + } + else { + chip->og_ch2x_pairing &= ~((1UL << ch2x) | (1UL << ch2p)); + + unsigned ch2x_cnt = chip->ch2x_regs[ch2x].reg_C0h.cnt; + const struct aymo_(conn)* ch2x_conn = aymo_(conn_ch2x_table)[ch2x_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][0], &ch2x_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2x][1], &ch2x_conn[1]); + + unsigned ch2p_cnt = chip->ch2x_regs[ch2p].reg_C0h.cnt; + const struct aymo_(conn)* ch2p_conn = aymo_(conn_ch2x_table)[ch2p_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][0], &ch2p_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[ch2p][1], &ch2p_conn[1]); + } + } + } +} + + +static +void aymo_(cm_rewire_rhythm)( + struct aymo_(chip)* chip, + const struct aymo_ymf262_reg_BDh* reg_BDh_prev +) +{ + const struct aymo_ymf262_reg_BDh reg_BDh_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + const struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + int force_update = 0; + + if (reg_BDh->ryt) { + if (!reg_BDh_prev->ryt) { + // Apply special connection for rhythm mode + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ryt_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, 
aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + const struct aymo_(conn)* ch7_conn = aymo_(conn_ryt_table)[2]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + const struct aymo_(conn)* ch8_conn = aymo_(conn_ryt_table)[3]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + force_update = 1; + } + } + else { + if (reg_BDh_prev->ryt) { + // Apply standard Channel_2xOP connection + unsigned ch6_cnt = chip->ch2x_regs[6].reg_C0h.cnt; + const struct aymo_(conn)* ch6_conn = aymo_(conn_ch2x_table)[ch6_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][0], &ch6_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[6][1], &ch6_conn[1]); + + unsigned ch7_cnt = chip->ch2x_regs[7].reg_C0h.cnt; + const struct aymo_(conn)* ch7_conn = aymo_(conn_ch2x_table)[ch7_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][0], &ch7_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[7][1], &ch7_conn[1]); + + unsigned ch8_cnt = chip->ch2x_regs[8].reg_C0h.cnt; + const struct aymo_(conn)* ch8_conn = aymo_(conn_ch2x_table)[ch8_cnt]; + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][0], &ch8_conn[0]); + aymo_(cm_rewire_slot)(chip, aymo_ymf262_ch2x_to_word[8][1], &ch8_conn[1]); + + reg_BDh = &reg_BDh_zero; // force all keys off + force_update = 1; + } + } + + if ((reg_BDh->hh != reg_BDh_prev->hh) || force_update) { + int word_hh = aymo_ymf262_ch2x_to_word[7][0]; + if (reg_BDh->hh) { + aymo_(eg_key_on)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_hh, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tc != reg_BDh_prev->tc) || force_update) { + int word_tc = aymo_ymf262_ch2x_to_word[8][1]; + if (reg_BDh->tc) { + aymo_(eg_key_on)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tc, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->tom != reg_BDh_prev->tom) || force_update) { + int word_tom = aymo_ymf262_ch2x_to_word[8][0]; + if (reg_BDh->tom) { + aymo_(eg_key_on)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_tom, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->sd != reg_BDh_prev->sd) || force_update) { + int word_sd = aymo_ymf262_ch2x_to_word[7][1]; + if (reg_BDh->sd) { + aymo_(eg_key_on)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_sd, AYMO_(EG_KEY_DRUM)); + } + } + + if ((reg_BDh->bd != reg_BDh_prev->bd) || force_update) { + int word_bd0 = aymo_ymf262_ch2x_to_word[6][0]; + int word_bd1 = aymo_ymf262_ch2x_to_word[6][1]; + if (reg_BDh->bd) { + aymo_(eg_key_on)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_on)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } else { + aymo_(eg_key_off)(chip, word_bd0, AYMO_(EG_KEY_DRUM)); + aymo_(eg_key_off)(chip, word_bd1, AYMO_(EG_KEY_DRUM)); + } + } +} + + +static +void aymo_(write_00h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + switch (address) { + case 0x01: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_01h) = value; + break; + } + case 0x02: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_02h) = value; + break; + } + case 0x03: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_03h) = value; + break; + } + case 0x04: { + *(uint8_t*)(void*)&(chip->chip_regs.reg_04h) = value; + break; + } + case 0x104: { + struct 
aymo_ymf262_reg_104h reg_104h_prev = chip->chip_regs.reg_104h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_104h) = value; + aymo_(cm_rewire_conn)(chip, &reg_104h_prev); + break; + } + case 0x105: { + struct aymo_ymf262_reg_105h reg_105h_prev = chip->chip_regs.reg_105h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_105h) = value; + if (chip->chip_regs.reg_105h.newm != reg_105h_prev.newm) { + ; + } + break; + } + case 0x08: { + struct aymo_ymf262_reg_08h reg_08h_prev = chip->chip_regs.reg_08h; + *(uint8_t*)(void*)&(chip->chip_regs.reg_08h) = value; + if (chip->chip_regs.reg_08h.nts != reg_08h_prev.nts) { + aymo_(chip_pg_update_nts)(chip); + } + break; + } + } +} + + +static +void aymo_(write_20h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int sgi = (aymo_ymf262_slot_to_word[slot] / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (aymo_ymf262_slot_to_word[slot] % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_20h* reg_20h = &(chip->slot_regs[slot].reg_20h); + struct aymo_ymf262_reg_20h reg_20h_prev = *reg_20h; + *(uint8_t*)(void*)reg_20h = value; + unsigned update_deltafreq = 0; + + if (reg_20h->mult != reg_20h_prev.mult) { + int16_t pg_mult_x2 = aymo_ymf262_pg_mult_x2_table[reg_20h->mult]; + vinsertv(sg->pg_mult_x2, pg_mult_x2, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->ksr != reg_20h_prev.ksr) { + int16_t eg_ksv = vextractv(cg->eg_ksv, sgo); + int16_t eg_ks = (eg_ksv >> ((reg_20h->ksr ^ 1) << 1)); + vinsertv(sg->eg_ks, eg_ks, sgo); + } + + if (reg_20h->egt != reg_20h_prev.egt) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (reg_20h->egt ? 
0 : chip->slot_regs[slot].reg_80h.rr); + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } + + if (reg_20h->vib != reg_20h_prev.vib) { + int16_t pg_vib = -(int16_t)reg_20h->vib; + vinsertv(sg->pg_vib, pg_vib, sgo); + update_deltafreq = 1; // force + } + + if (reg_20h->am != reg_20h_prev.am) { + int16_t eg_am = -(int16_t)reg_20h->am; + vinsertv(sg->eg_am, eg_am, sgo); + + uint16_t eg_tremolopos = chip->eg_tremolopos; + if (eg_tremolopos >= 105) { + eg_tremolopos = (210 - eg_tremolopos); + } + vi16_t eg_tremolo = vset1((int16_t)(eg_tremolopos >> chip->eg_tremoloshift)); + vsfence(); + sg->eg_tremolo_am = vand(eg_tremolo, sg->eg_am); + } + + if (update_deltafreq) { + for (sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + cgi = aymo_(sgi_to_cgi)(sgi); + cg = &chip->cg[cgi]; + sg = &chip->sg[sgi]; + aymo_(pg_update_deltafreq)(chip, cg, sg); + } + } +} + + +static +void aymo_(write_40h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + struct aymo_ymf262_reg_40h* reg_40h = &(chip->slot_regs[slot].reg_40h); + struct aymo_ymf262_reg_40h reg_40h_prev = *reg_40h; + *(uint8_t*)(void*)reg_40h = value; + + if ((reg_40h->tl != reg_40h_prev.tl) || (reg_40h->ksl != reg_40h_prev.ksl)) { + aymo_(eg_update_ksl)(chip, word); + } +} + + +static +void aymo_(write_60h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_60h* reg_60h = &(chip->slot_regs[slot].reg_60h); + struct aymo_ymf262_reg_60h reg_60h_prev = *reg_60h; + *(uint8_t*)(void*)reg_60h = value; + + if ((reg_60h->dr != reg_60h_prev.dr) || (reg_60h->ar != reg_60h_prev.ar)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->dr = reg_60h->dr; + eg_adsr->ar = reg_60h->ar; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + } +} + + +static +void aymo_(write_80h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_80h* reg_80h = &(chip->slot_regs[slot].reg_80h); + struct aymo_ymf262_reg_80h reg_80h_prev = *reg_80h; + *(uint8_t*)(void*)reg_80h = value; + + if ((reg_80h->rr != reg_80h_prev.rr) || (reg_80h->sl != reg_80h_prev.sl)) { + int16_t eg_adsr_word = vextractv(sg->eg_adsr, sgo); + struct aymo_(eg_adsr)* eg_adsr = (struct aymo_(eg_adsr)*)(void*)&eg_adsr_word; + eg_adsr->sr = (chip->slot_regs[slot].reg_20h.egt ? 
0 : reg_80h->rr); + eg_adsr->rr = reg_80h->rr; + vinsertv(sg->eg_adsr, eg_adsr_word, sgo); + int16_t eg_sl = (int16_t)reg_80h->sl; + if (eg_sl == 0x0F) { + eg_sl = 0x1F; + } + vinsertv(sg->eg_sl, eg_sl, sgo); + } +} + + +static +void aymo_(write_E0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int slot = aymo_(addr_to_slot)(address); + int word = aymo_ymf262_slot_to_word[slot]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + struct aymo_ymf262_reg_E0h* reg_E0h = &(chip->slot_regs[slot].reg_E0h); + struct aymo_ymf262_reg_E0h reg_E0h_prev = *reg_E0h; + *(uint8_t*)(void*)reg_E0h = value; + + if (!chip->chip_regs.reg_105h.newm) { + reg_E0h->ws &= 3; + } + + if (reg_E0h->ws != reg_E0h_prev.ws) { + const struct aymo_(wave)* wave = &aymo_(wave_table)[reg_E0h->ws]; + vinsertv(sg->wg_phase_mullo, wave->wg_phase_mullo, sgo); + vinsertv(sg->wg_phase_zero, wave->wg_phase_zero, sgo); + vinsertv(sg->wg_phase_neg, wave->wg_phase_neg, sgo); + vinsertv(sg->wg_phase_flip, wave->wg_phase_flip, sgo); + vinsertv(sg->wg_phase_mask, wave->wg_phase_mask, sgo); + vinsertv(sg->wg_sine_gate, wave->wg_sine_gate, sgo); + } +} + + +static +void aymo_(write_A0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + struct aymo_ymf262_reg_A0h* reg_A0h = &(chip->ch2x_regs[ch2x].reg_A0h); + struct aymo_ymf262_reg_A0h reg_A0h_prev = *reg_A0h; + *(uint8_t*)(void*)reg_A0h = value; + + if (reg_A0h->fnum_lo != reg_A0h_prev.fnum_lo) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } +} + + +static +void aymo_(write_B0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + unsigned ch2x_is_pairing = (chip->og_ch2x_pairing & (1UL << ch2x)); + int ch2p = aymo_ymf262_ch2x_paired[ch2x]; + int ch2x_is_secondary = (ch2p < ch2x); + if (chip->chip_regs.reg_105h.newm && ch2x_is_pairing && ch2x_is_secondary) { + return; + } + if (!ch2x_is_pairing || ch2x_is_secondary) { + ch2p = -1; + } + + if (address == 0xBD) { + struct aymo_ymf262_reg_BDh* reg_BDh = &chip->chip_regs.reg_BDh; + struct aymo_ymf262_reg_BDh reg_BDh_prev = *reg_BDh; + *(uint8_t*)(void*)reg_BDh = value; + + chip->eg_tremoloshift = (((reg_BDh->dam ^ 1) << 1) + 2); + chip->eg_vibshift = (reg_BDh->dvb ^ 1); + aymo_(cm_rewire_rhythm)(chip, &reg_BDh_prev); + } + else { + struct aymo_ymf262_reg_B0h* reg_B0h = &(chip->ch2x_regs[ch2x].reg_B0h); + struct aymo_ymf262_reg_B0h reg_B0h_prev = *reg_B0h; + *(uint8_t*)(void*)reg_B0h = value; + + if ((reg_B0h->fnum_hi != reg_B0h_prev.fnum_hi) || (reg_B0h->block != reg_B0h_prev.block)) { + aymo_(ch2x_update_fnum)(chip, ch2x, ch2p); + } + + if (reg_B0h->kon != reg_B0h_prev.kon) { + if (reg_B0h->kon) { + aymo_(ch2x_key_on)(chip, ch2x); + } else { + aymo_(ch2x_key_off)(chip, ch2x); + } + } + } +} + + +static +void aymo_(write_C0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + struct aymo_ymf262_reg_C0h* reg_C0h = &(chip->ch2x_regs[ch2x].reg_C0h); + struct aymo_ymf262_reg_C0h reg_C0h_prev = *reg_C0h; + if (!chip->chip_regs.reg_105h.newm) { + value = ((value | 
0x30) & 0x3F); + } + *(uint8_t*)(void*)reg_C0h = value; + + int ch2x_word0 = aymo_ymf262_ch2x_to_word[ch2x][0]; + int ch2x_word1 = aymo_ymf262_ch2x_to_word[ch2x][1]; + int sgo = (ch2x_word0 % AYMO_(SLOT_GROUP_LENGTH)); + int sgi0 = (ch2x_word0 / AYMO_(SLOT_GROUP_LENGTH)); + int sgi1 = (ch2x_word1 / AYMO_(SLOT_GROUP_LENGTH)); + struct aymo_(slot_group)* sg0 = &chip->sg[sgi0]; + struct aymo_(slot_group)* sg1 = &chip->sg[sgi1]; + int cgi = aymo_(sgi_to_cgi)(sgi0); + struct aymo_(ch2x_group)* cg = &chip->cg[cgi]; + + if (reg_C0h->cha != reg_C0h_prev.cha) { + int16_t og_ch_gate_a = -(int16_t)reg_C0h->cha; + vinsertv(cg->og_ch_gate_a, og_ch_gate_a, sgo); + vinsertv(sg0->og_out_ch_gate_a, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_a), sgo); + vinsertv(sg1->og_out_ch_gate_a, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_a), sgo); + } + if (reg_C0h->chb != reg_C0h_prev.chb) { + int16_t og_ch_gate_b = -(int16_t)reg_C0h->chb; + vinsertv(cg->og_ch_gate_b, og_ch_gate_b, sgo); + vinsertv(sg0->og_out_ch_gate_b, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_b), sgo); + vinsertv(sg1->og_out_ch_gate_b, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_b), sgo); + } + if (reg_C0h->chc != reg_C0h_prev.chc) { + int16_t og_ch_gate_c = -(int16_t)reg_C0h->chc; + vinsertv(cg->og_ch_gate_c, og_ch_gate_c, sgo); + vinsertv(sg0->og_out_ch_gate_c, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_c), sgo); + vinsertv(sg1->og_out_ch_gate_c, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_c), sgo); + } + if (reg_C0h->chd != reg_C0h_prev.chd) { + int16_t og_ch_gate_d = -(int16_t)reg_C0h->chd; + vinsertv(cg->og_ch_gate_d, og_ch_gate_d, sgo); + vinsertv(sg0->og_out_ch_gate_d, (vextractv(sg0->og_out_gate, sgo) & og_ch_gate_d), sgo); + vinsertv(sg1->og_out_ch_gate_d, (vextractv(sg1->og_out_gate, sgo) & og_ch_gate_d), sgo); + } + + if (reg_C0h->fb != reg_C0h_prev.fb) { + int16_t fb_mulhi = (reg_C0h->fb ? 
(0x0040 << reg_C0h->fb) : 0); + vinsertv(sg0->wg_fb_mulhi, fb_mulhi, sgo); + vinsertv(sg1->wg_fb_mulhi, fb_mulhi, sgo); + } + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } + + if (reg_C0h->cnt != reg_C0h_prev.cnt) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } +} + + +static +void aymo_(write_D0h)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + int ch2x = aymo_(addr_to_ch2x)(address); + *(uint8_t*)(void*)&(chip->ch2x_regs[ch2x].reg_C0h) = value; + + if (chip->chip_regs.reg_105h.stereo) { + // TODO + } +} + + +static +int aymo_(rq_enqueue)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + uint16_t rq_tail = chip->rq_tail; + uint16_t rq_next = (rq_tail + 1); + if (rq_next >= AYMO_(REG_QUEUE_LENGTH)) { + rq_next = 0u; + } + + if (rq_next != chip->rq_head) { + chip->rq_buffer[rq_tail].address = address; + chip->rq_buffer[rq_tail].value = value; + chip->rq_tail = rq_next; + return 1; + } + return 0; +} + + +const struct aymo_ymf262_vt* aymo_(get_vt)(void) +{ + return &(aymo_(vt)); +} + + +uint32_t aymo_(get_sizeof)(void) +{ + return sizeof(struct aymo_(chip)); +} + + +void aymo_(ctor)(struct aymo_(chip)* chip) +{ + assert(chip); + + // Wipe everything, except VT + const struct aymo_ymf262_vt* vt = chip->parent.vt; + aymo_memset(chip, 0, sizeof(*chip)); + chip->parent.vt = vt; + + // Initialize slots + for (int sgi = 0; sgi < AYMO_(SLOT_GROUP_NUM); ++sgi) { + struct aymo_(slot_group)* sg = &(chip->sg[sgi]); + sg->eg_rout = vset1(0x01FF); + sg->eg_out = vset1(0x01FF); + sg->eg_gen = vset1(AYMO_(EG_GEN_RELEASE)); + sg->eg_gen_mullo = vset1(AYMO_(EG_GEN_MULLO_RELEASE)); + sg->pg_mult_x2 = vset1(aymo_ymf262_pg_mult_x2_table[0]); + sg->og_prout_ac = vsetm(aymo_(og_prout_ac)[sgi]); + sg->og_prout_bd = vsetm(aymo_(og_prout_bd)[sgi]); + + const struct aymo_(wave)* wave = &aymo_(wave_table)[0]; + sg->wg_phase_mullo = vset1(wave->wg_phase_mullo); + sg->wg_phase_zero = vset1(wave->wg_phase_zero); + sg->wg_phase_neg = vset1(wave->wg_phase_neg); + sg->wg_phase_flip = vset1(wave->wg_phase_flip); + sg->wg_phase_mask = vset1(wave->wg_phase_mask); + sg->wg_sine_gate = vset1(wave->wg_sine_gate); + } + + // Initialize channels + for (int cgi = 0; cgi < (AYMO_(SLOT_GROUP_NUM) / 2); ++cgi) { + struct aymo_(ch2x_group)* cg = &(chip->cg[cgi]); + cg->og_ch_gate_a = vset1(-1); + cg->og_ch_gate_b = vset1(-1); + } + for (int ch2x = 0; ch2x < AYMO_(CHANNEL_NUM_MAX); ++ch2x) { + aymo_(cm_rewire_ch2x)(chip, ch2x); + } + + // Initialize chip + chip->ng_noise = 1; + + chip->eg_tremoloshift = 4; + chip->eg_vibshift = 1; +} + + +void aymo_(dtor)(struct aymo_(chip)* chip) +{ + AYMO_UNUSED_VAR(chip); + assert(chip); +} + + +uint8_t aymo_(read)(struct aymo_(chip)* chip, uint16_t address) +{ + AYMO_UNUSED_VAR(chip); + AYMO_UNUSED_VAR(address); + assert(chip); + + // not supported + return 0u; +} + + +void aymo_(write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address > 0x1FF) { + return; + } + + switch (address & 0xF0) { + case 0x00: { + aymo_(write_00h)(chip, address, value); + break; + } + case 0x20: + case 0x30: { + aymo_(write_20h)(chip, address, value); + break; + } + case 0x40: + case 0x50: { + aymo_(write_40h)(chip, address, value); + break; + } + case 0x60: + case 0x70: { + aymo_(write_60h)(chip, address, value); + break; + } + case 0x80: + case 0x90: { + aymo_(write_80h)(chip, address, value); + break; + } + case 0xE0: + case 0xF0: { + aymo_(write_E0h)(chip, address, value); + break; + } + case 0xA0: { + aymo_(write_A0h)(chip, address, value); + 
break; + } + case 0xB0: { + aymo_(write_B0h)(chip, address, value); + break; + } + case 0xC0: { + aymo_(write_C0h)(chip, address, value); + break; + } + case 0xD0: { + aymo_(write_D0h)(chip, address, value); + break; + } + } + vsfence(); +} + + +int aymo_(enqueue_write)(struct aymo_(chip)* chip, uint16_t address, uint8_t value) +{ + assert(chip); + + if (address < 0x8000u) { + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int aymo_(enqueue_delay)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + if (count < 0x8000u) { + uint16_t address = (uint16_t)((count >> 8) | 0x8000u); + uint8_t value = (uint8_t)(count & 0xFFu); + return aymo_(rq_enqueue)(chip, address, value); + } + return 0; +} + + +int16_t aymo_(get_output)(struct aymo_(chip)* chip, uint8_t channel) +{ + assert(chip); + + switch (channel) { + case 0u: return _mm_extract_epi16(chip->og_out, 0); + case 1u: return _mm_extract_epi16(chip->og_out, 1); + case 2u: return _mm_extract_epi16(chip->og_out, 2); + case 3u: return _mm_extract_epi16(chip->og_out, 3); + default: return 0; + } +} + + +void aymo_(tick)(struct aymo_(chip)* chip, uint32_t count) +{ + assert(chip); + + while (count--) { + aymo_(tick_once)(chip); + } +} + + +void aymo_(generate_i16x2)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 3u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + *(int32_t*)y = _mm_cvtsi128_si32(chip->og_out); + y += 2u; + } +} + + +void aymo_(generate_i16x4)(struct aymo_(chip)* chip, uint32_t count, int16_t y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + _mm_storel_epi64((void*)y, chip->og_out); + y += 4u; + } +} + + +void aymo_(generate_f32x2)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 7u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_storel_pi((void*)y, vf32); + y += 2u; + } +} + + +void aymo_(generate_f32x4)(struct aymo_(chip)* chip, uint32_t count, float y[]) +{ + assert(chip); + assert(((uintptr_t)(void*)y & 15u) == 0u); + + while (count--) { + aymo_(tick_once)(chip); + + vi32x4_t vi32 = _mm_cvtepi16_epi32(chip->og_out); + vf32x4_t vf32 = _mm_cvtepi32_ps(vi32); + _mm_store_ps(y, vf32); + y += 4u; + } +} + + +AYMO_CXX_EXTERN_C_END + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/aymo_testing.c b/tests/aymo_testing.c new file mode 100644 index 0000000..5c40e08 --- /dev/null +++ b/tests/aymo_testing.c @@ -0,0 +1,110 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_testing.h" + +#include <assert.h> +#include <ctype.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +AYMO_CXX_EXTERN_C_BEGIN + + +char* aymo_test_args_to_str( + int first, + int last, + char** argv, + const char *prefix, + const char *suffix +) +{ + assert(argv); + + size_t prefix_len = 0u; + if (prefix) { + prefix_len = strlen(prefix); + } + size_t suffix_len = 0u; + if (suffix) { + suffix_len = strlen(suffix); + } + + size_t line_len = (prefix_len + (size_t)(last - first) + suffix_len); + + if (first == 0) { + char* fwd = strrchr(argv[0], '/'); + if (fwd) { + argv[0] = (fwd + 1); + } + char* bwd = strrchr(argv[0], '\\'); + if (bwd) { + argv[0] = (bwd + 1); + } + } + + for (int i = first; i <= last; ++i) { + assert(argv[i]); + size_t arg_len = strlen(argv[i]); + line_len += arg_len; + } + + char *line = malloc(line_len + 1u); + assert(line); + size_t offset = 0u; + + if (prefix) { + memcpy(&line[offset], prefix, prefix_len); + offset += prefix_len; + } + for (int i = first; i <= last; ++i) { + size_t arg_len = strlen(argv[i]); + memcpy(&line[offset], argv[i], arg_len); + offset += arg_len; + if (i < last) { + line[offset++] = '_'; + } + } + if (suffix) { + memcpy(&line[offset], suffix, suffix_len); + offset += suffix_len; + } + + line[offset] = '\0'; + + for (offset = 0u; offset < line_len; ++offset) { + char c = line[offset]; + if (!isgraph(c) || (c == '/') || (c == '\\')) { + line[offset] = '_'; + } + } + + return line; +} + + +void aymo_test_free_args_str(char* line) +{ + free(line); +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/aymo_testing.h b/tests/aymo_testing.h new file mode 100644 index 0000000..0860c53 --- /dev/null +++ b/tests/aymo_testing.h @@ -0,0 +1,54 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ +#ifndef _include_aymo_testing_h +#define _include_aymo_testing_h + +#include "aymo_cc.h" + + +// Test exit code status values +#define TEST_STATUS_PASS (0) // Test passed successfully +#define TEST_STATUS_FAIL (1) // Test failed; usually with error text +#define TEST_STATUS_SKIP (77) // Test skipped +#define TEST_STATUS_HARD (99) // Test failed with hard error + + +// Macros to build test name lookup tables +typedef void (*aymo_testing_test_f)(void); // using globals as test status variables + +struct aymo_testing_entry { + const char* name; + aymo_testing_test_f func; +}; + +#define AYMO_TEST_ENTRY(name) { AYMO_STRINGIFY2(name), name } + + +AYMO_PUBLIC char* aymo_test_args_to_str( + int first, + int last, + char** argv, + const char *prefix, + const char *suffix +); +AYMO_PUBLIC void aymo_test_free_args_str(char* line); + + +#endif // _include_aymo_testing_h diff --git a/tests/aymo_testing_epilogue_inline.h b/tests/aymo_testing_epilogue_inline.h new file mode 100644 index 0000000..68eb553 --- /dev/null +++ b/tests/aymo_testing_epilogue_inline.h @@ -0,0 +1,41 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. 
+ +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ + + +int main(int argc, char** argv) +{ + app_return = TEST_STATUS_PASS; + + if (argc != 2) { + fprintf(stderr, "USAGE:\t%s TESTNAME\n", (argc ? argv[0] : "test_exe")); + return TEST_STATUS_HARD; + } + + for (unsigned i = 0; i < AYMO_VECTOR_LENGTH(unit_tests); ++i) { + if (!strcmp(unit_tests[i].name, argv[1])) { + (unit_tests[i].func)(); + break; + } + } + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/meson.build b/tests/meson.build new file mode 100644 index 0000000..6551089 --- /dev/null +++ b/tests/meson.build @@ -0,0 +1,392 @@ +test_includes = include_directories( + '.', +) + +test_common_sources = files( + 'aymo_testing.c', +) + + +test_names = [ +] + +test_names_none = [ + 'test_convert_none', + 'test_tda8425_none_sweep', + 'test_ym7128_none_sweep', + 'test_ymf262_none_compare', +] + + +test_names_x86 = [ +] + +test_names_x86_sse41 = [ + 'test_convert_x86_sse41', + 'test_tda8425_x86_sse41_sweep', + 'test_ym7128_x86_sse41_sweep', + 'test_ymf262_x86_sse41_compare', +] + +test_names_x86_avx2 = [ + 'test_convert_x86_avx2', + 'test_tda8425_x86_avx2_sweep', + 'test_ymf262_x86_avx2_compare', +] + + +test_names_arm = [ +] + +test_names_arm_neon = [ + 'test_convert_arm_neon', + 'test_tda8425_arm_neon_sweep', + 'test_ym7128_arm_neon_sweep', + 'test_ymf262_arm_neon_compare', +] + + +# ===================================================================== + +fs = import('fs') + + +# Generic +foreach test_name : test_names + test_c = '@0@.c'.format(test_name) + if fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + test(test_name, test_exe) + endif +endforeach + + +# x86 specific +if host_cpu_family in ['x86', 'x86_64'] + foreach test_name : test_names_x86 + test_c = '@0@.c'.format(test_name) + if fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + test(test_name, test_exe) + endif + endforeach +endif + + +# ARM specific
if host_cpu_family in ['arm', 'aarch64'] + foreach test_name : test_names_arm + test_c = '@0@.c'.format(test_name) + if fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + test(test_name, test_exe) + endif + endforeach +endif + + +# CPU-ext specific +foreach intr_name : ['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_names = get_variable('test_names_@0@'.format(intr_name)) + intr_args = get_variable('aymo_@0@_args'.format(intr_name), []) + foreach test_name : test_names + test_c = '@0@.c'.format(test_name) + if 
fs.exists(test_c) + test_exe = executable( + test_name, + test_c, + test_common_sources, + c_args: intr_args, + include_directories: test_includes, + dependencies: [aymo_static_dep, aymo_libc_dep], + install: false, + ) + set_variable('@0@_exe'.format(test_name), test_exe) + endif + endforeach + endif +endforeach + + +# ===================================================================== +# convert + +# function_name +aymo_convert_suite = [ + 'test_aymo_convert_@0@_i16_f32', + 'test_aymo_convert_@0@_f32_i16', + 'test_aymo_convert_@0@_i16_f32_1', + 'test_aymo_convert_@0@_f32_i16_1', + 'test_aymo_convert_@0@_i16_f32_k', + 'test_aymo_convert_@0@_f32_i16_k', + 'test_aymo_convert_@0@_u16_f32', + 'test_aymo_convert_@0@_f32_u16', + 'test_aymo_convert_@0@_u16_f32_1', + 'test_aymo_convert_@0@_f32_u16_1', + 'test_aymo_convert_@0@_u16_f32_k', + 'test_aymo_convert_@0@_f32_u16_k', +] + +foreach intr_name : ['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_suite = 'test_convert_@0@'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_suite)) + foreach t : aymo_convert_suite + test_name = t.format(intr_name) + test(test_name, test_exe, args: test_name) + endforeach + endif +endforeach + + +# ===================================================================== +# TDA8425 + +seconds = '8' +samplerate = '48000' +aymo_tda8425_sweep_suite = { + 'stereo_ab_1_384000': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '384000', seconds], + 'stereo_ab_1_96000': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '96000', seconds], + 'stereo_ab_1_48000': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '48000', seconds], + 'stereo_ab_1_44100': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', '44100', seconds], + + 'volume_lr_max_min': ['0xFF', '0xF0', '0xF6', '0xF6', '0xFF', '0xCE', samplerate, seconds], + 'volume_lr_min_max': ['0xF0', '0xFF', '0xF6', '0xF6', '0xFF', '0xCE', samplerate, seconds], + + 'bass_min': ['0xFC', '0xFC', '0xF0', '0xF6', '0xFF', '0xCE', samplerate, seconds], + 'bass_max': ['0xFC', '0xFC', '0xFF', '0xF6', '0xFF', '0xCE', samplerate, seconds], + + 'treble_min': ['0xFC', '0xFC', '0xF6', '0xF0', '0xFF', '0xCE', samplerate, seconds], + 'treble_max': ['0xFC', '0xFC', '0xF6', '0xFF', '0xFF', '0xCE', samplerate, seconds], + + 'stereo_none_1': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC8', samplerate, seconds], + 'stereo_none_2': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC9', samplerate, seconds], + 'stereo_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCA', samplerate, seconds], + 'stereo_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCC', samplerate, seconds], + 'stereo_1_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCE', samplerate, seconds], + 'stereo_2_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCB', samplerate, seconds], + 'stereo_2_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCD', samplerate, seconds], + 'stereo_2_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xCF', samplerate, seconds], + + 'mono_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC2', samplerate, seconds], + 'mono_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC4', samplerate, seconds], + 'mono_1_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xC6', samplerate, seconds], + + 'pseudo_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xD2', samplerate, seconds], + 'pseudo_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xD4', samplerate, seconds], + # TODO: Preset 1 + # TODO: Preset 2 + # TODO: Preset 3 + + 
'spatial_1_a': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xDE', samplerate, seconds], + 'spatial_1_b': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xDE', samplerate, seconds], + 'spatial_1_ab': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xDE', samplerate, seconds], + + 'mute': ['0xFC', '0xFC', '0xF6', '0xF6', '0xFF', '0xEE', samplerate, seconds], + + # TODO: T-filter extremes +} + +foreach intr_name : ['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_suite = 'test_tda8425_@0@_sweep'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_suite)) + foreach test_name, test_args : aymo_tda8425_sweep_suite + test_name = ('_'.join([test_suite] + [test_name])).underscorify() + test(test_name, test_exe, args: test_args) + endforeach + endif +endforeach + + +# ===================================================================== +# YM7128 + +seconds = '20' +# name: [seconds, reg...] +aymo_ym7128_sweep_suite = { + 'off': [seconds, + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', + '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00' + ], + 'direct': [seconds, + '0x3F', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x3F', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', + '0x3F', '0x00', '0x3F', '0x3F', + '0x00', '0x00', + '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00', '0x00' + ], + 'dune/arrakis': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1A', '0x1A', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/baghdad': [seconds, + '0x1F', '0x00', '0x1B', '0x00', '0x17', '0x00', '0x33', '0x00', + '0x00', '0x1D', '0x00', '0x19', '0x00', '0x15', '0x00', '0x11', + '0x1D', '0x1D', '0x1D', '0x1D', + '0x13', '0x13', + '0x06', '0x02', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', + ], + 'dune/morning': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1B', '0x1B', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/sequence': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1C', '0x1C', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/sietch': [seconds, + '0x1F', '0x00', '0x1B', '0x00', '0x17', '0x00', '0x33', '0x00', + '0x00', '0x1D', '0x00', '0x19', '0x00', '0x15', '0x00', '0x11', + '0x1D', '0x1D', '0x1D', '0x1D', + '0x13', '0x13', + '0x06', '0x02', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', + ], + 'dune/warsong': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1C', '0x1C', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/water': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1A', '0x1A', + '0x16', '0x16', + '0x1F', '0x03', '0x07', 
'0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/wormintro': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x18', '0x18', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'dune/wormsuit': [seconds, + '0x18', '0x00', '0x1A', '0x00', '0x1C', '0x00', '0x1E', '0x00', + '0x00', '0x19', '0x00', '0x1B', '0x00', '0x1D', '0x00', '0x1F', + '0x1B', '0x1F', '0x17', '0x17', + '0x12', '0x08', + '0x1F', '0x07', '0x0A', '0x0D', '0x10', '0x13', '0x16', '0x19', '0x1C', + ], + 'gold/cavern': [seconds, + '0x1F', '0x00', '0x1D', '0x00', '0x1B', '0x00', '0x19', '0x00', + '0x20', '0x3E', '0x20', '0x3C', '0x20', '0x3A', '0x20', '0x38', + '0x3C', '0x3E', '0x1C', '0x1C', + '0x11', '0x0A', + '0x12', '0x10', '0x0E', '0x0C', '0x0A', '0x08', '0x06', '0x04', '0x02', + ], + 'gold/chapel': [seconds, + '0x1F', '0x1E', '0x1D', '0x1C', '0x1B', '0x1A', '0x19', '0x18', + '0x3F', '0x3E', '0x3D', '0x3C', '0x3B', '0x3A', '0x39', '0x38', + '0x38', '0x3D', '0x1B', '0x1B', + '0x10', '0x10', + '0x1F', '0x1F', '0x1D', '0x1B', '0x19', '0x17', '0x15', '0x13', '0x11', + ], + 'gold/concert_hall': [seconds, + '0x31', '0x00', '0x15', '0x00', '0x39', '0x00', '0x1D', '0x00', + '0x00', '0x33', '0x00', '0x17', '0x00', '0x3B', '0x00', '0x1F', + '0x1A', '0x1C', '0x1D', '0x1D', + '0x16', '0x16', + '0x1F', '0x1C', '0x19', '0x16', '0x13', '0x10', '0x0D', '0x0A', '0x07', + ], + 'gold/deep_space': [seconds, + '0x18', '0x00', '0x1A', '0x00', '0x1C', '0x00', '0x1E', '0x00', + '0x00', '0x19', '0x00', '0x1B', '0x00', '0x1D', '0x00', '0x1F', + '0x1B', '0x1F', '0x1C', '0x1C', + '0x12', '0x08', + '0x1F', '0x07', '0x0A', '0x0D', '0x10', '0x13', '0x16', '0x19', '0x1C', + ], + 'gold/jazz_club': [seconds, + '0x1F', '0x1B', '0x37', '0x13', '0x2F', '0x0B', '0x27', '0x03', + '0x1F', '0x3B', '0x17', '0x33', '0x0F', '0x2B', '0x07', '0x23', + '0x1C', '0x1F', '0x1B', '0x1B', + '0x0C', '0x0C', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'gold/movie_theater': [seconds, + '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', '0x00', + '0x00', '0x1F', '0x00', '0x17', '0x00', '0x0F', '0x00', '0x07', + '0x1A', '0x1D', '0x1C', '0x1C', + '0x16', '0x16', + '0x1F', '0x03', '0x07', '0x0B', '0x0F', '0x13', '0x17', '0x1B', '0x1F', + ], + 'gold/recital_hall': [seconds, + '0x1F', '0x3E', '0x1D', '0x3C', '0x1B', '0x3A', '0x19', '0x38', + '0x3F', '0x1E', '0x3D', '0x1C', '0x3B', '0x1A', '0x39', '0x18', + '0x18', '0x1C', '0x1C', '0x1C', + '0x15', '0x15', + '0x14', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', '0x12', + ], + 'gold/stadium': [seconds, + '0x1F', '0x00', '0x1B', '0x00', '0x17', '0x00', '0x33', '0x00', + '0x00', '0x1D', '0x00', '0x19', '0x00', '0x15', '0x00', '0x11', + '0x1D', '0x1D', '0x3D', '0x3D', + '0x13', '0x13', + '0x06', '0x02', '0x04', '0x06', '0x08', '0x0A', '0x0C', '0x0E', '0x10', + ], +} + +foreach intr_name : ['none', 'x86_sse41', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + test_suite = 'test_ym7128_@0@_sweep'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_suite)) + foreach test_name, test_args : aymo_ym7128_sweep_suite + test_name = ('_'.join([test_suite] + [test_name])).underscorify() + test(test_name, test_exe, args: test_args) + endforeach + endif +endforeach + + +# ===================================================================== +# YMF262 + +foreach intr_name : 
['none', 'x86_sse41', 'x86_avx2', 'arm_neon'] + have_intr = get_variable('aymo_have_@0@'.format(intr_name)) + if have_intr + # TODO: improve testing scores + test_name = 'test_ymf262_@0@_compare'.format(intr_name) + test_exe = get_variable('@0@_exe'.format(test_name)) + test(test_name, test_exe, args: ['avd', '../tests/scores/DUNE.avd']) + endif +endforeach diff --git a/tests/test_convert_arm_neon.c b/tests/test_convert_arm_neon.c new file mode 100644 index 0000000..4d02e36 --- /dev/null +++ b/tests/test_convert_arm_neon.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_arm_neon.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_arm_neon_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_arm_neon_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if 
(compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_arm_neon_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_arm_neon_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, 
ref_u16_f32, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_arm_neon_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_arm_neon_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. 
/ K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_arm_neon_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_arm_neon_f32_u16_k) +}; + + +#include "aymo_testing_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/tests/test_convert_none.c b/tests/test_convert_none.c new file mode 100644 index 0000000..20bf38f --- /dev/null +++ b/tests/test_convert_none.c @@ -0,0 +1,371 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_none.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_none_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_none_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_none_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_none_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, 
(int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_none_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32, ref_n); +} + + +void test_aymo_convert_none_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_none_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], 
DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_none_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_none_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_none_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_none_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_none_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_none_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_none_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_none_f32_u16_k) +}; + + +#include "aymo_testing_epilogue_inline.h" diff --git a/tests/test_convert_prologue_inline.h b/tests/test_convert_prologue_inline.h new 
file mode 100644 index 0000000..bdb3a0f --- /dev/null +++ b/tests/test_convert_prologue_inline.h @@ -0,0 +1,296 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see <https://www.gnu.org/licenses/>. +*/ + +#include "aymo_cc.h" + +#include <math.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +AYMO_CXX_EXTERN_C_BEGIN + + +static int app_return; + + +#define ref_n 64u + +
+#undef DIRTY + +#undef x0xmm +#undef x0xMM + +#undef xxmmu +#undef xxMMu + +#undef K +#undef x0xff +#undef x0xFF +#undef x0xfi +#undef x0xFI +#undef x0xfu +#undef x0xFU + + +#define DIRTY (0xCCu) + +#define x0xmm (INT16_MIN) +#define x0xMM (INT16_MAX) + +#define xxmmu (0u) +#define xxMMu (UINT16_MAX) + +#define K (+32768.f) +#define x0xff (-1024.f * K) +#define x0xFF (+1024.f * K) +#define x0xfi ((float)x0xmm) +#define x0xFI ((float)x0xMM) +#define x0xfu ((float)xxmmu) +#define x0xFU ((float)xxMMu) + + +static float buf_f32[ref_n]; +static int16_t buf_i16[ref_n]; +static uint16_t buf_u16[ref_n]; + +
+const int16_t src_i16[ref_n] = { + x0xmm, -0x01, -0x02, +0x03, x0xMM, -0x05, -0x06, +0x07, + -0x10, x0xmm, +0x12, -0x13, -0x14, x0xMM, -0x16, -0x17, + +0x20, +0x21, x0xmm, -0x23, -0x24, -0x25, x0xMM, -0x27, + -0x30, -0x31, +0x32, x0xmm, +0x34, +0x35, +0x36, x0xMM, + x0xMM, -0x41, +0x42, +0x43, x0xmm, +0x45, -0x46, -0x47, + +0x50, x0xMM, -0x52, -0x53, +0x54, x0xmm, +0x56, +0x57, + +0x60, -0x61, x0xMM, +0x63, -0x64, +0x65, x0xmm, -0x67, + -0x70, +0x71, +0x72, x0xMM, +0x74, -0x75, -0x76, x0xmm +}; +
+const float ref_i16_f32[ref_n] = { + x0xfi, -0x01, -0x02, +0x03, x0xFI, -0x05, -0x06, +0x07, + -0x10, x0xfi, +0x12, -0x13, -0x14, x0xFI, -0x16, -0x17, + +0x20, +0x21, x0xfi, -0x23, -0x24, -0x25, x0xFI, -0x27, + -0x30, -0x31, +0x32, x0xfi, +0x34, +0x35, +0x36, x0xFI, + x0xFI, -0x41, +0x42, +0x43, x0xfi, +0x45, -0x46, -0x47, + +0x50, x0xFI, -0x52, -0x53, +0x54, x0xfi, +0x56, +0x57, + +0x60, -0x61, x0xFI, +0x63, -0x64, +0x65, x0xfi, -0x67, + -0x70, +0x71, +0x72, x0xFI, +0x74, -0x75, -0x76, x0xfi +}; +
+const float ref_i16_f32_1[ref_n] = { + x0xfi/K, -0x01/K, -0x02/K, +0x03/K, x0xFI/K, -0x05/K, -0x06/K, +0x07/K, + -0x10/K, x0xfi/K, +0x12/K, -0x13/K, -0x14/K, x0xFI/K, -0x16/K, -0x17/K, + +0x20/K, +0x21/K, x0xfi/K, -0x23/K, -0x24/K, -0x25/K, x0xFI/K, -0x27/K, + -0x30/K, -0x31/K, +0x32/K, x0xfi/K, +0x34/K, +0x35/K, +0x36/K, x0xFI/K, + x0xFI/K, -0x41/K, +0x42/K, +0x43/K, x0xfi/K, +0x45/K, -0x46/K, -0x47/K, + +0x50/K, x0xFI/K, -0x52/K, -0x53/K, +0x54/K, x0xfi/K, +0x56/K, +0x57/K, + +0x60/K, -0x61/K, x0xFI/K, +0x63/K, -0x64/K, +0x65/K, x0xfi/K, -0x67/K, + -0x70/K, +0x71/K, +0x72/K, x0xFI/K, +0x74/K, -0x75/K, -0x76/K, x0xfi/K +}; + +
+const uint16_t src_u16[ref_n] = { + xxmmu, 0x01u, 0x02u, 0x03u, xxMMu, 0x05u, 0x06u, 0x07u, + 0x10u, xxmmu, 0x12u, 0x13u, 0x14u, xxMMu, 0x16u, 0x17u, + 0x20u, 0x21u, xxmmu, 0x23u, 0x24u, 0x25u, xxMMu, 0x27u, + 0x30u, 0x31u, 0x32u, xxmmu, 0x34u, 0x35u, 0x36u, xxMMu, + xxMMu, 0x41u, 0x42u, 0x43u, xxmmu, 0x45u, 0x46u, 0x47u, +
0x50u, xxMMu, 0x52u, 0x53u, 0x54u, xxmmu, 0x56u, 0x57u, + 0x60u, 0x61u, xxMMu, 0x63u, 0x64u, 0x65u, xxmmu, 0x67u, + 0x70u, 0x71u, 0x72u, xxMMu, 0x74u, 0x75u, 0x76u, xxmmu +}; + +const float ref_u16_f32[ref_n] = { + x0xfu, +0x01, +0x02, +0x03, x0xFU, +0x05, +0x06, +0x07, + +0x10, x0xfu, +0x12, +0x13, +0x14, x0xFU, +0x16, +0x17, + +0x20, +0x21, x0xfu, +0x23, +0x24, +0x25, x0xFU, +0x27, + +0x30, +0x31, +0x32, x0xfu, +0x34, +0x35, +0x36, x0xFU, + x0xFU, +0x41, +0x42, +0x43, x0xfu, +0x45, +0x46, +0x47, + +0x50, x0xFU, +0x52, +0x53, +0x54, x0xfu, +0x56, +0x57, + +0x60, +0x61, x0xFU, +0x63, +0x64, +0x65, x0xfu, +0x67, + +0x70, +0x71, +0x72, x0xFU, +0x74, +0x75, +0x76, x0xfu +}; + +const float ref_u16_f32_1[ref_n] = { + x0xfu/K, +0x01/K, +0x02/K, +0x03/K, x0xFU/K, +0x05/K, +0x06/K, +0x07/K, + +0x10/K, x0xfu/K, +0x12/K, +0x13/K, +0x14/K, x0xFU/K, +0x16/K, +0x17/K, + +0x20/K, +0x21/K, x0xfu/K, +0x23/K, +0x24/K, +0x25/K, x0xFU/K, +0x27/K, + +0x30/K, +0x31/K, +0x32/K, x0xfu/K, +0x34/K, +0x35/K, +0x36/K, x0xFU/K, + x0xFU/K, +0x41/K, +0x42/K, +0x43/K, x0xfu/K, +0x45/K, +0x46/K, +0x47/K, + +0x50/K, x0xFU/K, +0x52/K, +0x53/K, +0x54/K, x0xfu/K, +0x56/K, +0x57/K, + +0x60/K, +0x61/K, x0xFU/K, +0x63/K, +0x64/K, +0x65/K, x0xfu/K, +0x67/K, + +0x70/K, +0x71/K, +0x72/K, x0xFU/K, +0x74/K, +0x75/K, +0x76/K, x0xfu/K +}; + + +const float src_f32[ref_n] = { + x0xff, -0x01, -0x02, +0x03, x0xFF, -0x05, -0x06, +0x07, + -0x10, x0xff, +0x12, -0x13, -0x14, x0xFF, -0x16, -0x17, + +0x20, +0x21, x0xff, -0x23, -0x24, -0x25, x0xFF, -0x27, + -0x30, -0x31, +0x32, x0xff, +0x34, +0x35, +0x36, x0xFF, + x0xFF, -0x41, +0x42, +0x43, x0xff, +0x45, -0x46, -0x47, + +0x50, x0xFF, -0x52, -0x53, +0x54, x0xff, +0x56, +0x57, + +0x60, -0x61, x0xFF, +0x63, -0x64, +0x65, x0xff, -0x67, + -0x70, +0x71, +0x72, x0xFF, +0x74, -0x75, -0x76, x0xff +}; + +const int16_t ref_f32_i16[ref_n] = { + x0xmm, -0x01, -0x02, +0x03, x0xMM, -0x05, -0x06, +0x07, + -0x10, x0xmm, +0x12, -0x13, -0x14, x0xMM, -0x16, -0x17, + +0x20, +0x21, x0xmm, -0x23, -0x24, -0x25, x0xMM, -0x27, + -0x30, -0x31, +0x32, x0xmm, +0x34, +0x35, +0x36, x0xMM, + x0xMM, -0x41, +0x42, +0x43, x0xmm, +0x45, -0x46, -0x47, + +0x50, x0xMM, -0x52, -0x53, +0x54, x0xmm, +0x56, +0x57, + +0x60, -0x61, x0xMM, +0x63, -0x64, +0x65, x0xmm, -0x67, + -0x70, +0x71, +0x72, x0xMM, +0x74, -0x75, -0x76, x0xmm +}; + +const uint16_t ref_f32_u16[ref_n] = { + xxmmu, xxmmu, xxmmu, 0x03u, xxMMu, xxmmu, xxmmu, 0x07u, + xxmmu, xxmmu, 0x12u, xxmmu, xxmmu, xxMMu, xxmmu, xxmmu, + 0x20u, 0x21u, xxmmu, xxmmu, xxmmu, xxmmu, xxMMu, xxmmu, + xxmmu, xxmmu, 0x32u, xxmmu, 0x34u, 0x35u, 0x36u, xxMMu, + xxMMu, xxmmu, 0x42u, 0x43u, xxmmu, 0x45u, xxmmu, xxmmu, + 0x50u, xxMMu, xxmmu, xxmmu, 0x54u, xxmmu, 0x56u, 0x57u, + 0x60u, xxmmu, xxMMu, 0x63u, xxmmu, 0x65u, xxmmu, xxmmu, + xxmmu, 0x71u, 0x72u, xxMMu, 0x74u, xxmmu, xxmmu, xxmmu +}; + + +const float src_f32_1[ref_n] = { + x0xff/K, -0x01/K, -0x02/K, +0x03/K, x0xFF/K, -0x05/K, -0x06/K, +0x07/K, + -0x10/K, x0xff/K, +0x12/K, -0x13/K, -0x14/K, x0xFF/K, -0x16/K, -0x17/K, + +0x20/K, +0x21/K, x0xff/K, -0x23/K, -0x24/K, -0x25/K, x0xFF/K, -0x27/K, + -0x30/K, -0x31/K, +0x32/K, x0xff/K, +0x34/K, +0x35/K, +0x36/K, x0xFF/K, + x0xFF/K, -0x41/K, +0x42/K, +0x43/K, x0xff/K, +0x45/K, -0x46/K, -0x47/K, + +0x50/K, x0xFF/K, -0x52/K, -0x53/K, +0x54/K, x0xff/K, +0x56/K, +0x57/K, + +0x60/K, -0x61/K, x0xFF/K, +0x63/K, -0x64/K, +0x65/K, x0xff/K, -0x67/K, + -0x70/K, +0x71/K, +0x72/K, x0xFF/K, +0x74/K, -0x75/K, -0x76/K, x0xff/K +}; + +const int16_t ref_f32_i16_1[ref_n] = { + x0xmm, -0x01, -0x02, +0x03, x0xMM, -0x05, 
-0x06, +0x07, + -0x10, x0xmm, +0x12, -0x13, -0x14, x0xMM, -0x16, -0x17, + +0x20, +0x21, x0xmm, -0x23, -0x24, -0x25, x0xMM, -0x27, + -0x30, -0x31, +0x32, x0xmm, +0x34, +0x35, +0x36, x0xMM, + x0xMM, -0x41, +0x42, +0x43, x0xmm, +0x45, -0x46, -0x47, + +0x50, x0xMM, -0x52, -0x53, +0x54, x0xmm, +0x56, +0x57, + +0x60, -0x61, x0xMM, +0x63, -0x64, +0x65, x0xmm, -0x67, + -0x70, +0x71, +0x72, x0xMM, +0x74, -0x75, -0x76, x0xmm +}; + +const uint16_t ref_f32_u16_1[ref_n] = { + xxmmu, xxmmu, xxmmu, 0x03u, xxMMu, xxmmu, xxmmu, 0x07u, + xxmmu, xxmmu, 0x12u, xxmmu, xxmmu, xxMMu, xxmmu, xxmmu, + 0x20u, 0x21u, xxmmu, xxmmu, xxmmu, xxmmu, xxMMu, xxmmu, + xxmmu, xxmmu, 0x32u, xxmmu, 0x34u, 0x35u, 0x36u, xxMMu, + xxMMu, xxmmu, 0x42u, 0x43u, xxmmu, 0x45u, xxmmu, xxmmu, + 0x50u, xxMMu, xxmmu, xxmmu, 0x54u, xxmmu, 0x56u, 0x57u, + 0x60u, xxmmu, xxMMu, 0x63u, xxmmu, 0x65u, xxmmu, xxmmu, + xxmmu, 0x71u, 0x72u, xxMMu, 0x74u, xxmmu, xxmmu, xxmmu +}; + + +void print_i16(FILE* fp, const int16_t* vp, size_t n) +{ + fprintf(fp, "{ "); + while (n--) { + int i = (int)*vp++; + char sc = ((i < 0) ? '-' : ((i > 0) ? '+' : ' ')); + if (i < 0) i = -i; + fprintf(fp, "%c%04Xh, ", sc, (unsigned)i); + } + fprintf(fp, "}\n"); +} + + +void print_u16(FILE* fp, const uint16_t* vp, size_t n) +{ + fprintf(fp, "{ "); + while (n--) { + fprintf(fp, "%04Xh, ", (unsigned)*vp++); + } + fprintf(fp, "}\n"); +} + + +void print_f32(FILE* fp, const float* vp, size_t n) +{ + fprintf(fp, "{ "); + while (n--) { + fprintf(fp, "%+6.2f, ", *vp++); + } + fprintf(fp, "}\n"); +} + + +const int16_t* compare_i16(const int16_t* bufp, const int16_t* refp, size_t len) +{ + while (len--) { + if (*bufp != *refp) { + return bufp; + } + ++bufp; + ++refp; + } + return NULL; +} + + +const uint16_t* compare_u16(const uint16_t* bufp, const uint16_t* refp, size_t len) +{ + while (len--) { + if (*bufp != *refp) { + return bufp; + } + ++bufp; + ++refp; + } + return NULL; +} + + +const float* compare_f32(const float* bufp, const float* refp, size_t len, float epsilon) +{ + while (len--) { + if (fabsf(*bufp - *refp) > epsilon) { + return bufp; + } + ++bufp; + ++refp; + } + return NULL; +} + + +const void* compare_dirty(const void* bufp, uint8_t refv, size_t size) +{ + const uint8_t* sp = (const uint8_t*)bufp; + const uint8_t* ep = (sp + size); + while (sp != ep) { + if (*sp != refv) { + return sp; + } + ++sp; + } + return NULL; +} diff --git a/tests/test_convert_x86_avx2.c b/tests/test_convert_x86_avx2.c new file mode 100644 index 0000000..666c700 --- /dev/null +++ b/tests/test_convert_x86_avx2.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_avx2.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_x86_avx2_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_x86_avx2_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { 
+ for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_x86_avx2_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { 
+ line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_avx2_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_avx2_f32_u16_k) +}; + + 
+#include "aymo_testing_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/tests/test_convert_x86_sse41.c b/tests/test_convert_x86_sse41.c new file mode 100644 index 0000000..8782118 --- /dev/null +++ b/tests/test_convert_x86_sse41.c @@ -0,0 +1,376 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_file.h" +#include "aymo_testing.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_convert_x86_sse41.h" + +#include "test_convert_prologue_inline.h" + + +void test_aymo_convert_x86_sse41_i16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(i16_f32)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_i16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16)((ei - si), &src_f32[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16, ref_n); +} + + +void test_aymo_convert_x86_sse41_i16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_1)((ei - si), &src_i16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, 
ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_i16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_1)((ei - si), &src_f32_1[si], &buf_i16[si]); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_i16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(i16_f32_k)((ei - si), &src_i16[si], &buf_f32[si], (float)(1. / K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_i16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_i16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_i16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_i16, (int)DIRTY, sizeof(buf_i16)); + aymo_(f32_i16_k)((ei - si), &src_f32_1[si], &buf_i16[si], (float)(K)); + if (compare_dirty(&buf_i16[0], DIRTY, (si * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + if (compare_i16(&buf_i16[si], &ref_f32_i16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_i16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_i16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_i16(stderr, buf_i16, ref_n); + print_i16(stderr, ref_f32_i16_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_u16_f32(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, DIRTY, sizeof(buf_f32)); + aymo_(u16_f32)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_u16(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16)((ei - si), &src_f32[si], 
&buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16, ref_n); +} + + +void test_aymo_convert_x86_sse41_u16_f32_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_1)((ei - si), &src_u16[si], &buf_f32[si]); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_u16_1(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_1)((ei - si), &src_f32_1[si], &buf_u16[si]); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_u16_f32_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_f32, (int)DIRTY, sizeof(buf_f32)); + aymo_(u16_f32_k)((ei - si), &src_u16[si], &buf_f32[si], (float)(1. 
/ K)); + if (compare_dirty(&buf_f32[0], DIRTY, (si * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + if (compare_f32(&buf_f32[si], &ref_u16_f32_1[si], (ei - si), 0)) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_f32[ei], DIRTY, ((ref_n - ei) * sizeof(buf_f32[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_f32(stderr, buf_f32, ref_n); + print_f32(stderr, ref_u16_f32_1, ref_n); +} + + +void test_aymo_convert_x86_sse41_f32_u16_k(void) +{ + unsigned si, ei; int line = 0; + for (si = 0; si < ref_n; ++si) { + for (ei = si; ei < ref_n; ++ei) { + memset(buf_u16, (int)DIRTY, sizeof(buf_u16)); + aymo_(f32_u16_k)((ei - si), &src_f32_1[si], &buf_u16[si], (float)(K)); + if (compare_dirty(&buf_u16[0], DIRTY, (si * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + if (compare_u16(&buf_u16[si], &ref_f32_u16_1[si], (ei - si))) { + line = __LINE__; goto error_; + } + if (compare_dirty(&buf_u16[ei], DIRTY, ((ref_n - ei) * sizeof(buf_u16[0])))) { + line = __LINE__; goto error_; + } + } + } + return; +error_: + app_return = TEST_STATUS_FAIL; + fprintf(stderr, "%s @ %d: si=%u, ei=%u\n", __func__, line, si, ei); + print_u16(stderr, buf_u16, ref_n); + print_u16(stderr, ref_f32_u16_1, ref_n); +} + + +struct aymo_testing_entry unit_tests[] = +{ + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_i16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_i16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_i16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_i16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_i16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_i16_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_u16_f32), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_u16), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_u16_f32_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_u16_1), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_u16_f32_k), + AYMO_TEST_ENTRY(test_aymo_convert_x86_sse41_f32_u16_k) +}; + + +#include "aymo_testing_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_tda8425_arm_neon_sweep.c b/tests/test_tda8425_arm_neon_sweep.c new file mode 100644 index 0000000..6e593e7 --- /dev/null +++ b/tests/test_tda8425_arm_neon_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_arm_neon.h" + + +#include "test_tda8425_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/tests/test_tda8425_none_sweep.c b/tests/test_tda8425_none_sweep.c new file mode 100644 index 0000000..d0aeaff --- /dev/null +++ b/tests/test_tda8425_none_sweep.c @@ -0,0 +1,27 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_none.h" + + +#include "test_tda8425_sweep_inline.h" diff --git a/tests/test_tda8425_sweep_inline.h b/tests/test_tda8425_sweep_inline.h new file mode 100644 index 0000000..717e9fc --- /dev/null +++ b/tests/test_tda8425_sweep_inline.h @@ -0,0 +1,330 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_cpu.h" +#include "aymo_tda8425.h" +#include "aymo_testing.h" +#include "TDA8425_emu.h" + +#ifdef TEST_FILES +#include "aymo_wave.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef M_PI +#define M_PI (3.14159265358979323846264338327950288) +#endif + +#ifndef INPUT_AMPLITUDE +#define INPUT_AMPLITUDE (.4) +#endif + +#ifndef STDEV_LIMIT +#define STDEV_LIMIT (.0002) +#endif + +#define EMU_AHEAD 4 + +static AYMO_TDA8425_DEFINE_MATH_DEFAULT(tda8425_math); + + +struct app_args { + int argc; + char** argv; + double fs; + double tt; + uint8_t reg_vl; + uint8_t reg_vr; + uint8_t reg_ba; + uint8_t reg_tr; + uint8_t reg_pp; + uint8_t reg_sf; +}; + + +static int app_return; +static struct app_args app_args; + +static TDA8425_Chip emu; +static struct aymo_(chip) chip; + +#ifdef TEST_FILES +static char* in_name; +static char* emu_out_name; +static char* chip_out_name; +static FILE* in_file; +static FILE* emu_out_file; +static FILE* chip_out_file; +#endif // TEST_FILES + + +static int arg2reg(const char* arg, uint8_t* reg) { + errno = 0; + unsigned long x = strtoul(arg, NULL, 0); + if ((x != ULONG_MAX) || (errno != ERANGE)) { + if (x <= UINT8_MAX) { + *reg = (uint8_t)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static int arg2posf(const char* arg, double* val) { + errno = 0; + double x = strtod(arg, NULL); + if (((x != +HUGE_VAL) && (x != -HUGE_VAL)) || (errno != ERANGE)) { + if (x > 0.) { + *val = (float)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static void app_boot(void) +{ + aymo_cpu_boot(); + aymo_tda8425_boot(&tda8425_math); + + app_return = TEST_STATUS_PASS; + memset(&app_args, 0, sizeof(app_args)); + + TDA8425_Chip_Ctor(&emu); +} + + +static void app_setup(void) +{ + TDA8425_Chip_Setup( + &emu, + (TDA8425_Float)app_args.fs, + (TDA8425_Float)TDA8425_Pseudo_C1_Table[TDA8425_Pseudo_Preset_1], + (TDA8425_Float)TDA8425_Pseudo_C2_Table[TDA8425_Pseudo_Preset_1], + TDA8425_Tfilter_Mode_Disabled + ); + TDA8425_Chip_Reset(&emu); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_VL, app_args.reg_vl); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_VR, app_args.reg_vr); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_BA, app_args.reg_ba); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_TR, app_args.reg_tr); + TDA8425_Chip_Write(&emu, (TDA8425_Address)TDA8425_Reg_SF, app_args.reg_sf); + TDA8425_Chip_Start(&emu); + + aymo_(ctor)(&chip, (float)app_args.fs); + aymo_(write)(&chip, 0x00u, app_args.reg_vl); + aymo_(write)(&chip, 0x01u, app_args.reg_vr); + aymo_(write)(&chip, 0x02u, app_args.reg_ba); + aymo_(write)(&chip, 0x03u, app_args.reg_tr); + aymo_(write)(&chip, 0x07u, app_args.reg_pp); + aymo_(write)(&chip, 0x08u, app_args.reg_sf); + +#ifdef TEST_FILES + in_name = aymo_test_args_to_str(0, (app_args.argc - 1), app_args.argv, "", "_in.wav"); + emu_out_name = aymo_test_args_to_str(0, (app_args.argc - 1), app_args.argv, "", "_emu_out.wav"); + chip_out_name = aymo_test_args_to_str(0, (app_args.argc - 1), app_args.argv, "", "_chip_out.wav"); + assert(in_name); + assert(emu_out_name); + assert(chip_out_name); + fprintf(stderr, "in_name: \"%s\"\n", in_name); + fprintf(stderr, "emu_out_name: \"%s\"\n", emu_out_name); + fprintf(stderr, "chip_out_name: \"%s\"\n", chip_out_name); + + in_file = fopen(in_name, "wb"); + emu_out_file = fopen(emu_out_name, "wb"); + chip_out_file = fopen(chip_out_name, "wb"); + assert(in_file); + assert(emu_out_file); + 
assert(chip_out_file); + + double fs = app_args.fs; + double T = app_args.tt; + uint32_t N = (uint32_t)fmax(16., (fs * T)); + uint16_t fmt = AYMO_WAVE_FMT_TYPE_FLOAT; + struct aymo_wave_heading wavh; + aymo_wave_heading_setup(&wavh, fmt, 2u, 32u, (uint32_t)app_args.fs, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, in_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 32u, (uint32_t)app_args.fs, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, emu_out_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 32u, (uint32_t)app_args.fs, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, chip_out_file); +#endif // TEST_FILES +} + + +static void app_teardown(void) +{ + TDA8425_Chip_Stop(&emu); + TDA8425_Chip_Dtor(&emu); + + aymo_(dtor)(&chip); + +#ifdef TEST_FILES + fclose(in_file); + fclose(emu_out_file); + fclose(chip_out_file); +#endif // TEST_FILES +} + + +static void app_run(void) +{ + double fs = app_args.fs; + double T = app_args.tt; + long N = (long)fmax(16., (fs * T)); + double f0 = 10.; + double f1 = fmin((fs / 2.), 21000.); + + TDA8425_Chip_Process_Data emu_data; + memset(&emu_data, 0, sizeof(emu_data)); + float emu_y[EMU_AHEAD][2] = {{0}}; + float chip_x[2] = {0}; + float chip_y[2] = {0}; + double sum_el = 0.; + double sum_eel = 0.; + double sum_er = 0.; + double sum_eer = 0.; + long k; + + for (k = 0; k < N; ++k) { + double t = ((double)k / fs); + double th = ((2. * M_PI * f0 * T) * (pow((f1 / f0), (t / T)) - 1.) / log(f1 / f0)); + th = fmod(th, (2. * M_PI)); + float xl = (float)(INPUT_AMPLITUDE * sin(th)); + float xr = (float)(INPUT_AMPLITUDE * cos(th)); + + emu_data.inputs[0][0] = (TDA8425_Float)xl; + emu_data.inputs[0][1] = (TDA8425_Float)xr; + chip_x[0] = xl; + chip_x[1] = xr; + + TDA8425_Chip_Process(&emu, &emu_data); + for (int i = (EMU_AHEAD - 1); i > 0; --i) { + emu_y[i][0] = emu_y[i-1][0]; + emu_y[i][1] = emu_y[i-1][1]; + } + emu_y[0][0] = (float)emu_data.outputs[0]; + emu_y[0][1] = (float)emu_data.outputs[1]; + + aymo_(process_f32)(&chip, 1u, chip_x, chip_y); + + double el = (emu_y[EMU_AHEAD-1][0] - chip_y[0]); + double er = (emu_y[EMU_AHEAD-1][1] - chip_y[1]); + sum_el += el; + sum_er += er; + sum_eel += (el * el); + sum_eer += (er * er); + +#ifdef TEST_FILES + if (in_file) { + fwrite(chip_x, sizeof(float), 2, in_file); + } + if (emu_out_file) { + fwrite(emu_y[EMU_AHEAD-1], sizeof(float), 2, emu_out_file); + } + if (chip_out_file) { + fwrite(chip_y, sizeof(float), 2, chip_out_file); + } +#endif // TEST_FILES + } + + double avg_el = (sum_el / (double)k); + double avg_er = (sum_er / (double)k); + double avg_eel = (sum_eel / (double)k); + double avg_eer = (sum_eer / (double)k); + double var_el = fabs(avg_eel - (avg_el * avg_el)); + double var_er = fabs(avg_eer - (avg_er * avg_er)); + double stdev_el = sqrt(var_el); + double stdev_er = sqrt(var_er); + + fprintf(stderr, "L: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_el, N, k, sum_el, sum_eel); + fprintf(stderr, "R: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_er, N, k, sum_er, sum_eer); + + if ((stdev_el > STDEV_LIMIT) || (stdev_er > STDEV_LIMIT)) { + app_return = TEST_STATUS_FAIL; + } +} + + +int main(int argc, char** argv) +{ + app_boot(); + + app_args.argc = argc; + app_args.argv = argv; + + if (argc != 9) { + fprintf(stderr, "USAGE:\t%s VL VR BA TR PP SF fs tt\n", (argc ?
argv[0] : "test_exe")); + app_return = TEST_STATUS_HARD; + goto catch_; + } + + if (arg2reg(argv[1], &app_args.reg_vl)) { + perror("VL"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[2], &app_args.reg_vr)) { + perror("VR"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[3], &app_args.reg_ba)) { + perror("BA"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[4], &app_args.reg_tr)) { + perror("TR"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[5], &app_args.reg_pp)) { + perror("PP"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2reg(argv[6], &app_args.reg_sf)) { + perror("SF"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2posf(argv[7], &app_args.fs)) { + perror("fs"); app_return = TEST_STATUS_HARD; goto catch_; + } + if (arg2posf(argv[8], &app_args.tt)) { + perror("tt"); app_return = TEST_STATUS_HARD; goto catch_; + } + + app_setup(); + app_run(); + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/test_tda8425_x86_avx2_sweep.c b/tests/test_tda8425_x86_avx2_sweep.c new file mode 100644 index 0000000..ddaec79 --- /dev/null +++ b/tests/test_tda8425_x86_avx2_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_avx2.h" + + +#include "test_tda8425_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/tests/test_tda8425_x86_sse41_sweep.c b/tests/test_tda8425_x86_sse41_sweep.c new file mode 100644 index 0000000..fd33de9 --- /dev/null +++ b/tests/test_tda8425_x86_sse41_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_tda8425_x86_sse41.h" + + +#include "test_tda8425_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_ym7128_arm_neon_sweep.c b/tests/test_ym7128_arm_neon_sweep.c new file mode 100644 index 0000000..6c8b9b0 --- /dev/null +++ b/tests/test_ym7128_arm_neon_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_ARM_NEON + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_arm_neon.h" + + +#include "test_ym7128_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_ARM_NEON diff --git a/tests/test_ym7128_none_sweep.c b/tests/test_ym7128_none_sweep.c new file mode 100644 index 0000000..da770c0 --- /dev/null +++ b/tests/test_ym7128_none_sweep.c @@ -0,0 +1,27 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_none.h" + + +#include "test_ym7128_sweep_inline.h" diff --git a/tests/test_ym7128_sweep_inline.h b/tests/test_ym7128_sweep_inline.h new file mode 100644 index 0000000..defa42d --- /dev/null +++ b/tests/test_ym7128_sweep_inline.h @@ -0,0 +1,316 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_cpu.h" +#include "aymo_testing.h" +#include "aymo_ym7128.h" +#include "YM7128B_emu.h" + +#ifdef TEST_FILES +#include "aymo_wave.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +#ifndef M_PI +#define M_PI (3.14159265358979323846264338327950288) +#endif + +#ifndef DUTY_RATIO +#define DUTY_RATIO (.8) +#endif + +#ifndef INPUT_AMPLITUDE +#define INPUT_AMPLITUDE (.25) +#endif + +#ifndef STDEV_LIMIT +#define STDEV_LIMIT (.1) +#endif + + +struct test_args { + int argc; + char** argv; + double tt; + uint8_t regs[YM7128B_Reg_Count]; +}; + + +static int app_return; +static struct test_args test_args; + +static YM7128B_ChipFixed emu; +static struct aymo_(chip) chip; + +#ifdef TEST_FILES +static char* in_name; +static char* emu_out_name; +static char* chip_out_name; +static FILE* in_file; +static FILE* emu_out_file; +static FILE* chip_out_file; +#endif // TEST_FILES + + +static int arg2reg(const char* arg, uint8_t* reg) { + errno = 0; + unsigned long x = strtoul(arg, NULL, 0); + if ((x != ULONG_MAX) || (errno != ERANGE)) { + if (x <= UINT8_MAX) { + *reg = (uint8_t)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static int arg2posf(const char* arg, double* val) { + errno = 0; + double x = strtod(arg, NULL); + if (((x != +HUGE_VAL) && (x != -HUGE_VAL)) || (errno != ERANGE)) { + if (x > 0.) { + *val = (float)x; + } else { + errno = ERANGE; + } + } + return errno; +} + + +static void app_boot(void) +{ + aymo_cpu_boot(); + aymo_ym7128_boot(); + + app_return = TEST_STATUS_PASS; + memset(&test_args, 0, sizeof(test_args)); + + YM7128B_ChipFixed_Ctor(&emu); +} + + +static void app_setup(void) +{ + YM7128B_ChipFixed_Reset(&emu); + for (int i = 0; i < YM7128B_Reg_Count; ++i) { + YM7128B_ChipFixed_Write(&emu, (YM7128B_Address)i, test_args.regs[i]); + } + + aymo_(ctor)(&chip); + for (int i = 0; i < AYMO_YM7128_REG_COUNT; ++i) { + aymo_(write)(&chip, (uint16_t)i, test_args.regs[i]); + } + +#ifdef TEST_FILES + in_name = aymo_test_args_to_str(0, (test_args.argc - 1), test_args.argv, "", "_in.wav"); + emu_out_name = aymo_test_args_to_str(0, (test_args.argc - 1), test_args.argv, "", "_emu_out.wav"); + chip_out_name = aymo_test_args_to_str(0, (test_args.argc - 1), test_args.argv, "", "_chip_out.wav"); + assert(in_name); + assert(emu_out_name); + assert(chip_out_name); + fprintf(stderr, "in_name: \"%s\"\n", in_name); + fprintf(stderr, "emu_out_name: \"%s\"\n", emu_out_name); + fprintf(stderr, "chip_out_name: \"%s\"\n", chip_out_name); + + in_file = fopen(in_name, "wb"); + emu_out_file = fopen(emu_out_name, "wb"); + chip_out_file = fopen(chip_out_name, "wb"); + assert(in_file); + assert(emu_out_file); + assert(chip_out_file); + + double fs = (double)YM7128B_Input_Rate; + double T = (test_args.tt * DUTY_RATIO); + uint32_t N = (uint32_t)fmax(16., (fs * (T / DUTY_RATIO))); + uint16_t fmt = AYMO_WAVE_FMT_TYPE_PCM; + struct aymo_wave_heading wavh; + aymo_wave_heading_setup(&wavh, fmt, 1u, 16u, (uint32_t)YM7128B_Input_Rate, (1u * N)); + fwrite(&wavh, sizeof(wavh), 1u, in_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 16u, (uint32_t)YM7128B_Output_Rate, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, emu_out_file); + aymo_wave_heading_setup(&wavh, fmt, 2u, 16u, (uint32_t)YM7128B_Output_Rate, (2u * N)); + fwrite(&wavh, sizeof(wavh), 1u, chip_out_file); +#endif // TEST_FILES +} + + +static void app_teardown(void) +{ + aymo_(dtor)(&chip); + +#ifdef TEST_FILES + fclose(in_file); + fclose(emu_out_file); + 
fclose(chip_out_file); +#endif // TEST_FILES +} + + +static void app_run(void) +{ + double fs = (double)YM7128B_Input_Rate; + double T = (test_args.tt * DUTY_RATIO); + long N = (long)fmax(16., (fs * (T / DUTY_RATIO))); + double f0 = 10.; + double f1 = fmin((fs / 2.), 21000.); + + YM7128B_ChipFixed_Process_Data emu_data; + memset(&emu_data, 0, sizeof(emu_data)); + int16_t chip_x[1] = {0}; + int16_t chip_y[4] = {0}; + double sum_e0l = 0.; + double sum_ee0l = 0.; + double sum_e0r = 0.; + double sum_ee0r = 0.; + double sum_e1l = 0.; + double sum_ee1l = 0.; + double sum_e1r = 0.; + double sum_ee1r = 0.; + long k; + + for (k = 0; k < N; ++k) { + double xx = 0.; + if ((double)k < ((double)N * DUTY_RATIO)) { + double t = ((double)k / fs); + double th = ((2. * M_PI * f0 * T) * (pow((f1 / f0), (t / T)) - 1.) / log(f1 / f0)); + th = fmod(th, (2. * M_PI)); + xx = (INPUT_AMPLITUDE * cos(th)); + } + int16_t x = (int16_t)(xx * (double)YM7128B_Fixed_Max); + + emu_data.inputs[0] = x; + chip_x[0] = x; + + YM7128B_ChipFixed_Process(&emu, &emu_data); + + aymo_(process_i16)(&chip, 1u, chip_x, chip_y); + + double e0l = ((double)emu_data.outputs[0][0] - (double)chip_y[0]); + double e0r = ((double)emu_data.outputs[1][0] - (double)chip_y[1]); + double e1l = ((double)emu_data.outputs[0][1] - (double)chip_y[2]); + double e1r = ((double)emu_data.outputs[1][1] - (double)chip_y[3]); + sum_e0l += e0l; sum_ee0l += (e0l * e0l); + sum_e0r += e0r; sum_ee0r += (e0r * e0r); + sum_e1l += e1l; sum_ee1l += (e1l * e1l); + sum_e1r += e1r; sum_ee1r += (e1r * e1r); + +#ifdef TEST_FILES + if (in_file) { + fwrite(chip_x, sizeof(int16_t), 1, in_file); + } + if (emu_out_file) { + fwrite(&emu_data.outputs[0][0], sizeof(int16_t), 1, emu_out_file); + fwrite(&emu_data.outputs[1][0], sizeof(int16_t), 1, emu_out_file); + fwrite(&emu_data.outputs[0][1], sizeof(int16_t), 1, emu_out_file); + fwrite(&emu_data.outputs[1][1], sizeof(int16_t), 1, emu_out_file); + } + if (chip_out_file) { + fwrite(chip_y, sizeof(int16_t), 4, chip_out_file); + } +#endif // TEST_FILES + } + + double avg_e0l = (sum_e0l / (double)k); + double avg_ee0l = (sum_ee0l / (double)k); + double var_e0l = fabs(avg_ee0l - (avg_e0l * avg_e0l)); + double stdev_e0l = sqrt(var_e0l); + fprintf(stderr, "L0: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e0l, N, k, sum_e0l, sum_ee0l); + + double avg_e0r = (sum_e0r / (double)k); + double avg_ee0r = (sum_ee0r / (double)k); + double var_e0r = fabs(avg_ee0r - (avg_e0r * avg_e0r)); + double stdev_e0r = sqrt(var_e0r); + fprintf(stderr, "R0: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e0r, N, k, sum_e0r, sum_ee0r); + + double avg_e1l = (sum_e1l / (double)k); + double avg_ee1l = (sum_ee1l / (double)k); + double var_e1l = fabs(avg_ee1l - (avg_e1l * avg_e1l)); + double stdev_e1l = sqrt(var_e1l); + fprintf(stderr, "L1: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e1l, N, k, sum_e1l, sum_ee1l); + + double avg_e1r = (sum_e1r / (double)k); + double avg_ee1r = (sum_ee1r / (double)k); + double var_e1r = fabs(avg_ee1r - (avg_e1r * avg_e1r)); + double stdev_e1r = sqrt(var_e1r); + fprintf(stderr, "R1: stdev_e=%g N=%ld k=%ld sum_e=%g sum_ee=%g\n", stdev_e1r, N, k, sum_e1r, sum_ee1r); + + double stdev_e = sqrt(var_e0l + var_e0r + var_e1l + var_e1r); + if (stdev_e > STDEV_LIMIT) { + app_return = TEST_STATUS_FAIL; + } +} + + +int main(int argc, char** argv) +{ + const int argo = 2; + + app_boot(); + + test_args.argc = argc; + test_args.argv = argv; + + if ((argc < argo) || (argc > (argo + AYMO_YM7128_REG_COUNT))) { + fprintf(stderr, "USAGE:\t%s 
tt REGn...\n", (argc ? argv[0] : "test_exe")); + app_return = TEST_STATUS_HARD; + goto catch_; + } + + if (arg2posf(argv[1], &test_args.tt)) { + perror("tt"); app_return = TEST_STATUS_HARD; goto catch_; + } + + int arge = (argo + AYMO_YM7128_REG_COUNT); + if (arge > argc) { + arge = argc; + } + for (int i = argo; i < arge; ++i) { + if (arg2reg(argv[i], &test_args.regs[i - argo])) { + char text[16]; sprintf(text, "%d", i); + perror(text); app_return = TEST_STATUS_HARD; goto catch_; + } + } + + app_setup(); + app_run(); + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/test_ym7128_x86_sse41_sweep.c b/tests/test_ym7128_x86_sse41_sweep.c new file mode 100644 index 0000000..e74a0df --- /dev/null +++ b/tests/test_ym7128_x86_sse41_sweep.c @@ -0,0 +1,31 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ym7128_x86_sse41.h" + + +#include "test_ym7128_sweep_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_ymf262_arm_neon_compare.c b/tests/test_ymf262_arm_neon_compare.c new file mode 100644 index 0000000..6b83ada --- /dev/null +++ b/tests/test_ymf262_arm_neon_compare.c @@ -0,0 +1,77 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_arm_neon.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + // TODO: + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + // TODO: + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + // TODO: + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" diff --git a/tests/test_ymf262_compare_epilogue_inline.h b/tests/test_ymf262_compare_epilogue_inline.h new file mode 100644 index 0000000..e67d121 --- /dev/null +++ b/tests/test_ymf262_compare_epilogue_inline.h @@ -0,0 +1,165 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" + +#include + + +static int app_boot(void) +{ + app_return = TEST_STATUS_HARD; + + aymo_cpu_boot(); + + score_data = NULL; + score_size = 0u; + memset(&score, 0, sizeof(score)); + + memset(&nuked_chip, 0, sizeof(nuked_chip)); + memset(&nuked_out, 0, sizeof(nuked_out)); + memset(&aymo_chip, 0, sizeof(aymo_chip)); + + return TEST_STATUS_PASS; +} + + +static int app_args_init(int argc, char** argv) +{ + memset(&app_args, 0, sizeof(app_args)); + + app_args.argc = argc; + app_args.argv = argv; + + app_args.score_type = aymo_score_type_unknown; + + return TEST_STATUS_PASS; +} + + +static int app_args_parse(void) +{ + if (app_args.argc != 3) { + fprintf(stderr, "USAGE:\t%s SCORETYPE SCOREPATH\n", + (app_args.argc ? 
app_args.argv[0] : "test_exe")); + return TEST_STATUS_HARD; + } + + app_args.score_type_cstr = app_args.argv[1]; + app_args.score_type = aymo_score_ext_to_type(app_args.score_type_cstr); + if (app_args.score_type == aymo_score_type_unknown) { + fprintf(stderr, "Unsupported score format: %s\n", app_args.score_type_cstr); + return TEST_STATUS_HARD; + } + + app_args.score_path_cstr = app_args.argv[2]; + + return TEST_STATUS_PASS; +} + + +static int app_setup(void) +{ + score.base.vt = aymo_score_type_to_vt(app_args.score_type); + aymo_score_ctor(&score.base); + + if (aymo_file_load(app_args.score_path_cstr, &score_data, &score_size)) { + perror("aymo_file_load()"); + return TEST_STATUS_HARD; + } + + assert(score_size <= UINT32_MAX); + if (aymo_score_load(&score.base, score_data, (uint32_t)score_size)) { + fprintf(stderr, "Cannot load score: %s\n", app_args.argv[2]); + return TEST_STATUS_HARD; + } + + OPL3_Reset(&nuked_chip, (uint32_t)AYMO_YMF262_SAMPLE_RATE); + aymo_(ctor)(&aymo_chip); + + return TEST_STATUS_PASS; +} + + +static void app_teardown(void) +{ + aymo_(dtor)(&aymo_chip); + + if (score.base.vt) { + aymo_score_unload(&score.base); + aymo_score_dtor(&score.base); + } + aymo_file_unload(score_data); + score_data = NULL; +} + + +static int app_run(void) +{ + struct aymo_score_status* status = aymo_score_get_status(&score.base); + + while (!(status->flags & AYMO_SCORE_FLAG_EOF)) { + if (compare_chips()) { + fprintf(stderr, "Chips do not match\n"); + return TEST_STATUS_FAIL; + } + + aymo_score_tick(&score.base, 1u); + + if (status->flags & AYMO_SCORE_FLAG_EVENT) { + OPL3_WriteReg(&nuked_chip, status->address, status->value); + aymo_(write)(&aymo_chip, status->address, status->value); + } + + OPL3_Generate4Ch(&nuked_chip, &nuked_out[0]); + aymo_(tick)(&aymo_chip, 1u); + } + return TEST_STATUS_PASS; +} + + +int main(int argc, char** argv) +{ + app_return = app_boot(); + if (app_return) goto catch_; + + app_return = app_args_init(argc, argv); + if (app_return) goto catch_; + + app_return = app_args_parse(); + if (app_return) goto catch_; + + app_return = app_setup(); + if (app_return) goto catch_; + + app_return = app_run(); + if (app_return) goto catch_; + + app_return = TEST_STATUS_PASS; + goto finally_; + +catch_: +finally_: + app_teardown(); + return app_return; +} + + +AYMO_CXX_EXTERN_C_END diff --git a/tests/test_ymf262_compare_prologue_inline.h b/tests/test_ymf262_compare_prologue_inline.h new file mode 100644 index 0000000..6828031 --- /dev/null +++ b/tests/test_ymf262_compare_prologue_inline.h @@ -0,0 +1,77 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . 
+*/ + +#include "aymo_file.h" +#include "aymo_score_dro.h" +#include "aymo_score_avd.h" +#include "aymo_score_imf.h" +#include "aymo_testing.h" +#include "aymo_ymf262.h" + +#include "opl3.h" + +#include +#include + +AYMO_CXX_EXTERN_C_BEGIN + + +struct app_args { + int argc; + char** argv; + + // Score parameters + const char* score_path_cstr; // NULL or "-" for stdin + const char* score_type_cstr; // NULL uses score file extension + enum aymo_score_type score_type; +}; + + +// copied from opl3.c +static const uint8_t mt[16] = { + 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 20, 24, 24, 30, 30 +}; + + +static int app_return; + +static struct app_args app_args; + +static void* score_data; +static size_t score_size; +static union app_scores { + struct aymo_score_instance base; + struct aymo_score_avd_instance avd; + struct aymo_score_dro_instance dro; + struct aymo_score_imf_instance imf; +} score; + +static struct aymo_(chip) aymo_chip; +static opl3_chip nuked_chip; +static int16_t nuked_out[4]; + + +#undef assert +#define assert(x) { \ + if (!(x)) { \ + fprintf(stderr, "@ %d: FAILED assert(%s)\n", \ + __LINE__, (#x)); goto catch_; \ + } \ +}// diff --git a/tests/test_ymf262_none_compare.c b/tests/test_ymf262_none_compare.c new file mode 100644 index 0000000..f2337fc --- /dev/null +++ b/tests/test_ymf262_none_compare.c @@ -0,0 +1,73 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_none.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + // TODO: + + return 0; +//catch_: +// return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + // TODO: + + return 0; +//catch_: +// return 1; +} + + +static int compare_chips(void) +{ + // TODO: + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + assert(!compare_ch2xs(ch2x)); + } + + for (int slot = 0; slot < 36; ++slot) { + assert(!compare_slots(slot)); + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" diff --git a/tests/test_ymf262_x86_avx2_compare.c b/tests/test_ymf262_x86_avx2_compare.c new file mode 100644 index 0000000..198042f --- /dev/null +++ b/tests/test_ymf262_x86_avx2_compare.c @@ -0,0 +1,170 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. + +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_AVX2 + +#include "aymo_cpu_x86_avx2_inline.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_avx2.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + int word = aymo_ymf262_slot_to_word[slot_]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(slot_group)* sg = &aymo_chip.sg[sgi]; + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_slot* slot = &nuked_chip.slot[slot_]; + (void)cg; + + // TODO: Commented stuff + assert((int16_t)vextractn(sg->wg_out, sgo) == slot->out); + assert((int16_t)vextractn(sg->wg_fb_mulhi, sgo) == (int16_t)(slot->channel->fb ? (0x40 << slot->channel->fb) : 0)); +// assert(vextractn(sg->wg_fbmod, sgo) == slot->fbmod); +// assert(vextractn(sg->wg_mod, sgo) == *slot->mod); + assert((int16_t)vextractn(sg->wg_prout, sgo) == slot->prout); + assert((uint16_t)vextractn(sg->eg_rout, sgo) == slot->eg_rout); + assert((uint16_t)vextractn(sg->eg_out, sgo) == slot->eg_out); +// assert(vextractn(sg->eg_inc, sgo) == slot->eg_inc); + assert((uint16_t)vextractn(sg->eg_gen, sgo) == slot->eg_gen); +// assert(vextractn(sg->eg_rate, sgo) == slot->eg_rate); +// assert(vextractn(sg->eg_ksl, sgo) == slot->eg_ksl); + assert((int16_t)vextractn(sg->eg_tremolo_am, sgo) == *slot->trem); + assert((uint16_t)-vextractn(sg->pg_vib, sgo) == slot->reg_vib); + //assert(vextractn(sg->eg_egt, sgo) == slot->reg_type); + //assert(vextractn(sg->eg_ksr, sgo) == slot->reg_ksr); + assert((uint16_t)vextractn(sg->pg_mult_x2, sgo) == mt[slot->reg_mult]); +//FIXME: assert((uint16_t)vextractn(sg->eg_tl_x4, sgo) == slot->reg_tl * 4U); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 12) & 15) == slot->reg_ar); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 8) & 15) == slot->reg_dr); + assert((uint16_t)vextractn(sg->eg_sl, sgo) == slot->reg_sl); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 0) & 15) == slot->reg_rr); + //assert(vextractn(sg->wg_wf, sgo) == slot->reg_wf); + assert((uint16_t)vextractn(sg->eg_key, sgo) == slot->key); + vi32_t pg_phase_vv = (aymo_(sgo_side)[sgo] ? 
sg->pg_phase_hi : sg->pg_phase_lo); + uint32_t pg_phase = vvextractn(pg_phase_vv, aymo_(sgo_cell)[sgo]); + assert(pg_phase == slot->pg_phase); + assert((uint16_t)vextractn(sg->pg_phase_out, sgo) == slot->pg_phase_out); + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + int word = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_channel* channel = &nuked_chip.channel[ch2x]; + + // TODO: Commented stuff + //int16_t* out[0]; + //int16_t* out[1]; + //int16_t* out[2]; + //int16_t* out[3]; + //int32_t leftpan; + //int32_t rightpan; + //uint8_t chtype; + assert((uint16_t)vextractn(cg->pg_fnum, sgo) == channel->f_num); + assert((uint16_t)vextractn(cg->pg_block, sgo) == channel->block); + //uint8_t fb; // compared at slot group level + //uint8_t con; + //uint8_t alg; + assert((uint16_t)vextractn(cg->eg_ksv, sgo) == channel->ksv); + assert((uint16_t)vextractn(cg->og_ch_gate_a, sgo) == channel->cha); + assert((uint16_t)vextractn(cg->og_ch_gate_b, sgo) == channel->chb); + assert((uint16_t)vextractn(cg->og_ch_gate_c, sgo) == channel->chc); + assert((uint16_t)vextractn(cg->og_ch_gate_d, sgo) == channel->chd); + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + _mm_sfence(); + + // TODO: Commented stuff + assert((uint16_t)aymo_chip.tm_timer == nuked_chip.timer); + assert(aymo_chip.eg_timer == nuked_chip.eg_timer); + assert(aymo_chip.eg_timerrem == nuked_chip.eg_timerrem); + assert(aymo_chip.eg_state == nuked_chip.eg_state); + assert((uint16_t)vextractn(aymo_chip.eg_add, 0) == nuked_chip.eg_add); + //uint8_t newm; + //uint8_t nts; + //uint8_t rhy; + assert(aymo_chip.pg_vibpos == nuked_chip.vibpos); + assert(aymo_chip.eg_vibshift == nuked_chip.vibshift); + //assert((uint16_t)vextractn(aymo_chip.eg_tremolo, 0) == nuked_chip.tremolo); + assert(aymo_chip.eg_tremolopos == nuked_chip.tremolopos); + assert(aymo_chip.eg_tremoloshift == nuked_chip.tremoloshift); + assert(aymo_chip.ng_noise == nuked_chip.noise); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 0) == nuked_out[0]); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 1) == nuked_out[1]); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 2) == nuked_out[2]); + assert((int16_t)_mm_extract_epi16(aymo_chip.og_out, 3) == nuked_out[3]); + assert(aymo_chip.rm_hh_bit2 == nuked_chip.rm_hh_bit2); + assert(aymo_chip.rm_hh_bit3 == nuked_chip.rm_hh_bit3); + assert(aymo_chip.rm_hh_bit7 == nuked_chip.rm_hh_bit7); + assert(aymo_chip.rm_hh_bit8 == nuked_chip.rm_hh_bit8); + assert(aymo_chip.rm_tc_bit3 == nuked_chip.rm_tc_bit3); + assert(aymo_chip.rm_tc_bit5 == nuked_chip.rm_tc_bit5); + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_AVX2 diff --git a/tests/test_ymf262_x86_avx_compare.c b/tests/test_ymf262_x86_avx_compare.c new file mode 100644 index 0000000..bcc1d3a --- /dev/null +++ b/tests/test_ymf262_x86_avx_compare.c @@ -0,0 +1,170 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_cpu_x86_sse41_inline.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_sse41.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + int word = aymo_ymf262_slot_to_word[slot_]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(slot_group)* sg = &aymo_chip.sg[sgi]; + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_slot* slot = &nuked_chip.slot[slot_]; + (void)cg; + + // TODO: Commented stuff + assert((int16_t)vextractn(sg->wg_out, sgo) == slot->out); + assert((int16_t)vextractn(sg->wg_fb_mulhi, sgo) == (int16_t)(slot->channel->fb ? (0x40 << slot->channel->fb) : 0)); +// assert(vextractn(sg->wg_fbmod, sgo) == slot->fbmod); +// assert(vextractn(sg->wg_mod, sgo) == *slot->mod); + assert((int16_t)vextractn(sg->wg_prout, sgo) == slot->prout); + assert((uint16_t)vextractn(sg->eg_rout, sgo) == slot->eg_rout); + assert((uint16_t)vextractn(sg->eg_out, sgo) == slot->eg_out); +// assert(vextractn(sg->eg_inc, sgo) == slot->eg_inc); + assert((uint16_t)vextractn(sg->eg_gen, sgo) == slot->eg_gen); +// assert(vextractn(sg->eg_rate, sgo) == slot->eg_rate); +// assert(vextractn(sg->eg_ksl, sgo) == slot->eg_ksl); + assert((int16_t)vextractn(sg->eg_tremolo_am, sgo) == *slot->trem); + assert((uint16_t)-vextractn(sg->pg_vib, sgo) == slot->reg_vib); + //assert(vextractn(sg->eg_egt, sgo) == slot->reg_type); + //assert(vextractn(sg->eg_ksr, sgo) == slot->reg_ksr); + assert((uint16_t)vextractn(sg->pg_mult_x2, sgo) == mt[slot->reg_mult]); + assert((uint16_t)vextractn(sg->eg_tl_x4, sgo) == slot->reg_tl * 4U); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 12) & 15) == slot->reg_ar); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 8) & 15) == slot->reg_dr); + assert((uint16_t)vextractn(sg->eg_sl, sgo) == slot->reg_sl); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 0) & 15) == slot->reg_rr); + //assert(vextractn(sg->wg_wf, sgo) == slot->reg_wf); + assert((uint16_t)vextractn(sg->eg_key, sgo) == slot->key); + vi32_t pg_phase_vv = (aymo_(sgo_side)[sgo] ? 
sg->pg_phase_hi : sg->pg_phase_lo); + uint32_t pg_phase = vvextractn(pg_phase_vv, aymo_(sgo_cell)[sgo]); + assert(pg_phase == slot->pg_phase); + assert((uint16_t)vextractn(sg->pg_phase_out, sgo) == slot->pg_phase_out); + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + int word = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_channel* channel = &nuked_chip.channel[ch2x]; + + // TODO: Commented stuff + //int16_t* out[0]; + //int16_t* out[1]; + //int16_t* out[2]; + //int16_t* out[3]; + //int32_t leftpan; + //int32_t rightpan; + //uint8_t chtype; + assert((uint16_t)vextractn(cg->pg_fnum, sgo) == channel->f_num); + assert((uint16_t)vextractn(cg->pg_block, sgo) == channel->block); + //uint8_t fb; // compared at slot group level + //uint8_t con; + //uint8_t alg; + assert((uint16_t)vextractn(cg->eg_ksv, sgo) == channel->ksv); + assert((uint16_t)vextractn(cg->og_ch_gate_a, sgo) == channel->cha); + assert((uint16_t)vextractn(cg->og_ch_gate_b, sgo) == channel->chb); + assert((uint16_t)vextractn(cg->og_ch_gate_c, sgo) == channel->chc); + assert((uint16_t)vextractn(cg->og_ch_gate_d, sgo) == channel->chd); + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + _mm_sfence(); + + // TODO: Commented stuff + assert((uint16_t)aymo_chip.tm_timer == nuked_chip.timer); + assert(aymo_chip.eg_timer == nuked_chip.eg_timer); + assert(aymo_chip.eg_timerrem == nuked_chip.eg_timerrem); + assert(aymo_chip.eg_state == nuked_chip.eg_state); + assert((uint16_t)vextract(aymo_chip.eg_add, 0) == nuked_chip.eg_add); + //uint8_t newm; + //uint8_t nts; + //uint8_t rhy; + assert(aymo_chip.pg_vibpos == nuked_chip.vibpos); + assert(aymo_chip.eg_vibshift == nuked_chip.vibshift); + //assert((uint16_t)vextractn(aymo_chip.eg_tremolo, 0) == nuked_chip.tremolo); + assert(aymo_chip.eg_tremolopos == nuked_chip.tremolopos); + assert(aymo_chip.eg_tremoloshift == nuked_chip.tremoloshift); + assert(aymo_chip.ng_noise == nuked_chip.noise); + assert((int16_t)vextract(aymo_chip.og_out, 0) == nuked_out[0]); + assert((int16_t)vextract(aymo_chip.og_out, 1) == nuked_out[1]); + assert((int16_t)vextract(aymo_chip.og_out, 2) == nuked_out[2]); + assert((int16_t)vextract(aymo_chip.og_out, 3) == nuked_out[3]); + assert(aymo_chip.rm_hh_bit2 == nuked_chip.rm_hh_bit2); + assert(aymo_chip.rm_hh_bit3 == nuked_chip.rm_hh_bit3); + assert(aymo_chip.rm_hh_bit7 == nuked_chip.rm_hh_bit7); + assert(aymo_chip.rm_hh_bit8 == nuked_chip.rm_hh_bit8); + assert(aymo_chip.rm_tc_bit3 == nuked_chip.rm_tc_bit3); + assert(aymo_chip.rm_tc_bit5 == nuked_chip.rm_tc_bit5); + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41 diff --git a/tests/test_ymf262_x86_sse41_compare.c b/tests/test_ymf262_x86_sse41_compare.c new file mode 100644 index 0000000..2d6f4ec --- /dev/null +++ b/tests/test_ymf262_x86_sse41_compare.c @@ -0,0 +1,170 @@ +/* +AYMO - Accelerated YaMaha Operator +Copyright (c) 2023-2024 Andrea Zoppi. + +This file is part of AYMO. 
+ +AYMO is free software: you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation, either version 2.1 of the License, or (at your option) +any later version. + +AYMO is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +more details. + +You should have received a copy of the GNU Lesser General Public License +along with AYMO. If not, see . +*/ + +#include "aymo.h" +#ifdef AYMO_CPU_SUPPORT_X86_SSE41 + +#include "aymo_cpu_x86_sse41_inline.h" +#define AYMO_KEEP_SHORTHANDS +#include "aymo_ymf262_x86_sse41.h" + +#include "test_ymf262_compare_prologue_inline.h" + + +static int compare_slots(int slot_) +{ + if (slot_ >= 36) { + return 0; // ignore + } + + int word = aymo_ymf262_slot_to_word[slot_]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(slot_group)* sg = &aymo_chip.sg[sgi]; + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_slot* slot = &nuked_chip.slot[slot_]; + (void)cg; + + // TODO: Commented stuff + assert((int16_t)vextractn(sg->wg_out, sgo) == slot->out); + assert((int16_t)vextractn(sg->wg_fb_mulhi, sgo) == (int16_t)(slot->channel->fb ? (0x40 << slot->channel->fb) : 0)); +// assert(vextractn(sg->wg_fbmod, sgo) == slot->fbmod); +// assert(vextractn(sg->wg_mod, sgo) == *slot->mod); + assert((int16_t)vextractn(sg->wg_prout, sgo) == slot->prout); + assert((uint16_t)vextractn(sg->eg_rout, sgo) == slot->eg_rout); + assert((uint16_t)vextractn(sg->eg_out, sgo) == slot->eg_out); +// assert(vextractn(sg->eg_inc, sgo) == slot->eg_inc); + assert((uint16_t)vextractn(sg->eg_gen, sgo) == slot->eg_gen); +// assert(vextractn(sg->eg_rate, sgo) == slot->eg_rate); +// assert(vextractn(sg->eg_ksl, sgo) == slot->eg_ksl); + assert((int16_t)vextractn(sg->eg_tremolo_am, sgo) == *slot->trem); + assert((uint16_t)-vextractn(sg->pg_vib, sgo) == slot->reg_vib); + //assert(vextractn(sg->eg_egt, sgo) == slot->reg_type); + //assert(vextractn(sg->eg_ksr, sgo) == slot->reg_ksr); + assert((uint16_t)vextractn(sg->pg_mult_x2, sgo) == mt[slot->reg_mult]); +//FIXME: assert((uint16_t)vextractn(sg->eg_tl_x4, sgo) == slot->reg_tl * 4U); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 12) & 15) == slot->reg_ar); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 8) & 15) == slot->reg_dr); + assert((uint16_t)vextractn(sg->eg_sl, sgo) == slot->reg_sl); + assert((((uint16_t)vextractn(sg->eg_adsr, sgo) >> 0) & 15) == slot->reg_rr); + //assert(vextractn(sg->wg_wf, sgo) == slot->reg_wf); + assert((uint16_t)vextractn(sg->eg_key, sgo) == slot->key); + vi32_t pg_phase_vv = (aymo_(sgo_side)[sgo] ? 
sg->pg_phase_hi : sg->pg_phase_lo); + uint32_t pg_phase = vvextractn(pg_phase_vv, aymo_(sgo_cell)[sgo]); + assert(pg_phase == slot->pg_phase); + assert((uint16_t)vextractn(sg->pg_phase_out, sgo) == slot->pg_phase_out); + + return 0; +catch_: + return 1; +} + + +static int compare_ch2xs(int ch2x) +{ + if (ch2x >= 18) { + return 0; // ignore + } + + int word = aymo_ymf262_ch2x_to_word[ch2x][0]; + int sgi = (word / AYMO_(SLOT_GROUP_LENGTH)); + int sgo = (word % AYMO_(SLOT_GROUP_LENGTH)); + int cgi = aymo_(sgi_to_cgi)(sgi); + const struct aymo_(ch2x_group)* cg = &aymo_chip.cg[cgi]; + const opl3_channel* channel = &nuked_chip.channel[ch2x]; + + // TODO: Commented stuff + //int16_t* out[0]; + //int16_t* out[1]; + //int16_t* out[2]; + //int16_t* out[3]; + //int32_t leftpan; + //int32_t rightpan; + //uint8_t chtype; + assert((uint16_t)vextractn(cg->pg_fnum, sgo) == channel->f_num); + assert((uint16_t)vextractn(cg->pg_block, sgo) == channel->block); + //uint8_t fb; // compared at slot group level + //uint8_t con; + //uint8_t alg; + assert((uint16_t)vextractn(cg->eg_ksv, sgo) == channel->ksv); + assert((uint16_t)vextractn(cg->og_ch_gate_a, sgo) == channel->cha); + assert((uint16_t)vextractn(cg->og_ch_gate_b, sgo) == channel->chb); + assert((uint16_t)vextractn(cg->og_ch_gate_c, sgo) == channel->chc); + assert((uint16_t)vextractn(cg->og_ch_gate_d, sgo) == channel->chd); + + return 0; +catch_: + return 1; +} + + +static int compare_chips(void) +{ + _mm_sfence(); + + // TODO: Commented stuff + assert((uint16_t)aymo_chip.tm_timer == nuked_chip.timer); + assert(aymo_chip.eg_timer == nuked_chip.eg_timer); + assert(aymo_chip.eg_timerrem == nuked_chip.eg_timerrem); + assert(aymo_chip.eg_state == nuked_chip.eg_state); + assert((uint16_t)vextract(aymo_chip.eg_add, 0) == nuked_chip.eg_add); + //uint8_t newm; + //uint8_t nts; + //uint8_t rhy; + assert(aymo_chip.pg_vibpos == nuked_chip.vibpos); + assert(aymo_chip.eg_vibshift == nuked_chip.vibshift); + //assert((uint16_t)vextractn(aymo_chip.eg_tremolo, 0) == nuked_chip.tremolo); + assert(aymo_chip.eg_tremolopos == nuked_chip.tremolopos); + assert(aymo_chip.eg_tremoloshift == nuked_chip.tremoloshift); + assert(aymo_chip.ng_noise == nuked_chip.noise); + assert((int16_t)vextract(aymo_chip.og_out, 0) == nuked_out[0]); + assert((int16_t)vextract(aymo_chip.og_out, 1) == nuked_out[1]); + assert((int16_t)vextract(aymo_chip.og_out, 2) == nuked_out[2]); + assert((int16_t)vextract(aymo_chip.og_out, 3) == nuked_out[3]); + assert(aymo_chip.rm_hh_bit2 == nuked_chip.rm_hh_bit2); + assert(aymo_chip.rm_hh_bit3 == nuked_chip.rm_hh_bit3); + assert(aymo_chip.rm_hh_bit7 == nuked_chip.rm_hh_bit7); + assert(aymo_chip.rm_hh_bit8 == nuked_chip.rm_hh_bit8); + assert(aymo_chip.rm_tc_bit3 == nuked_chip.rm_tc_bit3); + assert(aymo_chip.rm_tc_bit5 == nuked_chip.rm_tc_bit5); + + for (int ch2x = 0; ch2x < 18; ++ch2x) { + if (compare_ch2xs(ch2x)) { + assert(0); + } + } + + for (int slot = 0; slot < 36; ++slot) { + if (compare_slots(slot)) { + assert(0); + } + } + + return 0; +catch_: + return 1; +} + + +#include "test_ymf262_compare_epilogue_inline.h" + + +#endif // AYMO_CPU_SUPPORT_X86_SSE41
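
For reference, the SSE4.1 convert tests above all follow the same sub-window pattern: fill the destination with a DIRTY byte, convert only the [si, ei) range, then check that the converted samples match the reference while every byte before si and after ei still holds the canary. Below is a minimal, self-contained sketch of that pattern; the names convert_i16_f32, guard_clobbered and REF_N are illustrative stand-ins, not part of the AYMO API, and the scalar loop stands in for the SIMD aymo_(...) kernels under test.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define DIRTY 0xAAu
    #define REF_N 32u

    /* Scalar stand-in for the SIMD conversion kernel under test. */
    static void convert_i16_f32(unsigned n, const int16_t* src, float* dst)
    {
        for (unsigned i = 0u; i < n; ++i) {
            dst[i] = ((float)src[i] * (float)(1. / 32768.));
        }
    }

    /* Returns nonzero if any byte of a guard region lost its DIRTY canary. */
    static int guard_clobbered(const void* ptr, size_t size)
    {
        const uint8_t* bytes = (const uint8_t*)ptr;
        for (size_t i = 0u; i < size; ++i) {
            if (bytes[i] != (uint8_t)DIRTY) {
                return 1;
            }
        }
        return 0;
    }

    int main(void)
    {
        int16_t src[REF_N];
        float buf[REF_N];
        for (unsigned i = 0u; i < REF_N; ++i) {
            src[i] = (int16_t)(((int)i * 1000) - 16000);
        }

        /* Exercise every [si, ei) sub-window, like the unit tests above. */
        for (unsigned si = 0u; si < REF_N; ++si) {
            for (unsigned ei = si; ei < REF_N; ++ei) {
                memset(buf, (int)DIRTY, sizeof(buf));
                convert_i16_f32((ei - si), &src[si], &buf[si]);

                /* Anything outside the window must keep the canary. */
                if (guard_clobbered(&buf[0], (si * sizeof(buf[0]))) ||
                    guard_clobbered(&buf[ei], ((REF_N - ei) * sizeof(buf[0])))) {
                    fprintf(stderr, "overrun: si=%u ei=%u\n", si, ei);
                    return 1;
                }
            }
        }
        puts("all windows clean");
        return 0;
    }

Sweeping every start/end pair is what catches the SIMD-specific failure modes (head/tail handling and partial vectors), which a single full-buffer conversion would miss.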
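
The TDA8425 and YM7128 sweep harnesses drive the reference emulator and the AYMO chip with the same exponential (logarithmic) sine sweep from f0 to f1 over T seconds, accumulate the per-sample error and squared error, and fail when the resulting standard deviation exceeds STDEV_LIMIT. The following compact sketch shows only that bookkeeping; the two identical sine signals and the limit value are placeholders for the emulator/chip output pair, while the chirp phase formula and the mean / mean-square statistics mirror app_run() above.

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI (3.14159265358979323846264338327950288)
    #endif

    int main(void)
    {
        const double fs = 48000.;       /* sample rate [Hz] */
        const double T = 1.;            /* sweep duration [s] */
        const double f0 = 10.;          /* start frequency [Hz] */
        const double f1 = 21000.;       /* stop frequency [Hz] */
        const long N = (long)(fs * T);
        const double limit = .0002;     /* placeholder for STDEV_LIMIT */

        double sum_e = 0.;              /* running sum of errors */
        double sum_ee = 0.;             /* running sum of squared errors */
        long k;

        for (k = 0; k < N; ++k) {
            double t = ((double)k / fs);
            /* Exponential chirp phase, as in the sweep harnesses above. */
            double th = ((2. * M_PI * f0 * T) * (pow((f1 / f0), (t / T)) - 1.) / log(f1 / f0));
            th = fmod(th, (2. * M_PI));

            double reference = sin(th);   /* stand-in for the scalar emulator output */
            double under_test = sin(th);  /* stand-in for the AYMO chip output */

            double e = (reference - under_test);
            sum_e += e;
            sum_ee += (e * e);  /* must accumulate, or the variance below is meaningless */
        }

        double avg_e = (sum_e / (double)k);
        double avg_ee = (sum_ee / (double)k);
        double stdev_e = sqrt(fabs(avg_ee - (avg_e * avg_e)));
        fprintf(stderr, "stdev_e=%g\n", stdev_e);
        return (stdev_e > limit);
    }

Comparing a standard deviation instead of exact samples tolerates the small rounding differences between the floating-point emulator and the vectorized implementation, while still flagging any systematic divergence across the whole frequency range.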
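
The YMF262 compare tests validate the vectorized chip state against Nuked OPL3 one lane at a time, pulling individual elements out of the SIMD registers with vextractn() / _mm_extract_epi16() and asserting them against the scalar emulator's fields. A tiny illustration of that extraction step follows, using plain SSE2 intrinsics and a local reference array as a hypothetical stand-in for the opl3_slot fields; it is not AYMO code, only the lane-versus-scalar comparison idea.

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Scalar reference values, standing in for the Nuked OPL3 fields. */
        int16_t reference[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };

        /* Vectorized state, standing in for one AYMO slot-group register. */
        __m128i lanes = _mm_loadu_si128((const __m128i*)reference);

        /* The lane index must be a compile-time constant for _mm_extract_epi16(). */
        int16_t lane3 = (int16_t)_mm_extract_epi16(lanes, 3);
        if (lane3 != reference[3]) {
            fprintf(stderr, "lane 3 mismatch: %d != %d\n", lane3, reference[3]);
            return 1;
        }
        puts("lane 3 matches");
        return 0;
    }

In the real tests the lane index comes from the slot-to-word mapping tables, so a variable-index wrapper such as vextractn() is used instead of a bare intrinsic call.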