Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simd: riscv: implement RVV intrinsics #9731

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions cmake/riscv64.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv64)")
message(WARNING "LuaJIT is disabled, this platform does not support built-in LuaJIT and system provided one neither.")
set(FLB_LUAJIT OFF)
endif()
if(FLB_SIMD)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv_zba")
endif()
endif ()
52 changes: 52 additions & 0 deletions include/fluent-bit/flb_simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,27 @@ typedef __m128i flb_vector32;
typedef uint8x16_t flb_vector8;
typedef uint32x4_t flb_vector32;

#elif defined(__riscv) && (__riscv_v_intrinsic >= 11000)
/*
* We use RVV (RISC-V "Vector") instructions if the compiler provides
* access to them (as indicated by __riscv_v_intrinsic) and using with
* -march=rv64gcv_zba flag. RVV extension is currently optional for
* risc-v processors. If the processors can handle this RVV
* intrinsics, this extension is able to use on that platform.
* However, there is a few RISC-V prosessors to support RVV
* extensions.
* If there is no RISC-V processor which supports RVV extensions,
* qemu-riscv with -cpu rv64,v=true,zba=true,vlen=128 flags could be
* able to emulate such extensions.
*/
#include <riscv_vector.h>
#define FLB_SIMD_RVV
typedef vuint8m1_t flb_vector8;
typedef vuint32m1_t flb_vector32;

#define RVV_VEC8_INST_LEN (128 / 8) /* 16 */
#define RVV_VEC32_INST_LEN (128 / 8 / 4) /* 4 */

#else
/*
* If no SIMD instructions are available, we can in some cases emulate vector
Expand All @@ -79,6 +100,15 @@ typedef uint64_t flb_vector8;
typedef uint8_t flb_vector8;
#endif /* FLB_SIMD_DISABLED */

/* RVV's instruction length is flexible and not fixed width.
* We assumed that VLEN which is the fundamental intsruction length is 128.
*/
#if defined(FLB_SIMD_RVV)
#define FLB_SIMD_VEC8_INST_LEN RVV_VEC8_INST_LEN
#else
#define FLB_SIMD_VEC8_INST_LEN sizeof(flb_vector8)
#endif

/* element-wise comparisons to a scalar */
static inline bool flb_vector8_has(const flb_vector8 v, const uint8_t c);
static inline bool flb_vector8_has_zero(const flb_vector8 v);
Expand All @@ -94,6 +124,8 @@ static inline void flb_vector8_load(flb_vector8 *v, const uint8_t *s)
*v = _mm_loadu_si128((const __m128i *) s);
#elif defined(FLB_SIMD_NEON)
*v = vld1q_u8(s);
#elif defined(FLB_SIMD_RVV)
*v = __riscv_vle8_v_u8m1(s, RVV_VEC8_INST_LEN);
#else
memset(v, 0, sizeof(flb_vector8));
#endif
Expand Down Expand Up @@ -129,6 +161,8 @@ static inline flb_vector8 flb_vector8_ssub(const flb_vector8 v1, const flb_vecto
return _mm_subs_epu8(v1, v2);
#elif defined(FLB_SIMD_NEON)
return vqsubq_u8(v1, v2);
#elif defined(FLB_SIMD_RVV)
return __riscv_vssubu_vv_u8m1(v1, v2, RVV_VEC8_INST_LEN);
#endif
}
#endif /* ! FLB_SIMD_NONE */
Expand All @@ -144,6 +178,11 @@ static inline flb_vector8 flb_vector8_eq(const flb_vector8 v1, const flb_vector8
return _mm_cmpeq_epi8(v1, v2);
#elif defined(FLB_SIMD_NEON)
return vceqq_u8(v1, v2);
#elif defined(FLB_SIMD_RVV)
vbool8_t ret = __riscv_vmseq_vv_u8m1_b8(v1, v2, RVV_VEC8_INST_LEN);
return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0, RVV_VEC8_INST_LEN),
__riscv_vmv_v_x_u8m1(UINT8_MAX, RVV_VEC8_INST_LEN),
ret, RVV_VEC8_INST_LEN);
#endif
}
#endif /* ! FLB_SIMD_NONE */
Expand All @@ -155,6 +194,11 @@ static inline flb_vector32 flb_vector32_eq(const flb_vector32 v1, const flb_vect
return _mm_cmpeq_epi32(v1, v2);
#elif defined(FLB_SIMD_NEON)
return vceqq_u32(v1, v2);
#elif defined(FLB_SIMD_RVV)
vbool32_t ret = __riscv_vmseq_vv_u32m1_b32(v1, v2, RVV_VEC32_INST_LEN);
return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0, RVV_VEC32_INST_LEN),
__riscv_vmv_v_x_u32m1(UINT32_MAX, RVV_VEC32_INST_LEN),
ret, RVV_VEC32_INST_LEN);
#endif
}
#endif /* ! FLB_SIMD_NONE */
Expand All @@ -168,6 +212,8 @@ static inline flb_vector8 flb_vector8_broadcast(const uint8_t c)
return _mm_set1_epi8(c);
#elif defined(FLB_SIMD_NEON)
return vdupq_n_u8(c);
#elif defined(FLB_SIMD_RVV)
return __riscv_vmv_v_x_u8m1(c, RVV_VEC8_INST_LEN);
#else
return ~UINT64CONST(0) / 0xFF * c;
#endif
Expand All @@ -182,6 +228,10 @@ static inline bool flb_vector8_is_highbit_set(const flb_vector8 v)
return _mm_movemask_epi8(v) != 0;
#elif defined(FLB_SIMD_NEON)
return vmaxvq_u8(v) > 0x7F;
#elif defined(FLB_SIMD_RVV)
return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(v,
__riscv_vmv_v_x_u8m1(0, RVV_VEC8_INST_LEN),
RVV_VEC8_INST_LEN));
#else
return v & flb_vector8_broadcast(0x80);
#endif
Expand Down Expand Up @@ -249,6 +299,8 @@ static inline char *flb_simd_info()
return "SSE2";
#elif defined(FLB_SIMD_NEON)
return "NEON";
#elif defined(FLB_SIMD_RVV)
return "RVV";
#elif defined(FLB_SIMD_NONE)
return "none";
#else
Expand Down
7 changes: 4 additions & 3 deletions src/flb_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
off_t offset = 0;
char tmp[16];
char *p;
const size_t inst_len = FLB_SIMD_VEC8_INST_LEN;

/* to encode codepoints > 0xFFFF */
uint16_t high;
Expand All @@ -816,10 +817,10 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
p = buf + *off;

/* align length to the nearest multiple of the vector size for safe SIMD processing */
vlen = str_len & ~(sizeof(flb_vector8) - 1);
vlen = str_len & ~(inst_len - 1);
for (i = 0;;) {
/* SIMD optimization: Process chunk of input string */
for (; i < vlen; i += sizeof(flb_vector8)) {
for (; i < vlen; i += inst_len) {
flb_vector8 chunk;
flb_vector8_load(&chunk, (const uint8_t *)&str[i]);

Expand Down Expand Up @@ -851,7 +852,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
}

/* Process remaining characters one by one */
for (b = 0; b < sizeof(flb_vector8); b++) {
for (b = 0; b < inst_len; b++) {
if (i >= str_len) {
/* all characters has been processed */
goto done;
Expand Down
Loading