From e3cb067bf482a0cea305d09ffc388f9da969bc93 Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Wed, 11 Feb 2026 00:27:27 +0000
Subject: [PATCH] Fixed MADD to use float16 values. Use LMUL = 2 in main loop. Now 1.85X faster on BananaPi.

---
 kernel/riscv64/shgemm_kernel_16x8_zvl256b.c | 448 ++++++++++++++++----
 kernel/riscv64/shgemm_kernel_8x8_zvl128b.c | 153 +++----
 2 files changed, 433 insertions(+), 168 deletions(-)

diff --git a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
index 217b6dbfc..4ec59f5e0 100644
--- a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
+++ b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
@@ -8,13 +8,114 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
     BLASLONG gvl = 0;
     BLASLONG m_top = 0;
     BLASLONG n_top = 0;
+#ifdef FP16_NARROW
+    IFLOAT alpha16 = (IFLOAT)(alpha);
+#endif
     // -- MAIN PASS
     for (BLASLONG j=0; j