Fixed MADD to use float16 values. Use LMUL = 2 in main loop. Now 1.85X faster on BananaPi.

This commit is contained in:
Chip Kerchner
2026-02-11 00:27:27 +00:00
parent 74d9fe2832
commit e3cb067bf4
2 changed files with 433 additions and 168 deletions

View File

@@ -8,13 +8,114 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG gvl = 0;
BLASLONG m_top = 0;
BLASLONG n_top = 0;
#ifdef FP16_NARROW
IFLOAT alpha16 = (IFLOAT)(alpha);
#endif
// -- MAIN PASS
for (BLASLONG j=0; j<N/8; j+=1) {
m_top = 0;
#ifdef FP16_NARROW
BLASLONG gvl = __riscv_vsetvl_e16m2(32);
for (BLASLONG i=0; i<M/32; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
_Float16 B0 = B[bi+0];
_Float16 B1 = B[bi+1];
_Float16 B2 = B[bi+2];
_Float16 B3 = B[bi+3];
_Float16 B4 = B[bi+4];
_Float16 B5 = B[bi+5];
_Float16 B6 = B[bi+6];
_Float16 B7 = B[bi+7];
bi += 8;
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
vfloat16m2_t result1 = __riscv_vfmul_vf_f16m2( A0, B1, gvl);
vfloat16m2_t result2 = __riscv_vfmul_vf_f16m2( A0, B2, gvl);
vfloat16m2_t result3 = __riscv_vfmul_vf_f16m2( A0, B3, gvl);
vfloat16m2_t result4 = __riscv_vfmul_vf_f16m2( A0, B4, gvl);
vfloat16m2_t result5 = __riscv_vfmul_vf_f16m2( A0, B5, gvl);
vfloat16m2_t result6 = __riscv_vfmul_vf_f16m2( A0, B6, gvl);
vfloat16m2_t result7 = __riscv_vfmul_vf_f16m2( A0, B7, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
B2 = B[bi+2];
B3 = B[bi+3];
B4 = B[bi+4];
B5 = B[bi+5];
B6 = B[bi+6];
B7 = B[bi+7];
bi += 8;
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f16m2(result1, B1, A0, gvl);
result2 = __riscv_vfmacc_vf_f16m2(result2, B2, A0, gvl);
result3 = __riscv_vfmacc_vf_f16m2(result3, B3, A0, gvl);
result4 = __riscv_vfmacc_vf_f16m2(result4, B4, A0, gvl);
result5 = __riscv_vfmacc_vf_f16m2(result5, B5, A0, gvl);
result6 = __riscv_vfmacc_vf_f16m2(result6, B6, A0, gvl);
result7 = __riscv_vfmacc_vf_f16m2(result7, B7, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c1 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c2 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c3 = __riscv_vle32_v_f32m4( &C[ci], gvl);
ci-=ldc*3;
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m4(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m4(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m4(c3, alpha16, result3, gvl);
__riscv_vse32_v_f32m4( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c3, gvl); ci += ldc-gvl*0;
vfloat32m4_t c4 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c5 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c6 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c7 = __riscv_vle32_v_f32m4( &C[ci], gvl);
ci-=ldc*3;
c4 = __riscv_vfwmacc_vf_f32m4(c4, alpha16, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m4(c5, alpha16, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m4(c6, alpha16, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m4(c7, alpha16, result7, gvl);
__riscv_vse32_v_f32m4( &C[ci], c4, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c5, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c6, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c7, gvl);
m_top += 32;
}
if (M & 16) {
gvl = __riscv_vsetvl_e16m1(16);
#else
BLASLONG gvl = __riscv_vsetvl_e16m1(16);
for (BLASLONG i=0; i<M/16; i+=1) {
#endif
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
@@ -97,14 +198,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c7 = __riscv_vle32_v_f32m2( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha, result7, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha16, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha16, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha16, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha16, result7, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
@@ -218,14 +319,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha, result7, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha16, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha16, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha16, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha16, result7, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -343,14 +444,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha, result7, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha16, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha16, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha16, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha16, result7, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -458,22 +559,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
C[ci + 4 * ldc + 0] += alpha * (float)(result8);
C[ci + 4 * ldc + 1] += alpha * (float)(result9);
C[ci + 5 * ldc + 0] += alpha * (float)(result10);
C[ci + 5 * ldc + 1] += alpha * (float)(result11);
C[ci + 6 * ldc + 0] += alpha * (float)(result12);
C[ci + 6 * ldc + 1] += alpha * (float)(result13);
C[ci + 7 * ldc + 0] += alpha * (float)(result14);
C[ci + 7 * ldc + 1] += alpha * (float)(result15);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
C[ci + 4 * ldc + 0] += alpha16 * (float)(result8);
C[ci + 4 * ldc + 1] += alpha16 * (float)(result9);
C[ci + 5 * ldc + 0] += alpha16 * (float)(result10);
C[ci + 5 * ldc + 1] += alpha16 * (float)(result11);
C[ci + 6 * ldc + 0] += alpha16 * (float)(result12);
C[ci + 6 * ldc + 1] += alpha16 * (float)(result13);
C[ci + 7 * ldc + 0] += alpha16 * (float)(result14);
C[ci + 7 * ldc + 1] += alpha16 * (float)(result15);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -548,14 +649,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
C[ci + 4 * ldc + 0] += alpha * (float)(result4);
C[ci + 5 * ldc + 0] += alpha * (float)(result5);
C[ci + 6 * ldc + 0] += alpha * (float)(result6);
C[ci + 7 * ldc + 0] += alpha * (float)(result7);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
C[ci + 4 * ldc + 0] += alpha16 * (float)(result4);
C[ci + 5 * ldc + 0] += alpha16 * (float)(result5);
C[ci + 6 * ldc + 0] += alpha16 * (float)(result6);
C[ci + 7 * ldc + 0] += alpha16 * (float)(result7);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 1 * ldc + 0] += alpha * result1;
@@ -572,10 +673,76 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
}
if( N & 4 ) {
gvl = __riscv_vsetvl_e16m1(16);
m_top = 0;
#ifdef FP16_NARROW
gvl = __riscv_vsetvl_e16m2(32);
for (BLASLONG i=0; i<M/32; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
_Float16 B0 = B[bi+0];
_Float16 B1 = B[bi+1];
_Float16 B2 = B[bi+2];
_Float16 B3 = B[bi+3];
bi += 4;
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
vfloat16m2_t result1 = __riscv_vfmul_vf_f16m2( A0, B1, gvl);
vfloat16m2_t result2 = __riscv_vfmul_vf_f16m2( A0, B2, gvl);
vfloat16m2_t result3 = __riscv_vfmul_vf_f16m2( A0, B3, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
B2 = B[bi+2];
B3 = B[bi+3];
bi += 4;
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f16m2(result1, B1, A0, gvl);
result2 = __riscv_vfmacc_vf_f16m2(result2, B2, A0, gvl);
result3 = __riscv_vfmacc_vf_f16m2(result3, B3, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c1 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c2 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c3 = __riscv_vle32_v_f32m4( &C[ci], gvl);
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m4(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m4(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m4(c3, alpha16, result3, gvl);
ci=n_top*ldc+m_top;
__riscv_vse32_v_f32m4( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c3, gvl);
m_top += 32;
}
if (M & 16) {
gvl = __riscv_vsetvl_e16m1(16);
#else
gvl = __riscv_vsetvl_e16m1(16);
for (BLASLONG i=0; i<M/16; i+=1) {
#endif
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
@@ -632,10 +799,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c3 = __riscv_vle32_v_f32m2( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
@@ -710,10 +877,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -789,10 +956,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci += ldc - gvl * 0;
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -861,14 +1028,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -919,10 +1086,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 1 * ldc + 0] += alpha * result1;
@@ -939,10 +1106,61 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
// -- tails for N=2
if( N & 2 ) {
gvl = __riscv_vsetvl_e16m1(16);
m_top = 0;
#ifdef FP16_NARROW
gvl = __riscv_vsetvl_e16m2(32);
for (BLASLONG i=0; i<M/32; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
_Float16 B0 = B[bi+0];
_Float16 B1 = B[bi+1];
bi += 2;
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
vfloat16m2_t result1 = __riscv_vfmul_vf_f16m2( A0, B1, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
bi += 2;
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f16m2(result1, B1, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m4_t c1 = __riscv_vle32_v_f32m4( &C[ci], gvl);
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m4(c1, alpha16, result1, gvl);
ci=n_top*ldc+m_top;
__riscv_vse32_v_f32m4( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m4( &C[ci], c1, gvl);
m_top += 32;
}
if (M & 16) {
gvl = __riscv_vsetvl_e16m1(16);
#else
gvl = __riscv_vsetvl_e16m1(16);
for (BLASLONG i=0; i<M/16; i+=1) {
#endif
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
@@ -984,8 +1202,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
@@ -1041,8 +1259,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc - gvl * 0;
vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -1098,8 +1316,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci += ldc - gvl * 0;
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -1147,10 +1365,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -1189,8 +1407,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 1 * ldc + 0] += alpha * result1;
@@ -1205,10 +1423,54 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
// -- tails for N=1
if( N & 1 ) {
gvl = __riscv_vsetvl_e16m1(16);
m_top = 0;
#ifdef FP16_NARROW
gvl = __riscv_vsetvl_e16m2(32);
for (BLASLONG i=0; i<M/32; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
_Float16 B0 = B[bi+0];
bi += 1;
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
bi += 1;
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
ai += 16;
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl);
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
ci=n_top*ldc+m_top;
__riscv_vse32_v_f32m4( &C[ci], c0, gvl);
m_top += 32;
}
if (M & 16) {
gvl = __riscv_vsetvl_e16m1(16);
#else
gvl = __riscv_vsetvl_e16m1(16);
for (BLASLONG i=0; i<M/16; i+=1) {
#endif
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
_Float16 B0 = B[bi+0];
@@ -1242,7 +1504,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
#endif
@@ -1290,7 +1552,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
#endif
@@ -1336,7 +1598,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
#endif
@@ -1374,8 +1636,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -1408,7 +1670,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
#endif

View File

@@ -8,6 +8,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG gvl = 0;
BLASLONG m_top = 0;
BLASLONG n_top = 0;
#ifdef FP16_NARROW
IFLOAT alpha16 = (IFLOAT)(alpha);
#endif
// -- MAIN PASS
for (BLASLONG j=0; j<N/8; j+=1) {
@@ -100,14 +103,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c7 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha, result7, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha16, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha16, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha16, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha16, result7, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
@@ -225,14 +228,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci += ldc - gvl * 0;
vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha, result7, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha16, result4, gvl);
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha16, result5, gvl);
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha16, result6, gvl);
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha16, result7, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -343,22 +346,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
C[ci + 4 * ldc + 0] += alpha * (float)(result8);
C[ci + 4 * ldc + 1] += alpha * (float)(result9);
C[ci + 5 * ldc + 0] += alpha * (float)(result10);
C[ci + 5 * ldc + 1] += alpha * (float)(result11);
C[ci + 6 * ldc + 0] += alpha * (float)(result12);
C[ci + 6 * ldc + 1] += alpha * (float)(result13);
C[ci + 7 * ldc + 0] += alpha * (float)(result14);
C[ci + 7 * ldc + 1] += alpha * (float)(result15);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
C[ci + 4 * ldc + 0] += alpha16 * (float)(result8);
C[ci + 4 * ldc + 1] += alpha16 * (float)(result9);
C[ci + 5 * ldc + 0] += alpha16 * (float)(result10);
C[ci + 5 * ldc + 1] += alpha16 * (float)(result11);
C[ci + 6 * ldc + 0] += alpha16 * (float)(result12);
C[ci + 6 * ldc + 1] += alpha16 * (float)(result13);
C[ci + 7 * ldc + 0] += alpha16 * (float)(result14);
C[ci + 7 * ldc + 1] += alpha16 * (float)(result15);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -433,14 +436,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
C[ci + 4 * ldc + 0] += alpha * (float)(result4);
C[ci + 5 * ldc + 0] += alpha * (float)(result5);
C[ci + 6 * ldc + 0] += alpha * (float)(result6);
C[ci + 7 * ldc + 0] += alpha * (float)(result7);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
C[ci + 4 * ldc + 0] += alpha16 * (float)(result4);
C[ci + 5 * ldc + 0] += alpha16 * (float)(result5);
C[ci + 6 * ldc + 0] += alpha16 * (float)(result6);
C[ci + 7 * ldc + 0] += alpha16 * (float)(result7);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 1 * ldc + 0] += alpha * result1;
@@ -519,10 +522,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c3 = __riscv_vle32_v_f32m2( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
@@ -598,10 +601,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci += ldc - gvl * 0;
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -672,14 +675,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -730,10 +733,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 1 * ldc + 0] += alpha * result1;
@@ -796,8 +799,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
@@ -853,8 +856,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci += ldc - gvl * 0;
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
@@ -904,10 +907,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -946,8 +949,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 1 * ldc + 0] += alpha * result1;
@@ -1001,7 +1004,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
#endif
@@ -1047,7 +1050,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
#ifdef FP16_NARROW
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
#else
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
#endif
@@ -1087,8 +1090,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci=n_top*ldc+m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -1121,7 +1124,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG ci = n_top * ldc + m_top;
#ifdef FP16_NARROW
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
#else
C[ci + 0 * ldc + 0] += alpha * result0;
#endif