mirror of
https://github.com/OpenMathLib/OpenBLAS
synced 2026-06-08 01:15:39 +08:00
Fixed MADD to use float16 values. Use LMUL = 2 in main loop. Now 1.85X faster on BananaPi.
This commit is contained in:
@@ -8,13 +8,114 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG gvl = 0;
|
||||
BLASLONG m_top = 0;
|
||||
BLASLONG n_top = 0;
|
||||
#ifdef FP16_NARROW
|
||||
IFLOAT alpha16 = (IFLOAT)(alpha);
|
||||
#endif
|
||||
|
||||
// -- MAIN PASS
|
||||
for (BLASLONG j=0; j<N/8; j+=1) {
|
||||
m_top = 0;
|
||||
#ifdef FP16_NARROW
|
||||
BLASLONG gvl = __riscv_vsetvl_e16m2(32);
|
||||
|
||||
for (BLASLONG i=0; i<M/32; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
_Float16 B0 = B[bi+0];
|
||||
_Float16 B1 = B[bi+1];
|
||||
_Float16 B2 = B[bi+2];
|
||||
_Float16 B3 = B[bi+3];
|
||||
_Float16 B4 = B[bi+4];
|
||||
_Float16 B5 = B[bi+5];
|
||||
_Float16 B6 = B[bi+6];
|
||||
_Float16 B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
|
||||
vfloat16m2_t result1 = __riscv_vfmul_vf_f16m2( A0, B1, gvl);
|
||||
vfloat16m2_t result2 = __riscv_vfmul_vf_f16m2( A0, B2, gvl);
|
||||
vfloat16m2_t result3 = __riscv_vfmul_vf_f16m2( A0, B3, gvl);
|
||||
vfloat16m2_t result4 = __riscv_vfmul_vf_f16m2( A0, B4, gvl);
|
||||
vfloat16m2_t result5 = __riscv_vfmul_vf_f16m2( A0, B5, gvl);
|
||||
vfloat16m2_t result6 = __riscv_vfmul_vf_f16m2( A0, B6, gvl);
|
||||
vfloat16m2_t result7 = __riscv_vfmul_vf_f16m2( A0, B7, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
B4 = B[bi+4];
|
||||
B5 = B[bi+5];
|
||||
B6 = B[bi+6];
|
||||
B7 = B[bi+7];
|
||||
bi += 8;
|
||||
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f16m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f16m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f16m2(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f16m2(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f16m2(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f16m2(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f16m2(result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c1 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c2 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c3 = __riscv_vle32_v_f32m4( &C[ci], gvl);
|
||||
|
||||
ci-=ldc*3;
|
||||
|
||||
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m4(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m4(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m4(c3, alpha16, result3, gvl);
|
||||
|
||||
__riscv_vse32_v_f32m4( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c3, gvl); ci += ldc-gvl*0;
|
||||
|
||||
vfloat32m4_t c4 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c5 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c6 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c7 = __riscv_vle32_v_f32m4( &C[ci], gvl);
|
||||
|
||||
ci-=ldc*3;
|
||||
|
||||
c4 = __riscv_vfwmacc_vf_f32m4(c4, alpha16, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m4(c5, alpha16, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m4(c6, alpha16, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m4(c7, alpha16, result7, gvl);
|
||||
|
||||
__riscv_vse32_v_f32m4( &C[ci], c4, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c5, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c6, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c7, gvl);
|
||||
m_top += 32;
|
||||
}
|
||||
|
||||
if (M & 16) {
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
#else
|
||||
BLASLONG gvl = __riscv_vsetvl_e16m1(16);
|
||||
|
||||
for (BLASLONG i=0; i<M/16; i+=1) {
|
||||
#endif
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
@@ -97,14 +198,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c7 = __riscv_vle32_v_f32m2( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha, result7, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha16, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha16, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha16, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha16, result7, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
@@ -218,14 +319,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha, result7, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha16, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha16, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha16, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha16, result7, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -343,14 +444,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha, result7, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha16, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha16, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha16, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha16, result7, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -458,22 +559,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
|
||||
C[ci + 4 * ldc + 0] += alpha * (float)(result8);
|
||||
C[ci + 4 * ldc + 1] += alpha * (float)(result9);
|
||||
C[ci + 5 * ldc + 0] += alpha * (float)(result10);
|
||||
C[ci + 5 * ldc + 1] += alpha * (float)(result11);
|
||||
C[ci + 6 * ldc + 0] += alpha * (float)(result12);
|
||||
C[ci + 6 * ldc + 1] += alpha * (float)(result13);
|
||||
C[ci + 7 * ldc + 0] += alpha * (float)(result14);
|
||||
C[ci + 7 * ldc + 1] += alpha * (float)(result15);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
|
||||
C[ci + 4 * ldc + 0] += alpha16 * (float)(result8);
|
||||
C[ci + 4 * ldc + 1] += alpha16 * (float)(result9);
|
||||
C[ci + 5 * ldc + 0] += alpha16 * (float)(result10);
|
||||
C[ci + 5 * ldc + 1] += alpha16 * (float)(result11);
|
||||
C[ci + 6 * ldc + 0] += alpha16 * (float)(result12);
|
||||
C[ci + 6 * ldc + 1] += alpha16 * (float)(result13);
|
||||
C[ci + 7 * ldc + 0] += alpha16 * (float)(result14);
|
||||
C[ci + 7 * ldc + 1] += alpha16 * (float)(result15);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -548,14 +649,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
|
||||
C[ci + 4 * ldc + 0] += alpha * (float)(result4);
|
||||
C[ci + 5 * ldc + 0] += alpha * (float)(result5);
|
||||
C[ci + 6 * ldc + 0] += alpha * (float)(result6);
|
||||
C[ci + 7 * ldc + 0] += alpha * (float)(result7);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
|
||||
C[ci + 4 * ldc + 0] += alpha16 * (float)(result4);
|
||||
C[ci + 5 * ldc + 0] += alpha16 * (float)(result5);
|
||||
C[ci + 6 * ldc + 0] += alpha16 * (float)(result6);
|
||||
C[ci + 7 * ldc + 0] += alpha16 * (float)(result7);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
@@ -572,10 +673,76 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
}
|
||||
|
||||
if( N & 4 ) {
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
m_top = 0;
|
||||
#ifdef FP16_NARROW
|
||||
gvl = __riscv_vsetvl_e16m2(32);
|
||||
|
||||
for (BLASLONG i=0; i<M/32; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
_Float16 B0 = B[bi+0];
|
||||
_Float16 B1 = B[bi+1];
|
||||
_Float16 B2 = B[bi+2];
|
||||
_Float16 B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
|
||||
vfloat16m2_t result1 = __riscv_vfmul_vf_f16m2( A0, B1, gvl);
|
||||
vfloat16m2_t result2 = __riscv_vfmul_vf_f16m2( A0, B2, gvl);
|
||||
vfloat16m2_t result3 = __riscv_vfmul_vf_f16m2( A0, B3, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f16m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f16m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f16m2(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c1 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c2 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c3 = __riscv_vle32_v_f32m4( &C[ci], gvl);
|
||||
|
||||
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m4(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m4(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m4(c3, alpha16, result3, gvl);
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse32_v_f32m4( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c3, gvl);
|
||||
m_top += 32;
|
||||
}
|
||||
|
||||
if (M & 16) {
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
#else
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
|
||||
for (BLASLONG i=0; i<M/16; i+=1) {
|
||||
#endif
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
@@ -632,10 +799,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
@@ -710,10 +877,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -789,10 +956,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -861,14 +1028,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -919,10 +1086,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
@@ -939,10 +1106,61 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
// -- tails for N=2
|
||||
if( N & 2 ) {
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
m_top = 0;
|
||||
#ifdef FP16_NARROW
|
||||
gvl = __riscv_vsetvl_e16m2(32);
|
||||
|
||||
for (BLASLONG i=0; i<M/32; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
_Float16 B0 = B[bi+0];
|
||||
_Float16 B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
|
||||
vfloat16m2_t result1 = __riscv_vfmul_vf_f16m2( A0, B1, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f16m2(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m4_t c1 = __riscv_vle32_v_f32m4( &C[ci], gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m4(c1, alpha16, result1, gvl);
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse32_v_f32m4( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m4( &C[ci], c1, gvl);
|
||||
m_top += 32;
|
||||
}
|
||||
|
||||
if (M & 16) {
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
#else
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
|
||||
for (BLASLONG i=0; i<M/16; i+=1) {
|
||||
#endif
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
@@ -984,8 +1202,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
@@ -1041,8 +1259,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc - gvl * 0;
|
||||
vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -1098,8 +1316,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -1147,10 +1365,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -1189,8 +1407,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
@@ -1205,10 +1423,54 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
// -- tails for N=1
|
||||
if( N & 1 ) {
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
m_top = 0;
|
||||
#ifdef FP16_NARROW
|
||||
gvl = __riscv_vsetvl_e16m2(32);
|
||||
|
||||
for (BLASLONG i=0; i<M/32; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
_Float16 B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
vfloat16m1_t A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
vfloat16m1_t A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
vfloat16m2_t A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
vfloat16m2_t result0 = __riscv_vfmul_vf_f16m2( A0, B0, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
A00 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], 16 );
|
||||
A01 = __riscv_vle16_v_f16m1( &A[ai+0*gvl+16*K], 16 );
|
||||
A0 = __riscv_vcreate_v_f16m1_f16m2(A00, A01);
|
||||
ai += 16;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f16m2(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat32m4_t c0 = __riscv_vle32_v_f32m4( &C[ci], gvl);
|
||||
|
||||
c0 = __riscv_vfwmacc_vf_f32m4(c0, alpha16, result0, gvl);
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse32_v_f32m4( &C[ci], c0, gvl);
|
||||
m_top += 32;
|
||||
}
|
||||
|
||||
if (M & 16) {
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
#else
|
||||
gvl = __riscv_vsetvl_e16m1(16);
|
||||
|
||||
for (BLASLONG i=0; i<M/16; i+=1) {
|
||||
#endif
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
_Float16 B0 = B[bi+0];
|
||||
@@ -1242,7 +1504,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
#endif
|
||||
@@ -1290,7 +1552,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
#endif
|
||||
@@ -1336,7 +1598,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
#endif
|
||||
@@ -1374,8 +1636,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -1408,7 +1670,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
#endif
|
||||
|
||||
@@ -8,6 +8,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG gvl = 0;
|
||||
BLASLONG m_top = 0;
|
||||
BLASLONG n_top = 0;
|
||||
#ifdef FP16_NARROW
|
||||
IFLOAT alpha16 = (IFLOAT)(alpha);
|
||||
#endif
|
||||
|
||||
// -- MAIN PASS
|
||||
for (BLASLONG j=0; j<N/8; j+=1) {
|
||||
@@ -100,14 +103,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c7 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha, result7, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m2(c4, alpha16, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m2(c5, alpha16, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m2(c6, alpha16, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m2(c7, alpha16, result7, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
@@ -225,14 +228,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha, result7, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
|
||||
c4 = __riscv_vfwmacc_vf_f32m1(c4, alpha16, result4, gvl);
|
||||
c5 = __riscv_vfwmacc_vf_f32m1(c5, alpha16, result5, gvl);
|
||||
c6 = __riscv_vfwmacc_vf_f32m1(c6, alpha16, result6, gvl);
|
||||
c7 = __riscv_vfwmacc_vf_f32m1(c7, alpha16, result7, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -343,22 +346,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
|
||||
C[ci + 4 * ldc + 0] += alpha * (float)(result8);
|
||||
C[ci + 4 * ldc + 1] += alpha * (float)(result9);
|
||||
C[ci + 5 * ldc + 0] += alpha * (float)(result10);
|
||||
C[ci + 5 * ldc + 1] += alpha * (float)(result11);
|
||||
C[ci + 6 * ldc + 0] += alpha * (float)(result12);
|
||||
C[ci + 6 * ldc + 1] += alpha * (float)(result13);
|
||||
C[ci + 7 * ldc + 0] += alpha * (float)(result14);
|
||||
C[ci + 7 * ldc + 1] += alpha * (float)(result15);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
|
||||
C[ci + 4 * ldc + 0] += alpha16 * (float)(result8);
|
||||
C[ci + 4 * ldc + 1] += alpha16 * (float)(result9);
|
||||
C[ci + 5 * ldc + 0] += alpha16 * (float)(result10);
|
||||
C[ci + 5 * ldc + 1] += alpha16 * (float)(result11);
|
||||
C[ci + 6 * ldc + 0] += alpha16 * (float)(result12);
|
||||
C[ci + 6 * ldc + 1] += alpha16 * (float)(result13);
|
||||
C[ci + 7 * ldc + 0] += alpha16 * (float)(result14);
|
||||
C[ci + 7 * ldc + 1] += alpha16 * (float)(result15);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -433,14 +436,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
|
||||
C[ci + 4 * ldc + 0] += alpha * (float)(result4);
|
||||
C[ci + 5 * ldc + 0] += alpha * (float)(result5);
|
||||
C[ci + 6 * ldc + 0] += alpha * (float)(result6);
|
||||
C[ci + 7 * ldc + 0] += alpha * (float)(result7);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
|
||||
C[ci + 4 * ldc + 0] += alpha16 * (float)(result4);
|
||||
C[ci + 5 * ldc + 0] += alpha16 * (float)(result5);
|
||||
C[ci + 6 * ldc + 0] += alpha16 * (float)(result6);
|
||||
C[ci + 7 * ldc + 0] += alpha16 * (float)(result7);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
@@ -519,10 +522,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m2(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m2(c3, alpha16, result3, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
@@ -598,10 +601,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
c2 = __riscv_vfwmacc_vf_f32m1(c2, alpha16, result2, gvl);
|
||||
c3 = __riscv_vfwmacc_vf_f32m1(c3, alpha16, result3, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -672,14 +675,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha * (float)(result7);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result4);
|
||||
C[ci + 2 * ldc + 1] += alpha16 * (float)(result5);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result6);
|
||||
C[ci + 3 * ldc + 1] += alpha16 * (float)(result7);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -730,10 +733,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha * (float)(result3);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
|
||||
C[ci + 2 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 3 * ldc + 0] += alpha16 * (float)(result3);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
@@ -796,8 +799,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m2(c1, alpha16, result1, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
@@ -853,8 +856,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
c1 = __riscv_vfwmacc_vf_f32m1(c1, alpha16, result1, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
@@ -904,10 +907,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha * (float)(result3);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result2);
|
||||
C[ci + 1 * ldc + 1] += alpha16 * (float)(result3);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -946,8 +949,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha * (float)(result1);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 1 * ldc + 0] += alpha16 * (float)(result1);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
@@ -1001,7 +1004,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl);
|
||||
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m2(c0, alpha16, result0, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
#endif
|
||||
@@ -1047,7 +1050,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
#ifdef FP16_NARROW
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c0 = __riscv_vfwmacc_vf_f32m1(c0, alpha16, result0, gvl);
|
||||
#else
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
#endif
|
||||
@@ -1087,8 +1090,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha * (float)(result1);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
C[ci + 0 * ldc + 1] += alpha16 * (float)(result1);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -1121,7 +1124,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
#ifdef FP16_NARROW
|
||||
C[ci + 0 * ldc + 0] += alpha * (float)(result0);
|
||||
C[ci + 0 * ldc + 0] += alpha16 * (float)(result0);
|
||||
#else
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user