Fix _Float16 casting issue and reduce LMUL for certain vector instructions from m2 to m1.

This commit is contained in:
Chip Kerchner
2025-09-18 21:30:22 +00:00
parent 79a1f38770
commit a4abf7828e
2 changed files with 175 additions and 175 deletions

View File

@@ -295,22 +295,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result2+=(float)(A[ai+0]*B[bi+1]);
result3+=(float)(A[ai+1]*B[bi+1]);
result4+=(float)(A[ai+0]*B[bi+2]);
result5+=(float)(A[ai+1]*B[bi+2]);
result6+=(float)(A[ai+0]*B[bi+3]);
result7+=(float)(A[ai+1]*B[bi+3]);
result8+=(float)(A[ai+0]*B[bi+4]);
result9+=(float)(A[ai+1]*B[bi+4]);
result10+=(float)(A[ai+0]*B[bi+5]);
result11+=(float)(A[ai+1]*B[bi+5]);
result12+=(float)(A[ai+0]*B[bi+6]);
result13+=(float)(A[ai+1]*B[bi+6]);
result14+=(float)(A[ai+0]*B[bi+7]);
result15+=(float)(A[ai+1]*B[bi+7]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
result8+=(float)(A[ai+0])*(float)(B[bi+4]);
result9+=(float)(A[ai+1])*(float)(B[bi+4]);
result10+=(float)(A[ai+0])*(float)(B[bi+5]);
result11+=(float)(A[ai+1])*(float)(B[bi+5]);
result12+=(float)(A[ai+0])*(float)(B[bi+6]);
result13+=(float)(A[ai+1])*(float)(B[bi+6]);
result14+=(float)(A[ai+0])*(float)(B[bi+7]);
result15+=(float)(A[ai+1])*(float)(B[bi+7]);
ai+=2;
bi+=8;
}
@@ -353,14 +353,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+0]*B[bi+1]);
result2+=(float)(A[ai+0]*B[bi+2]);
result3+=(float)(A[ai+0]*B[bi+3]);
result4+=(float)(A[ai+0]*B[bi+4]);
result5+=(float)(A[ai+0]*B[bi+5]);
result6+=(float)(A[ai+0]*B[bi+6]);
result7+=(float)(A[ai+0]*B[bi+7]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
result4+=(float)(A[ai+0])*(float)(B[bi+4]);
result5+=(float)(A[ai+0])*(float)(B[bi+5]);
result6+=(float)(A[ai+0])*(float)(B[bi+6]);
result7+=(float)(A[ai+0])*(float)(B[bi+7]);
ai+=1;
bi+=8;
}
@@ -569,14 +569,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result2+=(float)(A[ai+0]*B[bi+1]);
result3+=(float)(A[ai+1]*B[bi+1]);
result4+=(float)(A[ai+0]*B[bi+2]);
result5+=(float)(A[ai+1]*B[bi+2]);
result6+=(float)(A[ai+0]*B[bi+3]);
result7+=(float)(A[ai+1]*B[bi+3]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
ai+=2;
bi+=4;
}
@@ -607,10 +607,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+0]*B[bi+1]);
result2+=(float)(A[ai+0]*B[bi+2]);
result3+=(float)(A[ai+0]*B[bi+3]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
ai+=1;
bi+=4;
}
@@ -770,10 +770,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result2+=(float)(A[ai+0]*B[bi+1]);
result3+=(float)(A[ai+1]*B[bi+1]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
ai+=2;
bi+=2;
}
@@ -797,8 +797,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+0]*B[bi+1]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
ai+=1;
bi+=2;
}
@@ -930,8 +930,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
ai+=2;
bi+=1;
}
@@ -953,7 +953,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
ai+=1;
bi+=1;
}
@@ -966,4 +966,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
n_top += 1;
}
return 0;
}
}

View File

@@ -115,17 +115,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
_Float16 B7 = B[bi+7];
bi += 8;
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
vfloat32m2_t result4 = __riscv_vfwmul_vf_f32m2( A0, B4, gvl);
vfloat32m2_t result5 = __riscv_vfwmul_vf_f32m2( A0, B5, gvl);
vfloat32m2_t result6 = __riscv_vfwmul_vf_f32m2( A0, B6, gvl);
vfloat32m2_t result7 = __riscv_vfwmul_vf_f32m2( A0, B7, gvl);
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
vfloat32m1_t result4 = __riscv_vfwmul_vf_f32m1( A0, B4, gvl);
vfloat32m1_t result5 = __riscv_vfwmul_vf_f32m1( A0, B5, gvl);
vfloat32m1_t result6 = __riscv_vfwmul_vf_f32m1( A0, B6, gvl);
vfloat32m1_t result7 = __riscv_vfwmul_vf_f32m1( A0, B7, gvl);
for(BLASLONG k=1; k < K; ++k) {
B0 = B[bi+0];
@@ -138,55 +138,55 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
B7 = B[bi+7];
bi += 8;
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
result4 = __riscv_vfwmacc_vf_f32m1(result4, B4, A0, gvl);
result5 = __riscv_vfwmacc_vf_f32m1(result5, B5, A0, gvl);
result6 = __riscv_vfwmacc_vf_f32m1(result6, B6, A0, gvl);
result7 = __riscv_vfwmacc_vf_f32m1(result7, B7, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c4 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c5 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c6 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
c4 = __riscv_vfmacc_vf_f32m1(c4, alpha, result4, gvl);
c5 = __riscv_vfmacc_vf_f32m1(c5, alpha, result5, gvl);
c6 = __riscv_vfmacc_vf_f32m1(c6, alpha, result6, gvl);
c7 = __riscv_vfmacc_vf_f32m1(c7, alpha, result7, gvl);
ci= n_top * ldc + m_top;
__riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c1, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c2, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c3, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c4, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c5, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c6, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c7, gvl);
__riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c3, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c4, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c5, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c6, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c7, gvl);
m_top += 4;
}
@@ -215,22 +215,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
float result15 = 0;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result2+=(float)(A[ai+0]*B[bi+1]);
result3+=(float)(A[ai+1]*B[bi+1]);
result4+=(float)(A[ai+0]*B[bi+2]);
result5+=(float)(A[ai+1]*B[bi+2]);
result6+=(float)(A[ai+0]*B[bi+3]);
result7+=(float)(A[ai+1]*B[bi+3]);
result8+=(float)(A[ai+0]*B[bi+4]);
result9+=(float)(A[ai+1]*B[bi+4]);
result10+=(float)(A[ai+0]*B[bi+5]);
result11+=(float)(A[ai+1]*B[bi+5]);
result12+=(float)(A[ai+0]*B[bi+6]);
result13+=(float)(A[ai+1]*B[bi+6]);
result14+=(float)(A[ai+0]*B[bi+7]);
result15+=(float)(A[ai+1]*B[bi+7]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
result8+=(float)(A[ai+0])*(float)(B[bi+4]);
result9+=(float)(A[ai+1])*(float)(B[bi+4]);
result10+=(float)(A[ai+0])*(float)(B[bi+5]);
result11+=(float)(A[ai+1])*(float)(B[bi+5]);
result12+=(float)(A[ai+0])*(float)(B[bi+6]);
result13+=(float)(A[ai+1])*(float)(B[bi+6]);
result14+=(float)(A[ai+0])*(float)(B[bi+7]);
result15+=(float)(A[ai+1])*(float)(B[bi+7]);
ai+=2;
bi+=8;
}
@@ -273,14 +273,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+0]*B[bi+1]);
result2+=(float)(A[ai+0]*B[bi+2]);
result3+=(float)(A[ai+0]*B[bi+3]);
result4+=(float)(A[ai+0]*B[bi+4]);
result5+=(float)(A[ai+0]*B[bi+5]);
result6+=(float)(A[ai+0]*B[bi+6]);
result7+=(float)(A[ai+0]*B[bi+7]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
result4+=(float)(A[ai+0])*(float)(B[bi+4]);
result5+=(float)(A[ai+0])*(float)(B[bi+5]);
result6+=(float)(A[ai+0])*(float)(B[bi+6]);
result7+=(float)(A[ai+0])*(float)(B[bi+7]);
ai+=1;
bi+=8;
}
@@ -372,13 +372,13 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
_Float16 B3 = B[bi+3];
bi += 4;
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
for(BLASLONG k=1; k < K; ++k) {
B0 = B[bi+0];
@@ -387,35 +387,35 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
B3 = B[bi+3];
bi += 4;
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
ci= n_top * ldc + m_top;
__riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c1, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c2, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
__riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c3, gvl);
m_top += 4;
}
@@ -436,14 +436,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
float result7 = 0;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result2+=(float)(A[ai+0]*B[bi+1]);
result3+=(float)(A[ai+1]*B[bi+1]);
result4+=(float)(A[ai+0]*B[bi+2]);
result5+=(float)(A[ai+1]*B[bi+2]);
result6+=(float)(A[ai+0]*B[bi+3]);
result7+=(float)(A[ai+1]*B[bi+3]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
ai+=2;
bi+=4;
}
@@ -474,10 +474,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+0]*B[bi+1]);
result2+=(float)(A[ai+0]*B[bi+2]);
result3+=(float)(A[ai+0]*B[bi+3]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
ai+=1;
bi+=4;
}
@@ -551,36 +551,36 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
_Float16 B1 = B[bi+1];
bi += 2;
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
for(BLASLONG k=1; k < K; ++k) {
B0 = B[bi+0];
B1 = B[bi+1];
bi += 2;
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
ci += ldc - gvl * 0;
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
ci= n_top * ldc + m_top;
__riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
__riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
__riscv_vse32_v_f32m1(&C[ci], c1, gvl);
m_top += 4;
}
@@ -597,10 +597,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
float result3 = 0;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result2+=(float)(A[ai+0]*B[bi+1]);
result3+=(float)(A[ai+1]*B[bi+1]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
ai+=2;
bi+=2;
}
@@ -624,8 +624,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+0]*B[bi+1]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
ai+=1;
bi+=2;
}
@@ -689,29 +689,29 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
_Float16 B0 = B[bi+0];
bi += 1;
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
for(BLASLONG k=1; k < K; ++k) {
B0 = B[bi+0];
bi += 1;
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
ai += 4;
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
ci= n_top * ldc + m_top;
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
__riscv_vse32_v_f32m1(&C[ci], c0, gvl);
m_top += 4;
}
@@ -725,8 +725,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
float result1 = 0;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result1+=(float)(A[ai+1]*B[bi+0]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
ai+=2;
bi+=1;
}
@@ -748,7 +748,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
BLASLONG bi = n_top * K;
for(BLASLONG k=0; k<K; k++) {
result0+=(float)(A[ai+0]*B[bi+0]);
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
ai+=1;
bi+=1;
}
@@ -764,4 +764,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
return 0;
}
}