mirror of
https://github.com/OpenMathLib/OpenBLAS
synced 2026-05-31 00:45:48 +08:00
Fix _Float16 casting issue and reduce LMUL for certain vector instructions from m2 to m1.
This commit is contained in:
@@ -295,22 +295,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result2+=(float)(A[ai+0]*B[bi+1]);
|
||||
result3+=(float)(A[ai+1]*B[bi+1]);
|
||||
result4+=(float)(A[ai+0]*B[bi+2]);
|
||||
result5+=(float)(A[ai+1]*B[bi+2]);
|
||||
result6+=(float)(A[ai+0]*B[bi+3]);
|
||||
result7+=(float)(A[ai+1]*B[bi+3]);
|
||||
result8+=(float)(A[ai+0]*B[bi+4]);
|
||||
result9+=(float)(A[ai+1]*B[bi+4]);
|
||||
result10+=(float)(A[ai+0]*B[bi+5]);
|
||||
result11+=(float)(A[ai+1]*B[bi+5]);
|
||||
result12+=(float)(A[ai+0]*B[bi+6]);
|
||||
result13+=(float)(A[ai+1]*B[bi+6]);
|
||||
result14+=(float)(A[ai+0]*B[bi+7]);
|
||||
result15+=(float)(A[ai+1]*B[bi+7]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
|
||||
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
|
||||
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
|
||||
result8+=(float)(A[ai+0])*(float)(B[bi+4]);
|
||||
result9+=(float)(A[ai+1])*(float)(B[bi+4]);
|
||||
result10+=(float)(A[ai+0])*(float)(B[bi+5]);
|
||||
result11+=(float)(A[ai+1])*(float)(B[bi+5]);
|
||||
result12+=(float)(A[ai+0])*(float)(B[bi+6]);
|
||||
result13+=(float)(A[ai+1])*(float)(B[bi+6]);
|
||||
result14+=(float)(A[ai+0])*(float)(B[bi+7]);
|
||||
result15+=(float)(A[ai+1])*(float)(B[bi+7]);
|
||||
ai+=2;
|
||||
bi+=8;
|
||||
}
|
||||
@@ -353,14 +353,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+0]*B[bi+1]);
|
||||
result2+=(float)(A[ai+0]*B[bi+2]);
|
||||
result3+=(float)(A[ai+0]*B[bi+3]);
|
||||
result4+=(float)(A[ai+0]*B[bi+4]);
|
||||
result5+=(float)(A[ai+0]*B[bi+5]);
|
||||
result6+=(float)(A[ai+0]*B[bi+6]);
|
||||
result7+=(float)(A[ai+0]*B[bi+7]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
result4+=(float)(A[ai+0])*(float)(B[bi+4]);
|
||||
result5+=(float)(A[ai+0])*(float)(B[bi+5]);
|
||||
result6+=(float)(A[ai+0])*(float)(B[bi+6]);
|
||||
result7+=(float)(A[ai+0])*(float)(B[bi+7]);
|
||||
ai+=1;
|
||||
bi+=8;
|
||||
}
|
||||
@@ -569,14 +569,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result2+=(float)(A[ai+0]*B[bi+1]);
|
||||
result3+=(float)(A[ai+1]*B[bi+1]);
|
||||
result4+=(float)(A[ai+0]*B[bi+2]);
|
||||
result5+=(float)(A[ai+1]*B[bi+2]);
|
||||
result6+=(float)(A[ai+0]*B[bi+3]);
|
||||
result7+=(float)(A[ai+1]*B[bi+3]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
|
||||
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
|
||||
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
|
||||
ai+=2;
|
||||
bi+=4;
|
||||
}
|
||||
@@ -607,10 +607,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+0]*B[bi+1]);
|
||||
result2+=(float)(A[ai+0]*B[bi+2]);
|
||||
result3+=(float)(A[ai+0]*B[bi+3]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
ai+=1;
|
||||
bi+=4;
|
||||
}
|
||||
@@ -770,10 +770,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result2+=(float)(A[ai+0]*B[bi+1]);
|
||||
result3+=(float)(A[ai+1]*B[bi+1]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
|
||||
ai+=2;
|
||||
bi+=2;
|
||||
}
|
||||
@@ -797,8 +797,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+0]*B[bi+1]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
ai+=1;
|
||||
bi+=2;
|
||||
}
|
||||
@@ -930,8 +930,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
ai+=2;
|
||||
bi+=1;
|
||||
}
|
||||
@@ -953,7 +953,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
ai+=1;
|
||||
bi+=1;
|
||||
}
|
||||
@@ -966,4 +966,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
n_top += 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,17 +115,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
_Float16 B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
|
||||
vfloat32m2_t result4 = __riscv_vfwmul_vf_f32m2( A0, B4, gvl);
|
||||
vfloat32m2_t result5 = __riscv_vfwmul_vf_f32m2( A0, B5, gvl);
|
||||
vfloat32m2_t result6 = __riscv_vfwmul_vf_f32m2( A0, B6, gvl);
|
||||
vfloat32m2_t result7 = __riscv_vfwmul_vf_f32m2( A0, B7, gvl);
|
||||
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
|
||||
vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
|
||||
vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
|
||||
vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
|
||||
vfloat32m1_t result4 = __riscv_vfwmul_vf_f32m1( A0, B4, gvl);
|
||||
vfloat32m1_t result5 = __riscv_vfwmul_vf_f32m1( A0, B5, gvl);
|
||||
vfloat32m1_t result6 = __riscv_vfwmul_vf_f32m1( A0, B6, gvl);
|
||||
vfloat32m1_t result7 = __riscv_vfwmul_vf_f32m1( A0, B7, gvl);
|
||||
|
||||
for(BLASLONG k=1; k < K; ++k) {
|
||||
B0 = B[bi+0];
|
||||
@@ -138,55 +138,55 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfwmacc_vf_f32m1(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfwmacc_vf_f32m1(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfwmacc_vf_f32m1(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfwmacc_vf_f32m1(result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c4 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c5 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c6 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
|
||||
vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfmacc_vf_f32m1(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfmacc_vf_f32m1(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfmacc_vf_f32m1(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfmacc_vf_f32m1(c7, alpha, result7, gvl);
|
||||
|
||||
ci= n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c4, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c5, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c6, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c7, gvl);
|
||||
__riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c3, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c4, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c5, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c6, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c7, gvl);
|
||||
|
||||
m_top += 4;
|
||||
}
|
||||
@@ -215,22 +215,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
float result15 = 0;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result2+=(float)(A[ai+0]*B[bi+1]);
|
||||
result3+=(float)(A[ai+1]*B[bi+1]);
|
||||
result4+=(float)(A[ai+0]*B[bi+2]);
|
||||
result5+=(float)(A[ai+1]*B[bi+2]);
|
||||
result6+=(float)(A[ai+0]*B[bi+3]);
|
||||
result7+=(float)(A[ai+1]*B[bi+3]);
|
||||
result8+=(float)(A[ai+0]*B[bi+4]);
|
||||
result9+=(float)(A[ai+1]*B[bi+4]);
|
||||
result10+=(float)(A[ai+0]*B[bi+5]);
|
||||
result11+=(float)(A[ai+1]*B[bi+5]);
|
||||
result12+=(float)(A[ai+0]*B[bi+6]);
|
||||
result13+=(float)(A[ai+1]*B[bi+6]);
|
||||
result14+=(float)(A[ai+0]*B[bi+7]);
|
||||
result15+=(float)(A[ai+1]*B[bi+7]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
|
||||
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
|
||||
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
|
||||
result8+=(float)(A[ai+0])*(float)(B[bi+4]);
|
||||
result9+=(float)(A[ai+1])*(float)(B[bi+4]);
|
||||
result10+=(float)(A[ai+0])*(float)(B[bi+5]);
|
||||
result11+=(float)(A[ai+1])*(float)(B[bi+5]);
|
||||
result12+=(float)(A[ai+0])*(float)(B[bi+6]);
|
||||
result13+=(float)(A[ai+1])*(float)(B[bi+6]);
|
||||
result14+=(float)(A[ai+0])*(float)(B[bi+7]);
|
||||
result15+=(float)(A[ai+1])*(float)(B[bi+7]);
|
||||
ai+=2;
|
||||
bi+=8;
|
||||
}
|
||||
@@ -273,14 +273,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+0]*B[bi+1]);
|
||||
result2+=(float)(A[ai+0]*B[bi+2]);
|
||||
result3+=(float)(A[ai+0]*B[bi+3]);
|
||||
result4+=(float)(A[ai+0]*B[bi+4]);
|
||||
result5+=(float)(A[ai+0]*B[bi+5]);
|
||||
result6+=(float)(A[ai+0]*B[bi+6]);
|
||||
result7+=(float)(A[ai+0]*B[bi+7]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
result4+=(float)(A[ai+0])*(float)(B[bi+4]);
|
||||
result5+=(float)(A[ai+0])*(float)(B[bi+5]);
|
||||
result6+=(float)(A[ai+0])*(float)(B[bi+6]);
|
||||
result7+=(float)(A[ai+0])*(float)(B[bi+7]);
|
||||
ai+=1;
|
||||
bi+=8;
|
||||
}
|
||||
@@ -372,13 +372,13 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
_Float16 B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
|
||||
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
|
||||
vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
|
||||
vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
|
||||
vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
|
||||
|
||||
for(BLASLONG k=1; k < K; ++k) {
|
||||
B0 = B[bi+0];
|
||||
@@ -387,35 +387,35 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
|
||||
|
||||
ci= n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
__riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c3, gvl);
|
||||
|
||||
m_top += 4;
|
||||
}
|
||||
@@ -436,14 +436,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
float result7 = 0;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result2+=(float)(A[ai+0]*B[bi+1]);
|
||||
result3+=(float)(A[ai+1]*B[bi+1]);
|
||||
result4+=(float)(A[ai+0]*B[bi+2]);
|
||||
result5+=(float)(A[ai+1]*B[bi+2]);
|
||||
result6+=(float)(A[ai+0]*B[bi+3]);
|
||||
result7+=(float)(A[ai+1]*B[bi+3]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
|
||||
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
|
||||
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
|
||||
ai+=2;
|
||||
bi+=4;
|
||||
}
|
||||
@@ -474,10 +474,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+0]*B[bi+1]);
|
||||
result2+=(float)(A[ai+0]*B[bi+2]);
|
||||
result3+=(float)(A[ai+0]*B[bi+3]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
|
||||
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
|
||||
ai+=1;
|
||||
bi+=4;
|
||||
}
|
||||
@@ -551,36 +551,36 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
_Float16 B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
|
||||
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
|
||||
vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
|
||||
|
||||
for(BLASLONG k=1; k < K; ++k) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
|
||||
|
||||
ci= n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
__riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m1(&C[ci], c1, gvl);
|
||||
|
||||
m_top += 4;
|
||||
}
|
||||
@@ -597,10 +597,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
float result3 = 0;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result2+=(float)(A[ai+0]*B[bi+1]);
|
||||
result3+=(float)(A[ai+1]*B[bi+1]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
|
||||
ai+=2;
|
||||
bi+=2;
|
||||
}
|
||||
@@ -624,8 +624,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+0]*B[bi+1]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
|
||||
ai+=1;
|
||||
bi+=2;
|
||||
}
|
||||
@@ -689,29 +689,29 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
_Float16 B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
|
||||
vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
|
||||
|
||||
for(BLASLONG k=1; k < K; ++k) {
|
||||
B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
|
||||
A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
|
||||
|
||||
ci= n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
__riscv_vse32_v_f32m1(&C[ci], c0, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
@@ -725,8 +725,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
float result1 = 0;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result1+=(float)(A[ai+1]*B[bi+0]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
|
||||
ai+=2;
|
||||
bi+=1;
|
||||
}
|
||||
@@ -748,7 +748,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=(float)(A[ai+0]*B[bi+0]);
|
||||
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
|
||||
ai+=1;
|
||||
bi+=1;
|
||||
}
|
||||
@@ -764,4 +764,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user