riscv: gemv_t_vector.c optimize

This commit is contained in:
yuanjia
2025-09-12 09:45:06 +08:00
parent 2a95564500
commit 53d7452cdf

View File

@@ -72,123 +72,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
FLOAT_V_T_M1 v_res;
size_t vlmax = VSETVL_MAX_M1();
#ifndef RISCV_0p10_INTRINSICS
FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3;
FLOAT_V_T_M1 vec0, vec1, vec2, vec3;
FLOAT *a_ptrs[4], *y_ptrs[4];
#endif
if(inc_x == 1){
#ifndef RISCV_0p10_INTRINSICS
BLASLONG anr = n - n % 4;
for (; i < anr; i += 4) {
for(i = 0; i < n; i++){
v_res = VFMVVF_FLOAT_M1(0, 1);
gvl = VSETVL(m);
j = 0;
for (int l = 0; l < 4; l++) {
a_ptrs[l] = a + (i + l) * lda;
y_ptrs[l] = y + (i + l) * inc_y;
}
vec0 = VFMVVF_FLOAT_M1(0.0, vlmax);
vec1 = VFMVVF_FLOAT_M1(0.0, vlmax);
vec2 = VFMVVF_FLOAT_M1(0.0, vlmax);
vec3 = VFMVVF_FLOAT_M1(0.0, vlmax);
vr0 = VFMVVF_FLOAT(0.0, gvl);
vr1 = VFMVVF_FLOAT(0.0, gvl);
vr2 = VFMVVF_FLOAT(0.0, gvl);
vr3 = VFMVVF_FLOAT(0.0, gvl);
for (k = 0; k < m / gvl; k++) {
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
vx = VLEV_FLOAT(x + j, gvl);
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
// Floating-point addition does not satisfy the associative law, that is, (a + b) + c ≠ a + (b + c),
// so piecewise multiplication and reduction must be performed inside the loop body.
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
vr = VFMVVF_FLOAT(0, gvl);
for(k = 0; k < m/gvl; k++){
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLEV_FLOAT(&x[j], gvl);
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
j += gvl;
}
if (j < m) {
gvl = VSETVL(m - j);
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
vx = VLEV_FLOAT(x + j, gvl);
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
if(j < m){
gvl = VSETVL(m-j);
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLEV_FLOAT(&x[j], gvl);
vr = VFMULVV_FLOAT(va, vx, gvl);
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
}
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0));
*y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1));
*y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2));
*y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3));
}
// deal with the tail
for (; i < n; i++) {
v_res = VFMVVF_FLOAT_M1(0, vlmax);
gvl = VSETVL(m);
j = 0;
a_ptrs[0] = a + i * lda;
y_ptrs[0] = y + i * inc_y;
vr0 = VFMVVF_FLOAT(0, gvl);
for (k = 0; k < m / gvl; k++) {
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
vx = VLEV_FLOAT(x + j, gvl);
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
j += gvl;
}
if (j < m) {
gvl = VSETVL(m - j);
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
vx = VLEV_FLOAT(x + j, gvl);
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
}
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res));
}
#else
for(i = 0; i < n; i++){
v_res = VFMVVF_FLOAT_M1(0, 1);
gvl = VSETVL(m);
j = 0;
vr = VFMVVF_FLOAT(0, gvl);
for(k = 0; k < m/gvl; k++){
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLEV_FLOAT(&x[j], gvl);
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
j += gvl;
}
if(j < m){
gvl = VSETVL(m-j);
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLEV_FLOAT(&x[j], gvl);
vr = VFMULVV_FLOAT(va, vx, gvl);
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
}
temp = (FLOAT)EXTRACT_FLOAT(v_res);
y[iy] += alpha * temp;
temp = (FLOAT)EXTRACT_FLOAT(v_res);
y[iy] += alpha * temp;
iy += inc_y;
a_ptr += lda;
}
#endif
iy += inc_y;
a_ptr += lda;
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for(i = 0; i < n; i++){