mirror of
https://github.com/OpenMathLib/OpenBLAS
synced 2026-05-31 00:45:48 +08:00
riscv: gemv_t_vector.c optimize
This commit is contained in:
@@ -72,123 +72,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||
FLOAT_V_T_M1 v_res;
|
||||
size_t vlmax = VSETVL_MAX_M1();
|
||||
|
||||
#ifndef RISCV_0p10_INTRINSICS
|
||||
FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3;
|
||||
FLOAT_V_T_M1 vec0, vec1, vec2, vec3;
|
||||
FLOAT *a_ptrs[4], *y_ptrs[4];
|
||||
#endif
|
||||
|
||||
if(inc_x == 1){
|
||||
#ifndef RISCV_0p10_INTRINSICS
|
||||
BLASLONG anr = n - n % 4;
|
||||
for (; i < anr; i += 4) {
|
||||
|
||||
for(i = 0; i < n; i++){
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
gvl = VSETVL(m);
|
||||
j = 0;
|
||||
for (int l = 0; l < 4; l++) {
|
||||
a_ptrs[l] = a + (i + l) * lda;
|
||||
y_ptrs[l] = y + (i + l) * inc_y;
|
||||
}
|
||||
vec0 = VFMVVF_FLOAT_M1(0.0, vlmax);
|
||||
vec1 = VFMVVF_FLOAT_M1(0.0, vlmax);
|
||||
vec2 = VFMVVF_FLOAT_M1(0.0, vlmax);
|
||||
vec3 = VFMVVF_FLOAT_M1(0.0, vlmax);
|
||||
vr0 = VFMVVF_FLOAT(0.0, gvl);
|
||||
vr1 = VFMVVF_FLOAT(0.0, gvl);
|
||||
vr2 = VFMVVF_FLOAT(0.0, gvl);
|
||||
vr3 = VFMVVF_FLOAT(0.0, gvl);
|
||||
for (k = 0; k < m / gvl; k++) {
|
||||
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
|
||||
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
|
||||
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
|
||||
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
|
||||
|
||||
vx = VLEV_FLOAT(x + j, gvl);
|
||||
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
|
||||
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
|
||||
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
|
||||
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
|
||||
// Floating-point addition does not satisfy the associative law, that is, (a + b) + c ≠ a + (b + c),
|
||||
// so piecewise multiplication and reduction must be performed inside the loop body.
|
||||
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
|
||||
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
|
||||
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
|
||||
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
for(k = 0; k < m/gvl; k++){
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
|
||||
j += gvl;
|
||||
}
|
||||
if (j < m) {
|
||||
gvl = VSETVL(m - j);
|
||||
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
|
||||
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
|
||||
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
|
||||
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
|
||||
|
||||
vx = VLEV_FLOAT(x + j, gvl);
|
||||
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
|
||||
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
|
||||
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
|
||||
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
|
||||
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
|
||||
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
|
||||
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
|
||||
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
|
||||
if(j < m){
|
||||
gvl = VSETVL(m-j);
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
|
||||
}
|
||||
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0));
|
||||
*y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1));
|
||||
*y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2));
|
||||
*y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3));
|
||||
}
|
||||
// deal with the tail
|
||||
for (; i < n; i++) {
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
gvl = VSETVL(m);
|
||||
j = 0;
|
||||
a_ptrs[0] = a + i * lda;
|
||||
y_ptrs[0] = y + i * inc_y;
|
||||
vr0 = VFMVVF_FLOAT(0, gvl);
|
||||
for (k = 0; k < m / gvl; k++) {
|
||||
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
|
||||
vx = VLEV_FLOAT(x + j, gvl);
|
||||
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
|
||||
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
if (j < m) {
|
||||
gvl = VSETVL(m - j);
|
||||
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
|
||||
vx = VLEV_FLOAT(x + j, gvl);
|
||||
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
|
||||
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
|
||||
}
|
||||
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res));
|
||||
}
|
||||
#else
|
||||
for(i = 0; i < n; i++){
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
gvl = VSETVL(m);
|
||||
j = 0;
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
for(k = 0; k < m/gvl; k++){
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
|
||||
j += gvl;
|
||||
}
|
||||
if(j < m){
|
||||
gvl = VSETVL(m-j);
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
|
||||
}
|
||||
temp = (FLOAT)EXTRACT_FLOAT(v_res);
|
||||
y[iy] += alpha * temp;
|
||||
temp = (FLOAT)EXTRACT_FLOAT(v_res);
|
||||
y[iy] += alpha * temp;
|
||||
|
||||
|
||||
iy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
#endif
|
||||
iy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
} else {
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
for(i = 0; i < n; i++){
|
||||
|
||||
Reference in New Issue
Block a user