mirror of
https://github.com/OpenMathLib/OpenBLAS
synced 2026-05-31 00:45:48 +08:00
Merge pull request #5225 from annop-w/gemv_n
Improve performance for SGEMVN on NEONVERSEN1
This commit is contained in:
@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
SGEMVNKERNEL = sgemv_n_neon.c
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
219
kernel/arm64/sgemv_n_neon.c
Normal file
219
kernel/arm64/sgemv_n_neon.c
Normal file
@@ -0,0 +1,219 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2025, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i;
|
||||
BLASLONG ix,iy;
|
||||
BLASLONG j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT temp;
|
||||
|
||||
ix = 0;
|
||||
a_ptr = a;
|
||||
|
||||
if (inc_x == 1 && inc_y == 1) {
|
||||
FLOAT *a0_ptr = a + lda * 0;
|
||||
FLOAT *a1_ptr = a + lda * 1;
|
||||
FLOAT *a2_ptr = a + lda * 2;
|
||||
FLOAT *a3_ptr = a + lda * 3;
|
||||
FLOAT *a4_ptr = a + lda * 4;
|
||||
FLOAT *a5_ptr = a + lda * 5;
|
||||
FLOAT *a6_ptr = a + lda * 6;
|
||||
FLOAT *a7_ptr = a + lda * 7;
|
||||
|
||||
j = 0;
|
||||
while (j + 3 < n) {
|
||||
float32x4_t x0_vec = vld1q_f32(x + j);
|
||||
x0_vec = vmulq_n_f32(x0_vec, alpha);
|
||||
i = 0;
|
||||
while (i + 7 < m) {
|
||||
float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
|
||||
float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
|
||||
float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
|
||||
float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
|
||||
float32x4_t a20_vec = vld1q_f32(a2_ptr + i);
|
||||
float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4);
|
||||
float32x4_t a30_vec = vld1q_f32(a3_ptr + i);
|
||||
float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4);
|
||||
|
||||
float32x4_t y0_vec = vld1q_f32(y + i);
|
||||
float32x4_t y1_vec = vld1q_f32(y + i + 4);
|
||||
y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0);
|
||||
y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1);
|
||||
y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2);
|
||||
y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3);
|
||||
y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0);
|
||||
y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1);
|
||||
y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2);
|
||||
y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3);
|
||||
|
||||
vst1q_f32(y + i, y0_vec);
|
||||
vst1q_f32(y + i + 4, y1_vec);
|
||||
|
||||
i += 8;
|
||||
}
|
||||
while (i + 3 < m) {
|
||||
float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
|
||||
float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
|
||||
float32x4_t a2_vec = vld1q_f32(a2_ptr + i);
|
||||
float32x4_t a3_vec = vld1q_f32(a3_ptr + i);
|
||||
|
||||
float32x4_t y_vec = vld1q_f32(y + i);
|
||||
y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0);
|
||||
y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1);
|
||||
y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2);
|
||||
y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3);
|
||||
|
||||
vst1q_f32(y + i, y_vec);
|
||||
|
||||
i += 4;
|
||||
}
|
||||
while (i + 1 < m) {
|
||||
float32x2_t a0_vec = vld1_f32(a0_ptr + i);
|
||||
float32x2_t a1_vec = vld1_f32(a1_ptr + i);
|
||||
float32x2_t a2_vec = vld1_f32(a2_ptr + i);
|
||||
float32x2_t a3_vec = vld1_f32(a3_ptr + i);
|
||||
|
||||
float32x2_t y_vec = vld1_f32(y + i);
|
||||
y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0);
|
||||
y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1);
|
||||
y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2);
|
||||
y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3);
|
||||
|
||||
vst1_f32(y + i, y_vec);
|
||||
|
||||
i += 2;
|
||||
}
|
||||
while (i < m) {
|
||||
y[i] += a0_ptr[i] * x0_vec[0];
|
||||
y[i] += a1_ptr[i] * x0_vec[1];
|
||||
y[i] += a2_ptr[i] * x0_vec[2];
|
||||
y[i] += a3_ptr[i] * x0_vec[3];
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
a0_ptr += lda * 4;
|
||||
a1_ptr += lda * 4;
|
||||
a2_ptr += lda * 4;
|
||||
a3_ptr += lda * 4;
|
||||
|
||||
j += 4;
|
||||
}
|
||||
while (j + 1 < n) {
|
||||
float32x2_t x0_vec = vld1_f32(x + j);
|
||||
x0_vec = vmul_n_f32(x0_vec, alpha);
|
||||
i = 0;
|
||||
while (i + 7 < m) {
|
||||
float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
|
||||
float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
|
||||
float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
|
||||
float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
|
||||
|
||||
float32x4_t y0_vec = vld1q_f32(y + i);
|
||||
float32x4_t y1_vec = vld1q_f32(y + i + 4);
|
||||
y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0);
|
||||
y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1);
|
||||
y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0);
|
||||
y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1);
|
||||
|
||||
vst1q_f32(y + i, y0_vec);
|
||||
vst1q_f32(y + i + 4, y1_vec);
|
||||
|
||||
i += 8;
|
||||
}
|
||||
while (i + 3 < m) {
|
||||
float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
|
||||
float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
|
||||
|
||||
float32x4_t y_vec = vld1q_f32(y + i);
|
||||
y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0);
|
||||
y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1);
|
||||
|
||||
vst1q_f32(y + i, y_vec);
|
||||
|
||||
i += 4;
|
||||
}
|
||||
while (i + 1 < m) {
|
||||
float32x2_t a0_vec = vld1_f32(a0_ptr + i);
|
||||
float32x2_t a1_vec = vld1_f32(a1_ptr + i);
|
||||
|
||||
float32x2_t y_vec = vld1_f32(y + i);
|
||||
y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0);
|
||||
y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1);
|
||||
|
||||
vst1_f32(y + i, y_vec);
|
||||
|
||||
i += 2;
|
||||
}
|
||||
while (i < m) {
|
||||
y[i] += a0_ptr[i] * x0_vec[0];
|
||||
y[i] += a1_ptr[i] * x0_vec[1];
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
a0_ptr += lda * 2;
|
||||
a1_ptr += lda * 2;
|
||||
|
||||
j += 2;
|
||||
}
|
||||
while (j < n) {
|
||||
i = 0;
|
||||
temp = alpha * x[j];
|
||||
while (i < m) {
|
||||
y[i] += a0_ptr[i] * temp;
|
||||
i++;
|
||||
}
|
||||
|
||||
a0_ptr += lda;
|
||||
j++;
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
for (j = 0; j < n; j++) {
|
||||
temp = alpha * x[ix];
|
||||
iy = 0;
|
||||
for (i = 0; i < m; i++) {
|
||||
y[iy] += temp * a_ptr[i];
|
||||
iy += inc_y;
|
||||
}
|
||||
a_ptr += lda;
|
||||
ix += inc_x;
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
Reference in New Issue
Block a user