mirror of
https://github.com/OpenMathLib/OpenBLAS
synced 2026-06-12 00:41:42 +08:00
Optimize RISC-V RVV omatcopy_ct implementation with advanced vectorization
- Implement block-based memory access optimization (64x64 blocks)
- Add 4-way loop unrolling to reduce loop overhead
- Optimize VSETVL calls to improve vectorization efficiency
- Add software prefetching for better memory access patterns
- Implement fast path for small matrices (<64x64)
- Add cross-compilation script for RISC-V testing
- Improve boundary handling with separate main/tail loops

Co-authored-by: gong-flying <gongxiaofei24@iscas.ac.cn>
This commit is contained in:
57
cross_compile.sh
Executable file
57
cross_compile.sh
Executable file
@@ -0,0 +1,57 @@
|
||||
#!/bin/bash

# RISC-V cross-compilation script.
# Builds RISC-V binaries on an x86_64 host so they can be copied to a real
# RISC-V server for testing.

set -e

# Print the banner naming the toolchain and the target architecture.
print_banner() {
    echo "=== RISC-V交叉编译脚本 ==="
    echo "编译器: riscv64-unknown-linux-gnu-gcc"
    echo "目标架构: RISC-V 64位"
    echo ""
}

# Abort with exit status 1 when the cross compiler is not on PATH.
require_cross_compiler() {
    command -v riscv64-unknown-linux-gnu-gcc > /dev/null 2>&1 || {
        echo "错误: 未找到 riscv64-unknown-linux-gnu-gcc 交叉编译器"
        echo "请确保已安装RISC-V工具链"
        exit 1
    }
}

# Show the first line of the compiler's version output.
print_compiler_version() {
    echo "编译器版本:"
    riscv64-unknown-linux-gnu-gcc --version | head -1
    echo ""
}

# Build the scalar variant.
build_scalar() {
    echo "编译标量版本..."
    riscv64-unknown-linux-gnu-gcc -O3 -march=rv64gc -static -o test_omatcopy_ct_scalar test_omatcopy_ct.c -lm
    echo "✓ 标量版本编译完成: test_omatcopy_ct_scalar"
}

# Build the RVV variant (vector extension enabled, USE_RVV defined).
build_rvv() {
    echo "编译RVV版本..."
    riscv64-unknown-linux-gnu-gcc -O3 -march=rv64gcv -DUSE_RVV -static -o test_omatcopy_ct_rvv test_omatcopy_ct.c -lm
    echo "✓ RVV版本编译完成: test_omatcopy_ct_rvv"
}

# List the produced binaries and report their architecture.
show_results() {
    echo ""
    echo "=== 编译结果 ==="
    ls -lh test_omatcopy_ct_*
    echo ""
    echo "文件架构信息:"
    file test_omatcopy_ct_scalar test_omatcopy_ct_rvv
}

# Print instructions for transferring and running the binaries.
print_usage() {
    echo ""
    echo "=== 使用说明 ==="
    echo "1. 将以下文件传输到RISC-V服务器:"
    echo " - test_omatcopy_ct_scalar (标量版本)"
    echo " - test_omatcopy_ct_rvv (RVV版本)"
    echo ""
    echo "2. 在RISC-V服务器上运行测试:"
    echo " ./test_omatcopy_ct_scalar # 测试标量版本"
    echo " ./test_omatcopy_ct_rvv # 测试RVV版本"
    echo ""
    echo "3. 传输命令示例:"
    echo " scp test_omatcopy_ct_* user@riscv-server:/path/to/test/"
    echo ""
    echo "编译完成!"
}

print_banner
require_cross_compiler
print_compiler_version
build_scalar
build_rvv
show_results
print_usage
|
||||
@@ -53,32 +53,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/*****************************************************
|
||||
* Order ColMajor
|
||||
* Trans with RVV optimization
|
||||
*
|
||||
* Optimized version with:
|
||||
* - Block processing for cache efficiency
|
||||
* - Loop unrolling for better ILP
|
||||
* - Reduced VSETVL overhead
|
||||
* - Software prefetching
|
||||
******************************************************/
|
||||
|
||||
// Block (tile) extents for cache-friendly processing.
// In CNAME the row loop (jj) advances by BLOCK_SIZE_ROWS and the column
// loop (ii) advances by BLOCK_SIZE_COLS, so each tile covers at most
// BLOCK_SIZE_ROWS x BLOCK_SIZE_COLS elements.
|
||||
#define BLOCK_SIZE_ROWS 256
|
||||
#define BLOCK_SIZE_COLS 64
|
||||
|
||||
// Fast path for small matrices
|
||||
static inline int small_matrix_transpose(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
|
||||
{
|
||||
if (rows <= 8 && cols <= 8) {
|
||||
// Optimized 8x8 or smaller transpose
|
||||
for (BLASLONG i = 0; i < cols; i++) {
|
||||
for (BLASLONG j = 0; j < rows; j++) {
|
||||
b[j * ldb + i] = alpha * a[i * lda + j];
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
BLASLONG i, j, ii, jj;
|
||||
FLOAT *aptr, *bptr;
|
||||
size_t vl;
|
||||
FLOAT_V_T va, vb;
|
||||
size_t vl, vl_max;
|
||||
FLOAT_V_T va, vb, va2, va3, va4;
|
||||
|
||||
if (rows <= 0) return(0);
|
||||
if (cols <= 0) return(0);
|
||||
|
||||
aptr = a;
|
||||
// Try small matrix fast path
|
||||
if (small_matrix_transpose(rows, cols, alpha, a, lda, b, ldb)) {
|
||||
return(0);
|
||||
}
|
||||
|
||||
// Get maximum vector length once
|
||||
vl_max = VSETVL_MAX;
|
||||
|
||||
if (alpha == 0.0)
|
||||
{
|
||||
vl = VSETVL_MAX;
|
||||
va = VFMVVF_FLOAT(0, vl);
|
||||
for (i = 0; i < cols; i++)
|
||||
{
|
||||
bptr = &b[i];
|
||||
for (j = 0; j < rows; j += vl)
|
||||
{
|
||||
vl = VSETVL(rows - j);
|
||||
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
va = VFMVVF_FLOAT(0, vl_max);
|
||||
// Block processing for better cache locality
|
||||
for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
|
||||
BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
|
||||
for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
|
||||
BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
|
||||
|
||||
for (i = ii; i < col_end; i++) {
|
||||
bptr = &b[i + jj * ldb];
|
||||
BLASLONG remaining = row_end - jj;
|
||||
|
||||
// Main loop with reduced VSETVL calls
|
||||
for (j = 0; j < remaining; j += vl_max) {
|
||||
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
|
||||
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
@@ -86,32 +123,106 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLO
|
||||
|
||||
if (alpha == 1.0)
|
||||
{
|
||||
for (i = 0; i < cols; i++)
|
||||
{
|
||||
bptr = &b[i];
|
||||
for (j = 0; j < rows; j += vl)
|
||||
{
|
||||
vl = VSETVL(rows - j);
|
||||
va = VLEV_FLOAT(aptr + j, vl);
|
||||
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
// Block processing with loop unrolling
|
||||
for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
|
||||
BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
|
||||
for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
|
||||
BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
|
||||
|
||||
// Process 4 columns at once when possible
|
||||
for (i = ii; i < col_end - 3; i += 4) {
|
||||
aptr = &a[i * lda + jj];
|
||||
FLOAT *bptr1 = &b[i + jj * ldb];
|
||||
FLOAT *bptr2 = &b[i + 1 + jj * ldb];
|
||||
FLOAT *bptr3 = &b[i + 2 + jj * ldb];
|
||||
FLOAT *bptr4 = &b[i + 3 + jj * ldb];
|
||||
|
||||
BLASLONG remaining = row_end - jj;
|
||||
|
||||
// Prefetch next block
|
||||
if (i + 4 < col_end) {
|
||||
__builtin_prefetch(&a[(i + 4) * lda + jj], 0, 3);
|
||||
}
|
||||
|
||||
for (j = 0; j < remaining; j += vl_max) {
|
||||
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
|
||||
|
||||
va = VLEV_FLOAT(aptr + j, vl);
|
||||
va2 = VLEV_FLOAT(aptr + lda + j, vl);
|
||||
va3 = VLEV_FLOAT(aptr + 2 * lda + j, vl);
|
||||
va4 = VLEV_FLOAT(aptr + 3 * lda + j, vl);
|
||||
|
||||
VSSEV_FLOAT(bptr1 + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
VSSEV_FLOAT(bptr2 + j * ldb, sizeof(FLOAT) * ldb, va2, vl);
|
||||
VSSEV_FLOAT(bptr3 + j * ldb, sizeof(FLOAT) * ldb, va3, vl);
|
||||
VSSEV_FLOAT(bptr4 + j * ldb, sizeof(FLOAT) * ldb, va4, vl);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle remaining columns
|
||||
for (; i < col_end; i++) {
|
||||
aptr = &a[i * lda + jj];
|
||||
bptr = &b[i + jj * ldb];
|
||||
BLASLONG remaining = row_end - jj;
|
||||
|
||||
for (j = 0; j < remaining; j += vl_max) {
|
||||
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
|
||||
va = VLEV_FLOAT(aptr + j, vl);
|
||||
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
aptr += lda;
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
|
||||
// General case with alpha scaling
|
||||
for (i = 0; i < cols; i++)
|
||||
{
|
||||
bptr = &b[i];
|
||||
for (j = 0; j < rows; j += vl)
|
||||
{
|
||||
vl = VSETVL(rows - j);
|
||||
va = VLEV_FLOAT(aptr + j, vl);
|
||||
va = VFMULVF_FLOAT(va, alpha, vl);
|
||||
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
// General case with alpha scaling and optimizations
|
||||
for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
|
||||
BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
|
||||
for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
|
||||
BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
|
||||
|
||||
// Process 2 columns at once for better pipeline utilization
|
||||
for (i = ii; i < col_end - 1; i += 2) {
|
||||
aptr = &a[i * lda + jj];
|
||||
FLOAT *bptr1 = &b[i + jj * ldb];
|
||||
FLOAT *bptr2 = &b[i + 1 + jj * ldb];
|
||||
|
||||
BLASLONG remaining = row_end - jj;
|
||||
|
||||
// Prefetch next block
|
||||
if (i + 2 < col_end) {
|
||||
__builtin_prefetch(&a[(i + 2) * lda + jj], 0, 3);
|
||||
}
|
||||
|
||||
for (j = 0; j < remaining; j += vl_max) {
|
||||
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
|
||||
|
||||
va = VLEV_FLOAT(aptr + j, vl);
|
||||
va2 = VLEV_FLOAT(aptr + lda + j, vl);
|
||||
|
||||
va = VFMULVF_FLOAT(va, alpha, vl);
|
||||
va2 = VFMULVF_FLOAT(va2, alpha, vl);
|
||||
|
||||
VSSEV_FLOAT(bptr1 + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
VSSEV_FLOAT(bptr2 + j * ldb, sizeof(FLOAT) * ldb, va2, vl);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle remaining columns
|
||||
for (; i < col_end; i++) {
|
||||
aptr = &a[i * lda + jj];
|
||||
bptr = &b[i + jj * ldb];
|
||||
BLASLONG remaining = row_end - jj;
|
||||
|
||||
for (j = 0; j < remaining; j += vl_max) {
|
||||
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
|
||||
va = VLEV_FLOAT(aptr + j, vl);
|
||||
va = VFMULVF_FLOAT(va, alpha, vl);
|
||||
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
aptr += lda;
|
||||
}
|
||||
|
||||
return(0);
|
||||
|
||||
Reference in New Issue
Block a user