Optimize RISC-V RVV omatcopy_ct implementation with advanced vectorization

- Implement block-based memory access optimization (blocks of 256 rows x 64 columns)
- Add 4-way loop unrolling for alpha == 1 (2-way for the general alpha-scaling path) to reduce loop overhead
- Optimize VSETVL calls to improve vectorization efficiency
- Add software prefetching for better memory access patterns
- Implement scalar fast path for small matrices (at most 8x8)
- Add cross-compilation script for RISC-V testing
- Improve boundary handling with separate main/tail loops

Co-authored-by: gong-flying <gongxiaofei24@iscas.ac.cn>
This commit is contained in:
Dayuxiaoshui
2025-09-11 20:01:39 +08:00
parent 7aa183bb56
commit bd45b82ed0
2 changed files with 202 additions and 34 deletions

57
cross_compile.sh Executable file
View File

@@ -0,0 +1,57 @@
#!/bin/bash
# RISC-V cross-compilation script.
#
# Builds a scalar and an RVV variant of the omatcopy_ct test program on an
# x86_64 host so the resulting static binaries can be copied to a real RISC-V
# server and run there.
#
# The compiler defaults to riscv64-unknown-linux-gnu-gcc. Set CROSS_CC to use
# a differently named toolchain, e.g.:
#     CROSS_CC=riscv64-linux-gnu-gcc ./cross_compile.sh

set -eu

CROSS_CC="${CROSS_CC:-riscv64-unknown-linux-gnu-gcc}"
SRC="test_omatcopy_ct.c"
SCALAR_BIN="test_omatcopy_ct_scalar"
RVV_BIN="test_omatcopy_ct_rvv"

echo "=== RISC-V交叉编译脚本 ==="
echo "编译器: ${CROSS_CC}"
echo "目标架构: RISC-V 64位"
echo ""

# Make sure the cross compiler is available before doing anything else.
if ! command -v "${CROSS_CC}" > /dev/null 2>&1; then
    echo "错误: 未找到 ${CROSS_CC} 交叉编译器" >&2
    echo "请确保已安装RISC-V工具链" >&2
    exit 1
fi

# Fail early with a clear message instead of a raw compiler error when the
# test source is not in the current directory.
if [ ! -f "${SRC}" ]; then
    echo "错误: 未找到源文件 ${SRC}" >&2
    exit 1
fi

echo "编译器版本:"
"${CROSS_CC}" --version | head -1
echo ""

# Build the scalar variant (no vector extension in -march).
echo "编译标量版本..."
"${CROSS_CC}" -O3 -march=rv64gc -static \
    -o "${SCALAR_BIN}" "${SRC}" -lm
echo "✓ 标量版本编译完成: ${SCALAR_BIN}"

# Build the RVV variant (vector extension enabled, USE_RVV defined).
echo "编译RVV版本..."
"${CROSS_CC}" -O3 -march=rv64gcv -DUSE_RVV -static \
    -o "${RVV_BIN}" "${SRC}" -lm
echo "✓ RVV版本编译完成: ${RVV_BIN}"

# Show what was produced.
echo ""
echo "=== 编译结果 ==="
ls -lh test_omatcopy_ct_*
echo ""

# `file` is purely informational; skip it when it is not installed rather
# than aborting under `set -e` after both builds already succeeded.
if command -v file > /dev/null 2>&1; then
    echo "文件架构信息:"
    file "${SCALAR_BIN}" "${RVV_BIN}"
    echo ""
fi

# Usage instructions for running the binaries on the target machine.
echo "=== 使用说明 ==="
echo "1. 将以下文件传输到RISC-V服务器:"
echo " - test_omatcopy_ct_scalar (标量版本)"
echo " - test_omatcopy_ct_rvv (RVV版本)"
echo ""
echo "2. 在RISC-V服务器上运行测试:"
echo " ./test_omatcopy_ct_scalar # 测试标量版本"
echo " ./test_omatcopy_ct_rvv # 测试RVV版本"
echo ""
echo "3. 传输命令示例:"
echo " scp test_omatcopy_ct_* user@riscv-server:/path/to/test/"
echo ""
echo "编译完成!"

View File

@@ -53,32 +53,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*****************************************************
* Order ColMajor
* Trans with RVV optimization
*
* Optimized version with:
* - Block processing for cache efficiency
* - Loop unrolling for better ILP
* - Reduced VSETVL overhead
* - Software prefetching
******************************************************/
// Block size for cache-friendly processing
#define BLOCK_SIZE_ROWS 256
#define BLOCK_SIZE_COLS 64
/*
 * Scalar fast path for tiny matrices.
 *
 * When both rows and cols are at most 8, stores
 *     b[j * ldb + i] = alpha * a[i * lda + j]
 * for every 0 <= i < cols and 0 <= j < rows, and returns 1.
 * For any larger shape nothing is written and 0 is returned so the caller
 * can continue with the vector path.
 *
 * alpha == 0 is handled explicitly: b is filled with zeros and a is never
 * read. This matches the alpha == 0 branch of the vector path in CNAME;
 * multiplying instead would turn NaN/Inf entries of a into NaN in b, so the
 * result would depend on which path the matrix size happened to select.
 */
static inline int small_matrix_transpose(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    /* Largest dimension (inclusive) served by the scalar path. */
    enum { SMALL_DIM_MAX = 8 };

    if (rows > SMALL_DIM_MAX || cols > SMALL_DIM_MAX) {
        return 0;
    }

    if (alpha == 0.0) {
        /* Pure zero fill: do not touch a at all. */
        for (BLASLONG i = 0; i < cols; i++) {
            for (BLASLONG j = 0; j < rows; j++) {
                b[j * ldb + i] = 0.0;
            }
        }
        return 1;
    }

    /* Column i of a (contiguous, stride lda between columns) becomes
     * row i of b (elements ldb apart). */
    for (BLASLONG i = 0; i < cols; i++) {
        const FLOAT *src = a + i * lda;
        FLOAT *dst = b + i;
        for (BLASLONG j = 0; j < rows; j++) {
            dst[j * ldb] = alpha * src[j];
        }
    }
    return 1;
}
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
BLASLONG i, j;
BLASLONG i, j, ii, jj;
FLOAT *aptr, *bptr;
size_t vl;
FLOAT_V_T va, vb;
size_t vl, vl_max;
FLOAT_V_T va, vb, va2, va3, va4;
if (rows <= 0) return(0);
if (cols <= 0) return(0);
aptr = a;
// Try small matrix fast path
if (small_matrix_transpose(rows, cols, alpha, a, lda, b, ldb)) {
return(0);
}
// Get maximum vector length once
vl_max = VSETVL_MAX;
if (alpha == 0.0)
{
vl = VSETVL_MAX;
va = VFMVVF_FLOAT(0, vl);
for (i = 0; i < cols; i++)
{
bptr = &b[i];
for (j = 0; j < rows; j += vl)
{
vl = VSETVL(rows - j);
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
va = VFMVVF_FLOAT(0, vl_max);
// Block processing for better cache locality
for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
for (i = ii; i < col_end; i++) {
bptr = &b[i + jj * ldb];
BLASLONG remaining = row_end - jj;
// Main loop with reduced VSETVL calls
for (j = 0; j < remaining; j += vl_max) {
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
}
}
}
}
return(0);
@@ -86,32 +123,106 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLO
if (alpha == 1.0)
{
for (i = 0; i < cols; i++)
{
bptr = &b[i];
for (j = 0; j < rows; j += vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
// Block processing with loop unrolling
for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
// Process 4 columns at once when possible
for (i = ii; i < col_end - 3; i += 4) {
aptr = &a[i * lda + jj];
FLOAT *bptr1 = &b[i + jj * ldb];
FLOAT *bptr2 = &b[i + 1 + jj * ldb];
FLOAT *bptr3 = &b[i + 2 + jj * ldb];
FLOAT *bptr4 = &b[i + 3 + jj * ldb];
BLASLONG remaining = row_end - jj;
// Prefetch next block
if (i + 4 < col_end) {
__builtin_prefetch(&a[(i + 4) * lda + jj], 0, 3);
}
for (j = 0; j < remaining; j += vl_max) {
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
va = VLEV_FLOAT(aptr + j, vl);
va2 = VLEV_FLOAT(aptr + lda + j, vl);
va3 = VLEV_FLOAT(aptr + 2 * lda + j, vl);
va4 = VLEV_FLOAT(aptr + 3 * lda + j, vl);
VSSEV_FLOAT(bptr1 + j * ldb, sizeof(FLOAT) * ldb, va, vl);
VSSEV_FLOAT(bptr2 + j * ldb, sizeof(FLOAT) * ldb, va2, vl);
VSSEV_FLOAT(bptr3 + j * ldb, sizeof(FLOAT) * ldb, va3, vl);
VSSEV_FLOAT(bptr4 + j * ldb, sizeof(FLOAT) * ldb, va4, vl);
}
}
// Handle remaining columns
for (; i < col_end; i++) {
aptr = &a[i * lda + jj];
bptr = &b[i + jj * ldb];
BLASLONG remaining = row_end - jj;
for (j = 0; j < remaining; j += vl_max) {
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
va = VLEV_FLOAT(aptr + j, vl);
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
}
}
}
aptr += lda;
}
return(0);
}
// General case with alpha scaling
for (i = 0; i < cols; i++)
{
bptr = &b[i];
for (j = 0; j < rows; j += vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
va = VFMULVF_FLOAT(va, alpha, vl);
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
// General case with alpha scaling and optimizations
for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
// Process 2 columns at once for better pipeline utilization
for (i = ii; i < col_end - 1; i += 2) {
aptr = &a[i * lda + jj];
FLOAT *bptr1 = &b[i + jj * ldb];
FLOAT *bptr2 = &b[i + 1 + jj * ldb];
BLASLONG remaining = row_end - jj;
// Prefetch next block
if (i + 2 < col_end) {
__builtin_prefetch(&a[(i + 2) * lda + jj], 0, 3);
}
for (j = 0; j < remaining; j += vl_max) {
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
va = VLEV_FLOAT(aptr + j, vl);
va2 = VLEV_FLOAT(aptr + lda + j, vl);
va = VFMULVF_FLOAT(va, alpha, vl);
va2 = VFMULVF_FLOAT(va2, alpha, vl);
VSSEV_FLOAT(bptr1 + j * ldb, sizeof(FLOAT) * ldb, va, vl);
VSSEV_FLOAT(bptr2 + j * ldb, sizeof(FLOAT) * ldb, va2, vl);
}
}
// Handle remaining columns
for (; i < col_end; i++) {
aptr = &a[i * lda + jj];
bptr = &b[i + jj * ldb];
BLASLONG remaining = row_end - jj;
for (j = 0; j < remaining; j += vl_max) {
vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
va = VLEV_FLOAT(aptr + j, vl);
va = VFMULVF_FLOAT(va, alpha, vl);
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
}
}
}
aptr += lda;
}
return(0);