Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B

Added HFLOAT16 support for RISCV64
Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B based on HFLOAT16
The instruction sets used are ZVFH and ZFH, which need to be supported by RVV1.0

Related to issue #5279
Co-authored-by Linjin Li <linjin_li@163.com>
This commit is contained in:
gkdddd
2025-06-03 20:14:30 +08:00
parent 0a967797a1
commit 670ec6f757
46 changed files with 39890 additions and 620 deletions

View File

@@ -351,6 +351,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
endif ()
if (BUILD_HFLOAT16)
if (SHGEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMINCOPY}" "" "${SHGEMMINCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
if (SHGEMMITCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMITCOPY}" "" "${SHGEMMITCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
if (SHGEMMONCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMONCOPY}" "" "${SHGEMMONCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
if (SHGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMOTCOPY}" "" "${SHGEMMOTCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
GenerateNamedObjects("${KERNELDIR}/${SHGEMMKERNEL}" "" "gemm_kernel" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_BETA}" "" "gemm_beta" false "" "" false "HFLOAT16")
endif ()
foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_char}GEMMINCOPY)
@@ -769,6 +785,45 @@ endif ()
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
endif ()
if (BUILD_HFLOAT16)
if (NOT DEFINED SHGEMM_SMALL_M_PERMIT)
set(SHGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_NN)
set(SHGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_NT)
set(SHGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_TN)
set(SHGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_TT)
set(SHGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_NN)
set(SHGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_NT)
set(SHGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_TN)
set(SHGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_TT)
set(SHGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "HFLOAT16")
endif ()
endif ()
if (NOT DEFINED ${float_char}OMATCOPY_CN)

View File

@@ -129,6 +129,26 @@ SBKERNELOBJS += \
$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
endif
ifeq ($(BUILD_HFLOAT16), 1)
ifndef SHGEMMKERNEL
SHGEMM_BETA = ../generic/gemm_beta.c
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
SHKERNELOBJS += \
shgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \
$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ)
endif
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
SKERNELOBJS += \
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
@@ -192,6 +212,9 @@ XKERNELOBJS += \
ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += $(SBKERNELOBJS)
endif
ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += $(SHKERNELOBJS)
endif
SBLASOBJS += $(SKERNELOBJS)
DBLASOBJS += $(DKERNELOBJS)
QBLASOBJS += $(QKERNELOBJS)
@@ -202,6 +225,9 @@ XBLASOBJS += $(XKERNELOBJS)
ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX)
endif
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
SBLASOBJS += \
@@ -493,6 +519,15 @@ SBBLASOBJS += \
sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += \
shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
endif
SBLASOBJS += \
sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
@@ -599,6 +634,13 @@ SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
endif
ifeq ($(BUILD_HFLOAT16), 1)
SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
endif
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
@@ -629,6 +671,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
ifeq ($(BUILD_HFLOAT16),1)
$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -671,6 +718,25 @@ $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
endif
endif
ifeq ($(BUILD_HFLOAT16), 1)
$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
endif
$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -853,6 +919,12 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMM
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
ifeq ($(BUILD_HFLOAT16), 1)
$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
@@ -2840,6 +2912,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
ifeq ($(BUILD_HFLOAT16),1)
$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
@@ -2873,6 +2950,23 @@ $(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
endif
endif
ifeq ($(BUILD_HFLOAT16), 1)
$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
endif
$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -2983,6 +3077,11 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEM
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
ifeq ($(BUILD_HFLOAT16), 1)
$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -4843,6 +4942,71 @@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMA
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
endif
ifeq ($(BUILD_HFLOAT16), 1)
ifndef SHGEMM_SMALL_M_PERMIT
SHGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif
ifndef SHGEMM_SMALL_K_NN
SHGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SHGEMM_SMALL_K_NT
SHGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SHGEMM_SMALL_K_TN
SHGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SHGEMM_SMALL_K_TT
SHGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
$(KDIR)shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_M_PERMIT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
ifndef SHGEMM_SMALL_K_B0_NN
SHGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SHGEMM_SMALL_K_B0_NT
SHGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SHGEMM_SMALL_K_B0_TN
SHGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SHGEMM_SMALL_K_B0_TT
SHGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
$(KDIR)shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
endif
ifndef CGEMM_SMALL_M_PERMIT
CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
endif

View File

@@ -245,3 +245,12 @@ endif
ifndef ZGEMM_BETA
ZGEMM_BETA = zgemm_beta_rvv.c
endif
SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c
SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifndef SHGEMM_BETA
SHGEMM_BETA = gemm_beta_rvv.c
endif

View File

@@ -207,3 +207,19 @@ COMATCOPY_CN = zomatcopy_cn_vector.c
DOMATCOPY_CN = omatcopy_cn_vector.c
SOMATCOPY_CN = omatcopy_cn_vector.c
SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c
SHGEMMITCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_M).c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifndef SHGEMM_BETA
SHGEMM_BETA = gemm_beta_rvv.c
endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,6 @@
#include "common.h"
#include <riscv_vector.h>
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc)
{
@@ -14,7 +15,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
BLASLONG bi=n_top*K;
_Float16 B0 = B[bi+0];
_Float16 B1 = B[bi+1];
@@ -50,17 +51,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
bi += 8;
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 16;
ai += 8;
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
result4 = __riscv_vfwmacc_vf_f32m2(result4, A0, B4, gvl);
result5 = __riscv_vfwmacc_vf_f32m2(result5, A0, B5, gvl);
result6 = __riscv_vfwmacc_vf_f32m2(result6, A0, B6, gvl);
result7 = __riscv_vfwmacc_vf_f32m2(result7, A0, B7, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
}
@@ -86,14 +87,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci = n_top * ldc + m_top;
__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c7, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c3, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c4, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c5, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c6, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c7, gvl); ci += ldc-gvl*0;
m_top += 8;
}
@@ -332,10 +333,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 8;
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
}
@@ -353,10 +354,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci = n_top * ldc + m_top;
__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c3, gvl);
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c3, gvl);
m_top += 8;
}
@@ -521,8 +522,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 8;
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
}
@@ -536,8 +537,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci = n_top * ldc + m_top;
__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c1, gvl);
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c1, gvl);
m_top += 8;
}
@@ -604,7 +605,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
bi+=2;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1;
@@ -665,7 +665,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 8;
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
}
@@ -677,7 +677,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
ci = n_top * ldc + m_top;
__riscv_vse16_v_f16m1( &C[ci], c0, gvl);
__riscv_vse32_v_f32m2( &C[ci], c0, gvl);
m_top += 8;
}

View File

@@ -125,6 +125,23 @@ gotoblas_t TABLE_NAME = {
#endif
#endif
#ifdef BUILD_HFLOAT16
0, 0, 0,
SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N,
#ifdef SHGEMM_DEFAULT_UNROLL_MN
SHGEMM_DEFAULT_UNROLL_MN,
#else
MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N),
#endif
shgemm_kernelTS, shgemm_betaTS,
#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N
shgemm_incopyTS, shgemm_itcopyTS,
#else
shgemm_oncopyTS, shgemm_otcopyTS,
#endif
shgemm_oncopyTS, shgemm_otcopyTS,
#endif
#if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1)
0, 0, 0,
SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N,
@@ -1252,6 +1269,9 @@ static void init_parameter(void) {
#ifdef BUILD_BFLOAT16
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
#endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
@@ -1260,6 +1280,9 @@ static void init_parameter(void) {
#ifdef BUILD_BFLOAT16
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
#endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
#endif
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
@@ -1269,6 +1292,9 @@ static void init_parameter(void) {
#ifdef BUILD_BFLOAT16
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
#endif
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
@@ -1417,6 +1443,10 @@ static void init_parameter(void) {
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
#endif
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
#endif
@@ -2012,6 +2042,13 @@ static void init_parameter(void) {
) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15);
#endif
#if BUILD_HFLOAT16==1
TABLE_NAME.shgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.shgemm_p * TABLE_NAME.shgemm_q * 4 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.shgemm_q * 4) - 15) & ~15);
#endif
#if BUILD_SINGLE==1
TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA