mirror of
https://github.com/OpenMathLib/OpenBLAS
synced 2026-06-15 07:51:43 +08:00
Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B
Added HFLOAT16 support for RISCV64 Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B based on HFLOAT16 The instruction sets used are ZVFH and ZFH, which need to be supported by RVV1.0 Related to issue #5279 Co-authored-by Linjin Li <linjin_li@163.com>
This commit is contained in:
@@ -351,6 +351,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
if (BUILD_HFLOAT16)
|
||||
if (SHGEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMMINCOPY}" "" "${SHGEMMINCOPYOBJ}" false "" "" true "HFLOAT16")
|
||||
endif ()
|
||||
if (SHGEMMITCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMMITCOPY}" "" "${SHGEMMITCOPYOBJ}" false "" "" true "HFLOAT16")
|
||||
endif ()
|
||||
if (SHGEMMONCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMMONCOPY}" "" "${SHGEMMONCOPYOBJ}" false "" "" true "HFLOAT16")
|
||||
endif ()
|
||||
if (SHGEMMOTCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMMOTCOPY}" "" "${SHGEMMOTCOPYOBJ}" false "" "" true "HFLOAT16")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMMKERNEL}" "" "gemm_kernel" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_BETA}" "" "gemm_beta" false "" "" false "HFLOAT16")
|
||||
endif ()
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_char}GEMMINCOPY)
|
||||
@@ -769,6 +785,45 @@ endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
|
||||
if (BUILD_HFLOAT16)
|
||||
if (NOT DEFINED SHGEMM_SMALL_M_PERMIT)
|
||||
set(SHGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_NN)
|
||||
set(SHGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_NT)
|
||||
set(SHGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_TN)
|
||||
set(SHGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_TT)
|
||||
set(SHGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_B0_NN)
|
||||
set(SHGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_B0_NT)
|
||||
set(SHGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_B0_TN)
|
||||
set(SHGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SHGEMM_SMALL_K_B0_TT)
|
||||
set(SHGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "HFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "HFLOAT16")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED ${float_char}OMATCOPY_CN)
|
||||
|
||||
@@ -129,6 +129,26 @@ SBKERNELOBJS += \
|
||||
$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16), 1)
|
||||
ifndef SHGEMMKERNEL
|
||||
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SHKERNELOBJS += \
|
||||
shgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \
|
||||
$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
|
||||
SKERNELOBJS += \
|
||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -192,6 +212,9 @@ XKERNELOBJS += \
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SBBLASOBJS += $(SBKERNELOBJS)
|
||||
endif
|
||||
ifeq ($(BUILD_HFLOAT16),1)
|
||||
SHBLASOBJS += $(SHKERNELOBJS)
|
||||
endif
|
||||
SBLASOBJS += $(SKERNELOBJS)
|
||||
DBLASOBJS += $(DKERNELOBJS)
|
||||
QBLASOBJS += $(QKERNELOBJS)
|
||||
@@ -202,6 +225,9 @@ XBLASOBJS += $(XKERNELOBJS)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(BUILD_HFLOAT16),1)
|
||||
SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
|
||||
SBLASOBJS += \
|
||||
@@ -493,6 +519,15 @@ SBBLASOBJS += \
|
||||
sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16),1)
|
||||
SHBLASOBJS += \
|
||||
shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
|
||||
shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
|
||||
shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
|
||||
shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
|
||||
shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SBLASOBJS += \
|
||||
sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
|
||||
sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -599,6 +634,13 @@ SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16), 1)
|
||||
SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
endif
|
||||
|
||||
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
@@ -629,6 +671,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16),1)
|
||||
$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -671,6 +718,25 @@ $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16), 1)
|
||||
|
||||
$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
|
||||
|
||||
$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -853,6 +919,12 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMM
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16), 1)
|
||||
|
||||
$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
|
||||
@@ -2840,6 +2912,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16),1)
|
||||
$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -2873,6 +2950,23 @@ $(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16), 1)
|
||||
$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
|
||||
$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -2983,6 +3077,11 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEM
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16), 1)
|
||||
$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -4843,6 +4942,71 @@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMA
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HFLOAT16), 1)
|
||||
ifndef SHGEMM_SMALL_M_PERMIT
|
||||
SHGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
|
||||
endif
|
||||
|
||||
ifndef SHGEMM_SMALL_K_NN
|
||||
SHGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
|
||||
endif
|
||||
|
||||
ifndef SHGEMM_SMALL_K_NT
|
||||
SHGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
|
||||
endif
|
||||
|
||||
ifndef SHGEMM_SMALL_K_TN
|
||||
SHGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
|
||||
endif
|
||||
|
||||
ifndef SHGEMM_SMALL_K_TT
|
||||
SHGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
|
||||
endif
|
||||
|
||||
$(KDIR)shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_M_PERMIT)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NN)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NT)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TN)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TT)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifndef SHGEMM_SMALL_K_B0_NN
|
||||
SHGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
|
||||
endif
|
||||
|
||||
ifndef SHGEMM_SMALL_K_B0_NT
|
||||
SHGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
|
||||
endif
|
||||
|
||||
ifndef SHGEMM_SMALL_K_B0_TN
|
||||
SHGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
|
||||
endif
|
||||
|
||||
ifndef SHGEMM_SMALL_K_B0_TT
|
||||
SHGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
|
||||
endif
|
||||
|
||||
$(KDIR)shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NN)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
|
||||
|
||||
$(KDIR)shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NT)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
|
||||
|
||||
$(KDIR)shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TN)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
|
||||
|
||||
$(KDIR)shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TT)
|
||||
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
|
||||
endif
|
||||
|
||||
ifndef CGEMM_SMALL_M_PERMIT
|
||||
CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
|
||||
endif
|
||||
|
||||
@@ -245,3 +245,12 @@ endif
|
||||
ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = zgemm_beta_rvv.c
|
||||
endif
|
||||
|
||||
SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c
|
||||
SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
|
||||
SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
|
||||
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ifndef SHGEMM_BETA
|
||||
SHGEMM_BETA = gemm_beta_rvv.c
|
||||
endif
|
||||
@@ -207,3 +207,19 @@ COMATCOPY_CN = zomatcopy_cn_vector.c
|
||||
|
||||
DOMATCOPY_CN = omatcopy_cn_vector.c
|
||||
SOMATCOPY_CN = omatcopy_cn_vector.c
|
||||
|
||||
|
||||
SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c
|
||||
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
|
||||
SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c
|
||||
SHGEMMITCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_M).c
|
||||
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
|
||||
SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
|
||||
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ifndef SHGEMM_BETA
|
||||
SHGEMM_BETA = gemm_beta_rvv.c
|
||||
endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,6 @@
|
||||
|
||||
#include "common.h"
|
||||
#include <riscv_vector.h>
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc)
|
||||
{
|
||||
@@ -14,7 +15,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
for (BLASLONG i=0; i<M/8; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
_Float16 B0 = B[bi+0];
|
||||
_Float16 B1 = B[bi+1];
|
||||
@@ -50,17 +51,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
|
||||
ai += 16;
|
||||
ai += 8;
|
||||
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
|
||||
result4 = __riscv_vfwmacc_vf_f32m2(result4, A0, B4, gvl);
|
||||
result5 = __riscv_vfwmacc_vf_f32m2(result5, A0, B5, gvl);
|
||||
result6 = __riscv_vfwmacc_vf_f32m2(result6, A0, B6, gvl);
|
||||
result7 = __riscv_vfwmacc_vf_f32m2(result7, A0, B7, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
@@ -86,14 +87,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c7, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c3, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c4, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c5, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c6, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c7, gvl); ci += ldc-gvl*0;
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
@@ -332,10 +333,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
@@ -353,10 +354,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c3, gvl);
|
||||
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c3, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
@@ -521,8 +522,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
@@ -536,8 +537,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse16_v_f16m1( &C[ci], c1, gvl);
|
||||
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse32_v_f32m2( &C[ci], c1, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
@@ -604,7 +605,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
bi+=2;
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
@@ -665,7 +665,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
|
||||
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
@@ -677,7 +677,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse16_v_f16m1( &C[ci], c0, gvl);
|
||||
__riscv_vse32_v_f32m2( &C[ci], c0, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
|
||||
@@ -125,6 +125,23 @@ gotoblas_t TABLE_NAME = {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_HFLOAT16
|
||||
0, 0, 0,
|
||||
SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N,
|
||||
#ifdef SHGEMM_DEFAULT_UNROLL_MN
|
||||
SHGEMM_DEFAULT_UNROLL_MN,
|
||||
#else
|
||||
MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N),
|
||||
#endif
|
||||
shgemm_kernelTS, shgemm_betaTS,
|
||||
#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N
|
||||
shgemm_incopyTS, shgemm_itcopyTS,
|
||||
#else
|
||||
shgemm_oncopyTS, shgemm_otcopyTS,
|
||||
#endif
|
||||
shgemm_oncopyTS, shgemm_otcopyTS,
|
||||
#endif
|
||||
|
||||
#if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1)
|
||||
0, 0, 0,
|
||||
SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N,
|
||||
@@ -1252,6 +1269,9 @@ static void init_parameter(void) {
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||
#endif
|
||||
#ifdef BUILD_HFLOAT16
|
||||
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
@@ -1260,6 +1280,9 @@ static void init_parameter(void) {
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||
#endif
|
||||
#ifdef BUILD_HFLOAT16
|
||||
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
|
||||
@@ -1269,6 +1292,9 @@ static void init_parameter(void) {
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
#ifdef BUILD_HFLOAT16
|
||||
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
|
||||
@@ -1417,6 +1443,10 @@ static void init_parameter(void) {
|
||||
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
#ifdef BUILD_HFLOAT16
|
||||
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
|
||||
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
@@ -2012,6 +2042,13 @@ static void init_parameter(void) {
|
||||
) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15);
|
||||
#endif
|
||||
|
||||
#if BUILD_HFLOAT16==1
|
||||
TABLE_NAME.shgemm_r = (((BUFFER_SIZE -
|
||||
((TABLE_NAME.shgemm_p * TABLE_NAME.shgemm_q * 4 + TABLE_NAME.offsetA
|
||||
+ TABLE_NAME.align) & ~TABLE_NAME.align)
|
||||
) / (TABLE_NAME.shgemm_q * 4) - 15) & ~15);
|
||||
#endif
|
||||
|
||||
#if BUILD_SINGLE==1
|
||||
TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
|
||||
((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA
|
||||
|
||||
Reference in New Issue
Block a user