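Fix _Float16 casting issue (convert each operand to float before multiplying, instead of casting the _Float16 product) and reduce LMUL for certain vector instructions from m2 to m1 (fp16 source operands from m1 to mf2).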

2026-05-31 00:45:48 +08:00 · 2025-09-18 21:30:22 +00:00
parent 79a1f38770
commit a4abf7828e
2 changed files with 175 additions and 175 deletions
--- a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
+++ b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
@@ -295,22 +295,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;
            
            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
-                result2+=(float)(A[ai+0]*B[bi+1]);
-                result3+=(float)(A[ai+1]*B[bi+1]);
-                result4+=(float)(A[ai+0]*B[bi+2]);
-                result5+=(float)(A[ai+1]*B[bi+2]);
-                result6+=(float)(A[ai+0]*B[bi+3]);
-                result7+=(float)(A[ai+1]*B[bi+3]);
-                result8+=(float)(A[ai+0]*B[bi+4]);
-                result9+=(float)(A[ai+1]*B[bi+4]);
-                result10+=(float)(A[ai+0]*B[bi+5]);
-                result11+=(float)(A[ai+1]*B[bi+5]);
-                result12+=(float)(A[ai+0]*B[bi+6]);
-                result13+=(float)(A[ai+1]*B[bi+6]);
-                result14+=(float)(A[ai+0]*B[bi+7]);
-                result15+=(float)(A[ai+1]*B[bi+7]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result3+=(float)(A[ai+1])*(float)(B[bi+1]);
+                result4+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result5+=(float)(A[ai+1])*(float)(B[bi+2]);
+                result6+=(float)(A[ai+0])*(float)(B[bi+3]);
+                result7+=(float)(A[ai+1])*(float)(B[bi+3]);
+                result8+=(float)(A[ai+0])*(float)(B[bi+4]);
+                result9+=(float)(A[ai+1])*(float)(B[bi+4]);
+                result10+=(float)(A[ai+0])*(float)(B[bi+5]);
+                result11+=(float)(A[ai+1])*(float)(B[bi+5]);
+                result12+=(float)(A[ai+0])*(float)(B[bi+6]);
+                result13+=(float)(A[ai+1])*(float)(B[bi+6]);
+                result14+=(float)(A[ai+0])*(float)(B[bi+7]);
+                result15+=(float)(A[ai+1])*(float)(B[bi+7]);
                ai+=2;
                bi+=8;
            }
@@ -353,14 +353,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+0]*B[bi+1]);
-                result2+=(float)(A[ai+0]*B[bi+2]);
-                result3+=(float)(A[ai+0]*B[bi+3]);
-                result4+=(float)(A[ai+0]*B[bi+4]);
-                result5+=(float)(A[ai+0]*B[bi+5]);
-                result6+=(float)(A[ai+0]*B[bi+6]);
-                result7+=(float)(A[ai+0]*B[bi+7]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result3+=(float)(A[ai+0])*(float)(B[bi+3]);
+                result4+=(float)(A[ai+0])*(float)(B[bi+4]);
+                result5+=(float)(A[ai+0])*(float)(B[bi+5]);
+                result6+=(float)(A[ai+0])*(float)(B[bi+6]);
+                result7+=(float)(A[ai+0])*(float)(B[bi+7]);
                ai+=1;
                bi+=8;
            }
@@ -569,14 +569,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
-                result2+=(float)(A[ai+0]*B[bi+1]);
-                result3+=(float)(A[ai+1]*B[bi+1]);
-                result4+=(float)(A[ai+0]*B[bi+2]);
-                result5+=(float)(A[ai+1]*B[bi+2]);
-                result6+=(float)(A[ai+0]*B[bi+3]);
-                result7+=(float)(A[ai+1]*B[bi+3]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result3+=(float)(A[ai+1])*(float)(B[bi+1]);
+                result4+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result5+=(float)(A[ai+1])*(float)(B[bi+2]);
+                result6+=(float)(A[ai+0])*(float)(B[bi+3]);
+                result7+=(float)(A[ai+1])*(float)(B[bi+3]);
                ai+=2;
                bi+=4;
            }
@@ -607,10 +607,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+0]*B[bi+1]);
-                result2+=(float)(A[ai+0]*B[bi+2]);
-                result3+=(float)(A[ai+0]*B[bi+3]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result3+=(float)(A[ai+0])*(float)(B[bi+3]);
                ai+=1;
                bi+=4;
            }
@@ -770,10 +770,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
-                result2+=(float)(A[ai+0]*B[bi+1]);
-                result3+=(float)(A[ai+1]*B[bi+1]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result3+=(float)(A[ai+1])*(float)(B[bi+1]);
                ai+=2;
                bi+=2;
            }
@@ -797,8 +797,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+0]*B[bi+1]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+0])*(float)(B[bi+1]);
                ai+=1;
                bi+=2;
            }
@@ -930,8 +930,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
                ai+=2;
                bi+=1;
            }
@@ -953,7 +953,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
                ai+=1;
                bi+=1;
            }
@@ -966,4 +966,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        n_top += 1;
    }
    return 0;
-}
+}
--- a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c
+++ b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c
@@ -115,17 +115,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            _Float16 B7 = B[bi+7];
            bi += 8;

-            vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+            vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
            ai += 4;

-            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
-            vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
-            vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
-            vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
-            vfloat32m2_t result4 = __riscv_vfwmul_vf_f32m2( A0, B4, gvl);
-            vfloat32m2_t result5 = __riscv_vfwmul_vf_f32m2( A0, B5, gvl);
-            vfloat32m2_t result6 = __riscv_vfwmul_vf_f32m2( A0, B6, gvl);
-            vfloat32m2_t result7 = __riscv_vfwmul_vf_f32m2( A0, B7, gvl);
+            vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfwmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfwmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfwmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfwmul_vf_f32m1( A0, B7, gvl);

            for(BLASLONG k=1; k < K; ++k) {
                B0 = B[bi+0];
@@ -138,55 +138,55 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                B7 = B[bi+7];
                bi += 8;

-                A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+                A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
                ai += 4;

-                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
-                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
-                result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
-                result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
-                result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
-                result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
-                result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
-                result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
+                result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
+                result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
+                result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
+                result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
+                result4 = __riscv_vfwmacc_vf_f32m1(result4, B4, A0, gvl);
+                result5 = __riscv_vfwmacc_vf_f32m1(result5, B5, A0, gvl);
+                result6 = __riscv_vfwmacc_vf_f32m1(result6, B6, A0, gvl);
+                result7 = __riscv_vfwmacc_vf_f32m1(result7, B7, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

-            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
-            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
-            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
-            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
-            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
-            c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
-            c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
-            c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
-            c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
+            c4 = __riscv_vfmacc_vf_f32m1(c4, alpha, result4, gvl);
+            c5 = __riscv_vfmacc_vf_f32m1(c5, alpha, result5, gvl);
+            c6 = __riscv_vfmacc_vf_f32m1(c6, alpha, result6, gvl);
+            c7 = __riscv_vfmacc_vf_f32m1(c7, alpha, result7, gvl);

            ci= n_top * ldc + m_top;

-            __riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c1, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c2, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c3, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c4, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c5, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c6, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c7, gvl); 
+            __riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c3, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c4, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c5, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c6, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c7, gvl); 
            
            m_top += 4;
        }
@@ -215,22 +215,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            float result15 = 0;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
-                result2+=(float)(A[ai+0]*B[bi+1]);
-                result3+=(float)(A[ai+1]*B[bi+1]);
-                result4+=(float)(A[ai+0]*B[bi+2]);
-                result5+=(float)(A[ai+1]*B[bi+2]);
-                result6+=(float)(A[ai+0]*B[bi+3]);
-                result7+=(float)(A[ai+1]*B[bi+3]);
-                result8+=(float)(A[ai+0]*B[bi+4]);
-                result9+=(float)(A[ai+1]*B[bi+4]);
-                result10+=(float)(A[ai+0]*B[bi+5]);
-                result11+=(float)(A[ai+1]*B[bi+5]);
-                result12+=(float)(A[ai+0]*B[bi+6]);
-                result13+=(float)(A[ai+1]*B[bi+6]);
-                result14+=(float)(A[ai+0]*B[bi+7]);
-                result15+=(float)(A[ai+1]*B[bi+7]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result3+=(float)(A[ai+1])*(float)(B[bi+1]);
+                result4+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result5+=(float)(A[ai+1])*(float)(B[bi+2]);
+                result6+=(float)(A[ai+0])*(float)(B[bi+3]);
+                result7+=(float)(A[ai+1])*(float)(B[bi+3]);
+                result8+=(float)(A[ai+0])*(float)(B[bi+4]);
+                result9+=(float)(A[ai+1])*(float)(B[bi+4]);
+                result10+=(float)(A[ai+0])*(float)(B[bi+5]);
+                result11+=(float)(A[ai+1])*(float)(B[bi+5]);
+                result12+=(float)(A[ai+0])*(float)(B[bi+6]);
+                result13+=(float)(A[ai+1])*(float)(B[bi+6]);
+                result14+=(float)(A[ai+0])*(float)(B[bi+7]);
+                result15+=(float)(A[ai+1])*(float)(B[bi+7]);
                ai+=2;
                bi+=8;
            }
@@ -273,14 +273,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+0]*B[bi+1]);
-                result2+=(float)(A[ai+0]*B[bi+2]);
-                result3+=(float)(A[ai+0]*B[bi+3]);
-                result4+=(float)(A[ai+0]*B[bi+4]);
-                result5+=(float)(A[ai+0]*B[bi+5]);
-                result6+=(float)(A[ai+0]*B[bi+6]);
-                result7+=(float)(A[ai+0]*B[bi+7]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result3+=(float)(A[ai+0])*(float)(B[bi+3]);
+                result4+=(float)(A[ai+0])*(float)(B[bi+4]);
+                result5+=(float)(A[ai+0])*(float)(B[bi+5]);
+                result6+=(float)(A[ai+0])*(float)(B[bi+6]);
+                result7+=(float)(A[ai+0])*(float)(B[bi+7]);
                ai+=1;
                bi+=8;
            }
@@ -372,13 +372,13 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            _Float16 B3 = B[bi+3];
            bi += 4;

-            vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+            vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
            ai += 4;

-            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
-            vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
-            vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
-            vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
+            vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);

            for(BLASLONG k=1; k < K; ++k) {
                B0 = B[bi+0];
@@ -387,35 +387,35 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                B3 = B[bi+3];
                bi += 4;

-                A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+                A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
                ai += 4;

-                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
-                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
-                result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
-                result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
+                result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
+                result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
+                result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
+                result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

-            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
-            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
-            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
-            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
-            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);

            ci= n_top * ldc + m_top;

-            __riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c1, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c2, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c3, gvl); 
+            __riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c3, gvl); 
            
            m_top += 4;
        }
@@ -436,14 +436,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            float result7 = 0;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
-                result2+=(float)(A[ai+0]*B[bi+1]);
-                result3+=(float)(A[ai+1]*B[bi+1]);
-                result4+=(float)(A[ai+0]*B[bi+2]);
-                result5+=(float)(A[ai+1]*B[bi+2]);
-                result6+=(float)(A[ai+0]*B[bi+3]);
-                result7+=(float)(A[ai+1]*B[bi+3]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result3+=(float)(A[ai+1])*(float)(B[bi+1]);
+                result4+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result5+=(float)(A[ai+1])*(float)(B[bi+2]);
+                result6+=(float)(A[ai+0])*(float)(B[bi+3]);
+                result7+=(float)(A[ai+1])*(float)(B[bi+3]);
                ai+=2;
                bi+=4;
            }
@@ -474,10 +474,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+0]*B[bi+1]);
-                result2+=(float)(A[ai+0]*B[bi+2]);
-                result3+=(float)(A[ai+0]*B[bi+3]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+2]);
+                result3+=(float)(A[ai+0])*(float)(B[bi+3]);
                ai+=1;
                bi+=4;
            }
@@ -551,36 +551,36 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            _Float16 B1 = B[bi+1];
            bi += 2;

-            vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+            vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
            ai += 4;

-            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
-            vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
+            vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);

            for(BLASLONG k=1; k < K; ++k) {
                B0 = B[bi+0];
                B1 = B[bi+1];
                bi += 2;

-                A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+                A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
                ai += 4;

-                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
-                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
+                result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
+                result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

-            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
            ci += ldc - gvl * 0;
-            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
-            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
-            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);

            ci= n_top * ldc + m_top;

-            __riscv_vse32_v_f32m2(&C[ci], c0, gvl); ci += ldc - gvl * 0;
-            __riscv_vse32_v_f32m2(&C[ci], c1, gvl); 
+            __riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m1(&C[ci], c1, gvl); 
            
            m_top += 4;
        }
@@ -597,10 +597,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            float result3 = 0;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
-                result2+=(float)(A[ai+0]*B[bi+1]);
-                result3+=(float)(A[ai+1]*B[bi+1]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
+                result2+=(float)(A[ai+0])*(float)(B[bi+1]);
+                result3+=(float)(A[ai+1])*(float)(B[bi+1]);
                ai+=2;
                bi+=2;
            }
@@ -624,8 +624,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+0]*B[bi+1]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+0])*(float)(B[bi+1]);
                ai+=1;
                bi+=2;
            }
@@ -689,29 +689,29 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            _Float16 B0 = B[bi+0];
            bi += 1;

-            vfloat16m1_t A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+            vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
            ai += 4;

-            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
+            vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);

            for(BLASLONG k=1; k < K; ++k) {
                B0 = B[bi+0];
                bi += 1;

-                A0 = __riscv_vle16_v_f16m1(&A[ai + 0 * gvl], gvl);
+                A0 = __riscv_vle16_v_f16mf2(&A[ai + 0 * gvl], gvl);
                ai += 4;

-                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
+                result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

-            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
-            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);

            ci= n_top * ldc + m_top;

-            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);            
+            __riscv_vse32_v_f32m1(&C[ci], c0, gvl);
            m_top += 4;
        }

@@ -725,8 +725,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            float result1 = 0;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
-                result1+=(float)(A[ai+1]*B[bi+0]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
+                result1+=(float)(A[ai+1])*(float)(B[bi+0]);
                ai+=2;
                bi+=1;
            }
@@ -748,7 +748,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
-                result0+=(float)(A[ai+0]*B[bi+0]);
+                result0+=(float)(A[ai+0])*(float)(B[bi+0]);
                ai+=1;
                bi+=1;
            }
@@ -764,4 +764,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,

    return 0;

-}
+}