mirror of
https://github.com/OpenMathLib/OpenBLAS
synced 2026-06-12 00:41:42 +08:00
Fixed the issue of mixing AVX and legacy SSE instructions in the S/D/C/ZGEMM kernels.
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -148,74 +148,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#undef MOVQ
|
||||
#define MOVQ movq
|
||||
|
||||
#define XOR_SY vxorps
|
||||
#define XOR_DY vxorpd
|
||||
#define XOR_SX xorps
|
||||
#define XOR_DX xorpd
|
||||
#define XOR_DX vxorpd
|
||||
|
||||
#define LD_SY vmovaps
|
||||
#define LD_DY vmovapd
|
||||
#define LD_SX movaps
|
||||
#define LD_DX movapd
|
||||
#define LD_DX vmovapd
|
||||
#define LDL_DY vmovlpd
|
||||
#define LDL_DX movlpd
|
||||
#define LDL_DX vmovlpd
|
||||
#define LDH_DY vmovhpd
|
||||
#define LDH_DX movhpd
|
||||
#define LDH_DX vmovhpd
|
||||
|
||||
#define ST_SY vmovaps
|
||||
#define ST_DY vmovapd
|
||||
#define ST_SX movaps
|
||||
#define ST_DX movapd
|
||||
#define ST_DX vmovapd
|
||||
#define STL_DY vmovlpd
|
||||
#define STL_DX movlpd
|
||||
#define STL_DX vmovlpd
|
||||
#define STH_DY vmovhpd
|
||||
#define STH_DX movhpd
|
||||
#define STH_DX vmovhpd
|
||||
|
||||
#define EDUP_SY vmovsldup
|
||||
#define ODUP_SY vmovshdup
|
||||
#define EDUP_SX movsldup
|
||||
#define ODUP_SX movshdup
|
||||
#define EDUP_DY vmovddup
|
||||
|
||||
#define ADD_SY vaddps
|
||||
#define ADD_DY vaddpd
|
||||
#define ADD_SX addps
|
||||
#define ADD_DX addpd
|
||||
#define ADD_DX vaddpd
|
||||
#define SUB_DY vsubpd
|
||||
#define SUB_DX subpd
|
||||
#define SUB_DX vsubpd
|
||||
|
||||
#define ADDSUB_DY vaddsubpd
|
||||
#define ADDSUB_DX addsubpd
|
||||
#define ADDSUB_SY vaddsubps
|
||||
#define ADDSUB_DX vaddsubpd
|
||||
|
||||
#define MUL_SY vmulps
|
||||
#define MUL_DY vmulpd
|
||||
#define MUL_SX mulps
|
||||
#define MUL_DX mulpd
|
||||
#define MUL_DX vmulpd
|
||||
|
||||
#define SHUF_SY vperm2f128
|
||||
#define SHUF_DY vperm2f128
|
||||
#define SHUF_DX pshufd
|
||||
#define SHUF_SX pshufd
|
||||
#define SHUF_DX vpshufd
|
||||
|
||||
#define VPERMILP_SY vpermilps
|
||||
#define VPERMILP_SX vpermilps
|
||||
#define VPERMILP_DY vpermilpd
|
||||
|
||||
#define BROAD_SY vbroadcastss
|
||||
#define BROAD_DY vbroadcastsd
|
||||
#define BROAD_SX vbroadcastss
|
||||
#define BROAD_DX movddup
|
||||
#define BROAD_DX vmovddup
|
||||
|
||||
#define MOV_SY vmovaps
|
||||
#define MOV_DY vmovapd
|
||||
#define MOV_SX movaps
|
||||
#define MOV_DX movapd
|
||||
#define MOV_DX vmovapd
|
||||
|
||||
#define REVS_SY vshufps
|
||||
#define REVS_DY vshufpd
|
||||
#define REVS_SX shufps
|
||||
#define REVS_DX movsd
|
||||
#define REVS_DX vmovsd
|
||||
|
||||
#define EXTRA_DY vextractf128
|
||||
|
||||
@@ -282,6 +257,8 @@ movq old_offset, %r11;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
vzeroupper
|
||||
|
||||
vmovlps %xmm0, MEMALPHA_R
|
||||
vmovlps %xmm1, MEMALPHA_I
|
||||
movq old_bm, bm
|
||||
@@ -1373,14 +1350,14 @@ EXTRA_DY $1, yvec14, xvec6;
|
||||
EXTRA_DY $1, yvec13, xvec5;
|
||||
EXTRA_DY $1, yvec12, xvec4;
|
||||
#ifndef TRMMKERNEL
|
||||
ADD_DX 0*SIZE(C0), xvec15;
|
||||
ADD_DX 2*SIZE(C0, ldc, 1), xvec7;
|
||||
ADD_DX 0*SIZE(C0, ldc, 1), xvec13;
|
||||
ADD_DX 2*SIZE(C0), xvec5;
|
||||
ADD_DX 0*SIZE(C1), xvec14;
|
||||
ADD_DX 2*SIZE(C1, ldc, 1), xvec6;
|
||||
ADD_DX 0*SIZE(C1, ldc, 1), xvec12;
|
||||
ADD_DX 2*SIZE(C1), xvec4;
|
||||
ADD_DX 0*SIZE(C0), xvec15, xvec15;
|
||||
ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7;
|
||||
ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13;
|
||||
ADD_DX 2*SIZE(C0), xvec5, xvec5;
|
||||
ADD_DX 0*SIZE(C1), xvec14, xvec14;
|
||||
ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6;
|
||||
ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12;
|
||||
ADD_DX 2*SIZE(C1), xvec4, xvec4;
|
||||
#endif
|
||||
ST_DX xvec15, 0*SIZE(C0);
|
||||
ST_DX xvec7, 2*SIZE(C0, ldc, 1);
|
||||
@@ -1410,18 +1387,18 @@ EXTRA_DY $1, yvec14, xvec6;
|
||||
EXTRA_DY $1, yvec13, xvec5;
|
||||
/* Extract the UPPER 128-bit lane of yvec12 into xvec4, matching the
 * neighbouring extracts and the aligned write-back path, which all use $1.
 * EXTRA_DY expands to vextractf128, which decodes only bit 0 of its
 * immediate, so the previous $2 selected the LOWER lane instead. */
EXTRA_DY $1, yvec12, xvec4;
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
LDL_DX 2*SIZE(C0, ldc, 1), xvec1;
|
||||
LDH_DX 3*SIZE(C0, ldc, 1), xvec1;
|
||||
LDL_DX 0*SIZE(C0, ldc, 1), xvec2;
|
||||
LDH_DX 1*SIZE(C0, ldc, 1), xvec2;
|
||||
LDL_DX 2*SIZE(C0), xvec3;
|
||||
LDH_DX 3*SIZE(C0), xvec3;
|
||||
ADD_DX xvec0, xvec15;
|
||||
ADD_DX xvec1, xvec7;
|
||||
ADD_DX xvec2, xvec13;
|
||||
ADD_DX xvec3, xvec5;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1;
|
||||
LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1;
|
||||
LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2;
|
||||
LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2;
|
||||
LDL_DX 2*SIZE(C0), xvec3, xvec3;
|
||||
LDH_DX 3*SIZE(C0), xvec3, xvec3;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
ADD_DX xvec1, xvec7, xvec7;
|
||||
ADD_DX xvec2, xvec13, xvec13;
|
||||
ADD_DX xvec3, xvec5, xvec5;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -1432,18 +1409,18 @@ STH_DX xvec13, 1*SIZE(C0, ldc, 1);
|
||||
STL_DX xvec6, 2*SIZE(C0);
|
||||
STH_DX xvec6, 3*SIZE(C0);
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C1), xvec0;
|
||||
LDH_DX 1*SIZE(C1), xvec0;
|
||||
LDL_DX 2*SIZE(C1, ldc, 1), xvec1;
|
||||
LDH_DX 3*SIZE(C1, ldc, 1), xvec1;
|
||||
LDL_DX 0*SIZE(C1, ldc, 1), xvec2;
|
||||
LDH_DX 1*SIZE(C1, ldc, 1), xvec2;
|
||||
LDL_DX 2*SIZE(C1), xvec3;
|
||||
LDH_DX 3*SIZE(C1), xvec3;
|
||||
ADD_DX xvec0, xvec14;
|
||||
ADD_DX xvec1, xvec6;
|
||||
ADD_DX xvec2, xvec12;
|
||||
ADD_DX xvec3, xvec4;
|
||||
LDL_DX 0*SIZE(C1), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C1), xvec0, xvec0;
|
||||
LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1;
|
||||
LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1;
|
||||
LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2;
|
||||
LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2;
|
||||
LDL_DX 2*SIZE(C1), xvec3, xvec3;
|
||||
LDH_DX 3*SIZE(C1), xvec3, xvec3;
|
||||
ADD_DX xvec0, xvec14, xvec14;
|
||||
ADD_DX xvec1, xvec6, xvec6;
|
||||
ADD_DX xvec2, xvec12, xvec12;
|
||||
ADD_DX xvec3, xvec4, xvec4;
|
||||
#endif
|
||||
STL_DX xvec14, 0*SIZE(C1);
|
||||
STH_DX xvec14, 1*SIZE(C1);
|
||||
@@ -1680,18 +1657,18 @@ ADD2_DY yvec4, yvec14, yvec14;
|
||||
EXTRA_DY $1, yvec15, xvec7;
|
||||
EXTRA_DY $1, yvec14, xvec6;
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
LDL_DX 0*SIZE(C0, ldc, 1), xvec1;
|
||||
LDH_DX 1*SIZE(C0, ldc, 1), xvec1;
|
||||
LDL_DX 0*SIZE(C1), xvec2;
|
||||
LDH_DX 1*SIZE(C1), xvec2;
|
||||
LDL_DX 0*SIZE(C1, ldc, 1), xvec3;
|
||||
LDH_DX 1*SIZE(C1, ldc, 1), xvec3;
|
||||
ADD_DX xvec0, xvec15;
|
||||
ADD_DX xvec1, xvec7;
|
||||
ADD_DX xvec2, xvec14;
|
||||
ADD_DX xvec3, xvec6;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1;
|
||||
LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1;
|
||||
LDL_DX 0*SIZE(C1), xvec2, xvec2;
|
||||
LDH_DX 1*SIZE(C1), xvec2, xvec2;
|
||||
LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3;
|
||||
LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
ADD_DX xvec1, xvec7, xvec7;
|
||||
ADD_DX xvec2, xvec14, xvec14;
|
||||
ADD_DX xvec3, xvec6, xvec6;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -2063,14 +2040,14 @@ JNE .L213_loopEx;
|
||||
ALIGN_5
|
||||
#### Writing back ####
|
||||
#ifndef TRMMKERNEL
|
||||
ADD_DX 0*SIZE(C0),xvec15;
|
||||
ADD_DX 2*SIZE(C1),xvec7;
|
||||
ADD_DX 4*SIZE(C0),xvec14;
|
||||
ADD_DX 6*SIZE(C1),xvec6;
|
||||
ADD_DX 0*SIZE(C1),xvec13;
|
||||
ADD_DX 2*SIZE(C0),xvec5;
|
||||
ADD_DX 4*SIZE(C1),xvec12;
|
||||
ADD_DX 6*SIZE(C0),xvec4;
|
||||
ADD_DX 0*SIZE(C0), xvec15, xvec15;
|
||||
ADD_DX 2*SIZE(C1), xvec7, xvec7;
|
||||
ADD_DX 4*SIZE(C0), xvec14, xvec14;
|
||||
ADD_DX 6*SIZE(C1), xvec6, xvec6;
|
||||
ADD_DX 0*SIZE(C1), xvec13, xvec13;
|
||||
ADD_DX 2*SIZE(C0), xvec5, xvec5;
|
||||
ADD_DX 4*SIZE(C1), xvec12, xvec12;
|
||||
ADD_DX 6*SIZE(C0), xvec4, xvec4;
|
||||
#endif
|
||||
ST_DX xvec15,0*SIZE(C0);
|
||||
ST_DX xvec7,2*SIZE(C1);
|
||||
@@ -2098,18 +2075,18 @@ JMP .L21_loopE;
|
||||
ALIGN_5
|
||||
.L213_loopEx:
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
LDL_DX 2*SIZE(C1), xvec1;
|
||||
LDH_DX 3*SIZE(C1), xvec1;
|
||||
LDL_DX 4*SIZE(C0), xvec2;
|
||||
LDH_DX 5*SIZE(C0), xvec2;
|
||||
LDL_DX 6*SIZE(C1), xvec3;
|
||||
LDH_DX 7*SIZE(C1), xvec3;
|
||||
ADD_DX xvec0, xvec15;
|
||||
ADD_DX xvec1, xvec7;
|
||||
ADD_DX xvec2, xvec14;
|
||||
ADD_DX xvec3, xvec6;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
LDL_DX 2*SIZE(C1), xvec1, xvec1;
|
||||
LDH_DX 3*SIZE(C1), xvec1, xvec1;
|
||||
LDL_DX 4*SIZE(C0), xvec2, xvec2;
|
||||
LDH_DX 5*SIZE(C0), xvec2, xvec2;
|
||||
LDL_DX 6*SIZE(C1), xvec3, xvec3;
|
||||
LDH_DX 7*SIZE(C1), xvec3, xvec3;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
ADD_DX xvec1, xvec7, xvec7;
|
||||
ADD_DX xvec2, xvec14, xvec14;
|
||||
ADD_DX xvec3, xvec6, xvec6;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -2120,18 +2097,18 @@ STH_DX xvec14, 5*SIZE(C0);
|
||||
STL_DX xvec6, 6*SIZE(C1);
|
||||
STH_DX xvec6, 7*SIZE(C1);
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C1), xvec3;
|
||||
LDH_DX 1*SIZE(C1), xvec3;
|
||||
LDL_DX 2*SIZE(C0), xvec2;
|
||||
LDH_DX 3*SIZE(C0), xvec2;
|
||||
LDL_DX 4*SIZE(C1), xvec1;
|
||||
LDH_DX 5*SIZE(C1), xvec1;
|
||||
LDL_DX 6*SIZE(C0), xvec0;
|
||||
LDH_DX 7*SIZE(C0), xvec0;
|
||||
ADD_DX xvec3, xvec13;
|
||||
ADD_DX xvec2, xvec5;
|
||||
ADD_DX xvec1, xvec12;
|
||||
ADD_DX xvec0, xvec4;
|
||||
LDL_DX 0*SIZE(C1), xvec3, xvec3;
|
||||
LDH_DX 1*SIZE(C1), xvec3, xvec3;
|
||||
LDL_DX 2*SIZE(C0), xvec2, xvec2;
|
||||
LDH_DX 3*SIZE(C0), xvec2, xvec2;
|
||||
LDL_DX 4*SIZE(C1), xvec1, xvec1;
|
||||
LDH_DX 5*SIZE(C1), xvec1, xvec1;
|
||||
LDL_DX 6*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 7*SIZE(C0), xvec0, xvec0;
|
||||
ADD_DX xvec3, xvec13, xvec13;
|
||||
ADD_DX xvec2, xvec5, xvec5;
|
||||
ADD_DX xvec1, xvec12, xvec12;
|
||||
ADD_DX xvec0, xvec4, xvec4;
|
||||
#endif
|
||||
STL_DX xvec13, 0*SIZE(C1);
|
||||
STH_DX xvec13, 1*SIZE(C1);
|
||||
@@ -2384,18 +2361,18 @@ EXTRA_DY $1, yvec15, xvec7;
|
||||
EXTRA_DY $1, yvec13, xvec5;
|
||||
#### Write back ####
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
LDL_DX 2*SIZE(C1), xvec1;
|
||||
LDH_DX 3*SIZE(C1), xvec1;
|
||||
LDL_DX 0*SIZE(C1), xvec2;
|
||||
LDH_DX 1*SIZE(C1), xvec2;
|
||||
LDL_DX 2*SIZE(C0), xvec3;
|
||||
LDH_DX 3*SIZE(C0), xvec3;
|
||||
ADD_DX xvec0, xvec15;
|
||||
ADD_DX xvec1, xvec7;
|
||||
ADD_DX xvec2, xvec13;
|
||||
ADD_DX xvec3, xvec5;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
LDL_DX 2*SIZE(C1), xvec1, xvec1;
|
||||
LDH_DX 3*SIZE(C1), xvec1, xvec1;
|
||||
LDL_DX 0*SIZE(C1), xvec2, xvec2;
|
||||
LDH_DX 1*SIZE(C1), xvec2, xvec2;
|
||||
LDL_DX 2*SIZE(C0), xvec3, xvec3;
|
||||
LDH_DX 3*SIZE(C0), xvec3, xvec3;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
ADD_DX xvec1, xvec7, xvec7;
|
||||
ADD_DX xvec2, xvec13, xvec13;
|
||||
ADD_DX xvec3, xvec5, xvec5;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -2582,12 +2559,12 @@ ADD2_DY yvec5, yvec15, yvec15;
|
||||
EXTRA_DY $1, yvec15, xvec7;
|
||||
#### Writing Back ####
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
LDL_DX 0*SIZE(C1), xvec1;
|
||||
LDH_DX 1*SIZE(C1), xvec1;
|
||||
ADD_DX xvec0, xvec15;
|
||||
ADD_DX xvec1, xvec7;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
LDL_DX 0*SIZE(C1), xvec1, xvec1;
|
||||
LDH_DX 1*SIZE(C1), xvec1, xvec1;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
ADD_DX xvec1, xvec7, xvec7;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -2845,18 +2822,18 @@ EXTRA_DY $1, yvec15, xvec7;
|
||||
EXTRA_DY $1, yvec14, xvec6;
|
||||
#### Writing Back ####
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
LDL_DX 2*SIZE(C0), xvec1;
|
||||
LDH_DX 3*SIZE(C0), xvec1;
|
||||
LDL_DX 4*SIZE(C0), xvec2;
|
||||
LDH_DX 5*SIZE(C0), xvec2;
|
||||
LDL_DX 6*SIZE(C0), xvec3;
|
||||
LDH_DX 7*SIZE(C0), xvec3;
|
||||
ADD_DX xvec0, xvec15;
|
||||
ADD_DX xvec1, xvec7;
|
||||
ADD_DX xvec2, xvec14;
|
||||
ADD_DX xvec3, xvec6;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
LDL_DX 2*SIZE(C0), xvec1, xvec1;
|
||||
LDH_DX 3*SIZE(C0), xvec1, xvec1;
|
||||
LDL_DX 4*SIZE(C0), xvec2, xvec2;
|
||||
LDH_DX 5*SIZE(C0), xvec2, xvec2;
|
||||
LDL_DX 6*SIZE(C0), xvec3, xvec3;
|
||||
LDH_DX 7*SIZE(C0), xvec3, xvec3;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
ADD_DX xvec1, xvec7, xvec7;
|
||||
ADD_DX xvec2, xvec14, xvec14;
|
||||
ADD_DX xvec3, xvec6, xvec6;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -3026,12 +3003,12 @@ ADD2_DY yvec5, yvec15, yvec15;
|
||||
EXTRA_DY $1, yvec15, xvec7;
|
||||
#### Writing Back ####
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
LDL_DX 2*SIZE(C0), xvec1;
|
||||
LDH_DX 3*SIZE(C0), xvec1;
|
||||
ADD_DX xvec0, xvec15;
|
||||
ADD_DX xvec1, xvec7;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
LDL_DX 2*SIZE(C0), xvec1, xvec1;
|
||||
LDH_DX 3*SIZE(C0), xvec1, xvec1;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
ADD_DX xvec1, xvec7, xvec7;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -3084,43 +3061,43 @@ ALIGN_5
|
||||
.L331_bodyB:
|
||||
LD_DX 0*SIZE(ptrba), xvec0;
|
||||
BROAD_DX 0*SIZE(ptrbb), xvec2;
|
||||
MUL_DX xvec0, xvec2;
|
||||
ADD1_DX xvec2, xvec15;
|
||||
MUL_DX xvec0, xvec2, xvec2;
|
||||
ADD1_DX xvec2, xvec15, xvec15;
|
||||
|
||||
SHUF_DX $0x4e, xvec0, xvec1;
|
||||
BROAD_DX 1*SIZE(ptrbb), xvec3;
|
||||
MUL_DX xvec1, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15;
|
||||
MUL_DX xvec1, xvec3, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15, xvec15;
|
||||
|
||||
LD_DX 2*SIZE(ptrba), xvec0;
|
||||
BROAD_DX 2*SIZE(ptrbb), xvec2;
|
||||
MUL_DX xvec0, xvec2;
|
||||
ADD1_DX xvec2, xvec15;
|
||||
MUL_DX xvec0, xvec2, xvec2;
|
||||
ADD1_DX xvec2, xvec15, xvec15;
|
||||
|
||||
SHUF_DX $0x4e, xvec0, xvec1;
|
||||
BROAD_DX 3*SIZE(ptrbb), xvec3;
|
||||
MUL_DX xvec1, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15;
|
||||
MUL_DX xvec1, xvec3, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15, xvec15;
|
||||
|
||||
LD_DX 4*SIZE(ptrba), xvec0;
|
||||
BROAD_DX 4*SIZE(ptrbb), xvec2;
|
||||
MUL_DX xvec0, xvec2;
|
||||
ADD1_DX xvec2, xvec15;
|
||||
MUL_DX xvec0, xvec2, xvec2;
|
||||
ADD1_DX xvec2, xvec15, xvec15;
|
||||
|
||||
SHUF_DX $0x4e, xvec0, xvec1;
|
||||
BROAD_DX 5*SIZE(ptrbb), xvec3;
|
||||
MUL_DX xvec1, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15;
|
||||
MUL_DX xvec1, xvec3, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15, xvec15;
|
||||
|
||||
LD_DX 6*SIZE(ptrba), xvec0;
|
||||
BROAD_DX 6*SIZE(ptrbb), xvec2;
|
||||
MUL_DX xvec0, xvec2;
|
||||
ADD1_DX xvec2, xvec15;
|
||||
MUL_DX xvec0, xvec2, xvec2;
|
||||
ADD1_DX xvec2, xvec15, xvec15;
|
||||
|
||||
SHUF_DX $0x4e, xvec0, xvec1;
|
||||
BROAD_DX 7*SIZE(ptrbb), xvec3;
|
||||
MUL_DX xvec1, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15;
|
||||
MUL_DX xvec1, xvec3, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15, xvec15;
|
||||
ADDQ $8*SIZE, ptrba;
|
||||
ADDQ $8*SIZE, ptrbb;
|
||||
DECQ k;
|
||||
@@ -3137,23 +3114,23 @@ ALIGN_5
|
||||
.L332_bodyB:
|
||||
LD_DX 0*SIZE(ptrba), xvec0;
|
||||
BROAD_DX 0*SIZE(ptrbb), xvec2;
|
||||
MUL_DX xvec0, xvec2;
|
||||
ADD1_DX xvec2, xvec15;
|
||||
MUL_DX xvec0, xvec2, xvec2;
|
||||
ADD1_DX xvec2, xvec15, xvec15;
|
||||
|
||||
SHUF_DX $0x4e, xvec0, xvec1;
|
||||
BROAD_DX 1*SIZE(ptrbb), xvec3;
|
||||
MUL_DX xvec1, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15;
|
||||
MUL_DX xvec1, xvec3, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15, xvec15;
|
||||
|
||||
LD_DX 2*SIZE(ptrba), xvec0;
|
||||
BROAD_DX 2*SIZE(ptrbb), xvec2;
|
||||
MUL_DX xvec0, xvec2;
|
||||
ADD1_DX xvec2, xvec15;
|
||||
MUL_DX xvec0, xvec2, xvec2;
|
||||
ADD1_DX xvec2, xvec15, xvec15;
|
||||
|
||||
SHUF_DX $0x4e, xvec0, xvec1;
|
||||
BROAD_DX 3*SIZE(ptrbb), xvec3;
|
||||
MUL_DX xvec1, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15;
|
||||
MUL_DX xvec1, xvec3, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15, xvec15;
|
||||
ADDQ $4*SIZE, ptrba;
|
||||
ADDQ $4*SIZE, ptrbb;
|
||||
|
||||
@@ -3168,13 +3145,13 @@ ALIGN_5
|
||||
.L333_bodyB:
|
||||
LD_DX 0*SIZE(ptrba), xvec0;
|
||||
BROAD_DX 0*SIZE(ptrbb), xvec2;
|
||||
MUL_DX xvec0, xvec2;
|
||||
ADD1_DX xvec2, xvec15;
|
||||
MUL_DX xvec0, xvec2, xvec2;
|
||||
ADD1_DX xvec2, xvec15, xvec15;
|
||||
|
||||
SHUF_DX $0x4e, xvec0, xvec1;
|
||||
BROAD_DX 1*SIZE(ptrbb), xvec3;
|
||||
MUL_DX xvec1, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15;
|
||||
MUL_DX xvec1, xvec3, xvec3;
|
||||
ADDSUB_DX xvec3, xvec15, xvec15;
|
||||
ADDQ $2*SIZE, ptrba;
|
||||
ADDQ $2*SIZE, ptrbb;
|
||||
|
||||
@@ -3182,14 +3159,14 @@ ADDQ $2*SIZE, ptrbb;
|
||||
#### Handle ####
|
||||
XOR_DY yvec7, yvec7, yvec7;
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
ADDSUB_DX xvec15, xvec7;
|
||||
ADDSUB_DX xvec15, xvec7, xvec7;
|
||||
MOV_DX xvec7, xvec15;
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
SUB_DX xvec15, xvec7;
|
||||
SUB_DX xvec15, xvec7, xvec7;
|
||||
MOV_DX xvec7, xvec15;
|
||||
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
SHUF_DX $0x4e, xvec15, xvec15;
|
||||
ADDSUB_DX xvec15, xvec7;
|
||||
ADDSUB_DX xvec15, xvec7, xvec7;
|
||||
MOV_DX xvec7, xvec15;
|
||||
SHUF_DX $0x4e, xvec15, xvec15;
|
||||
#endif
|
||||
@@ -3199,14 +3176,14 @@ BROAD_DX MEMALPHA_R,xvec7;
|
||||
BROAD_DX MEMALPHA_I,xvec6;
|
||||
#### Multiply Alpha ####
|
||||
SHUF_DX $0x4e, xvec15, xvec5;
|
||||
MUL_DX xvec7, xvec15;
|
||||
MUL_DX xvec6, xvec5;
|
||||
ADDSUB_DX xvec5, xvec15;
|
||||
MUL_DX xvec7, xvec15, xvec15;
|
||||
MUL_DX xvec6, xvec5, xvec5;
|
||||
ADDSUB_DX xvec5, xvec15, xvec15;
|
||||
#### Writing back ####
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0;
|
||||
ADD_DX xvec0, xvec15;
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
ADD_DX xvec0, xvec15, xvec15;
|
||||
#endif
|
||||
STL_DX xvec15, 0*SIZE(C0);
|
||||
STH_DX xvec15, 1*SIZE(C0);
|
||||
@@ -3237,6 +3214,9 @@ movq 24(%rsp), %r13;
|
||||
movq 32(%rsp), %r14;
|
||||
movq 40(%rsp), %r15;
|
||||
|
||||
|
||||
vzeroupper
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
movq 48(%rsp), %rdi
|
||||
movq 56(%rsp), %rsi
|
||||
|
||||
Reference in New Issue
Block a user