diff --git a/src/amd64/Makefile.am b/src/amd64/Makefile.am index 07aae69..2870ab5 100644 --- a/src/amd64/Makefile.am +++ b/src/amd64/Makefile.am @@ -1,3 +1,5 @@ +NAFLAGS += -DASM_ARCH_AMD64 + AMD64_ASM = \ cpuid_amd64.asm \ rfxcodec_encode_dwt_shift_amd64_sse2.asm \ diff --git a/src/amd64/cpuid_amd64.asm b/src/amd64/cpuid_amd64.asm index acc738e..38e2023 100644 --- a/src/amd64/cpuid_amd64.asm +++ b/src/amd64/cpuid_amd64.asm @@ -1,7 +1,5 @@ %include "common.asm" -section .text - ;The first six integer or pointer arguments are passed in registers ;RDI, RSI, RDX, RCX, R8, and R9 @@ -32,5 +30,4 @@ PROC cpuid_amd64 ; restore registers pop rbx ret - align 16 - +END_OF_FILE diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm index 69ccf07..cef3902 100644 --- a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm +++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm @@ -21,8 +21,7 @@ %include "common.asm" -section .data - align 16 +PREPARE_RODATA cw128 times 8 dw 128 cdFFFF times 4 dd 65535 ; these are 1 << (factor - 1) 0 to 15 is factor @@ -43,8 +42,6 @@ section .data cwa8192 times 8 dw 8192 ; 14 cwa16384 times 8 dw 16384 ; 15 -section .text - ;****************************************************************************** ; source 16 bit signed, 16 pixel width rfx_dwt_2d_encode_block_horiz_16_16: @@ -55,8 +52,8 @@ loop1a: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -66,8 +63,8 @@ loop1a: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -85,8 +82,8 @@ loop1a: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -247,8 +244,8 @@ loop1c: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -258,8 +255,8 @@ loop1c: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -277,8 +274,8 @@ loop1c: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -328,8 +325,8 @@ loop1c: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -339,8 +336,8 @@ loop1c: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -358,8 +355,8 @@ loop1c: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -423,8 +420,8 @@ loop1c1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -434,8 +431,8 @@ loop1c1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -453,8 +450,8 @@ loop1c1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -501,8 +498,8 @@ loop1c1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -512,8 +509,8 @@ loop1c1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -531,8 +528,8 @@ loop1c1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -690,8 +687,8 @@ loop1e: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -701,8 +698,8 @@ loop1e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -720,8 +717,8 @@ loop1e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -774,8 +771,8 @@ loop2e: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -785,8 +782,8 @@ loop2e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -804,8 +801,8 @@ loop2e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -857,8 +854,8 @@ loop2e: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -868,8 +865,8 @@ loop2e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -887,8 +884,8 @@ loop2e: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -952,8 +949,8 @@ loop1e1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -963,8 +960,8 @@ loop1e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -982,8 +979,8 @@ loop1e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1033,8 +1030,8 @@ loop2e1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -1044,8 +1041,8 @@ loop2e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -1063,8 +1060,8 @@ loop2e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1113,8 +1110,8 @@ loop2e1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -1124,8 +1121,8 @@ loop2e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -1143,8 +1140,8 @@ loop2e1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1207,9 +1204,9 @@ loop1f: punpcklbw xmm1, xmm0 punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm1, [rel cw128] - psubw xmm2, [rel cw128] - psubw xmm3, [rel cw128] + psubw xmm1, [lsym(cw128)] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm1, 5 psllw xmm2, 5 psllw xmm3, 5 @@ -1241,8 +1238,8 @@ loop2f: movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2] punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm2, [rel cw128] - psubw xmm3, [rel cw128] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm2, 5 psllw xmm3, 5 movdqa xmm4, xmm1 @@ -1274,7 +1271,7 @@ loop2f: movdqa xmm1, xmm3 ; src[2n] movq xmm2, [rsi + 64 * 1] ; src[2n + 1] punpcklbw xmm2, xmm0 - psubw xmm2, [rel cw128] + psubw xmm2, [lsym(cw128)] psllw xmm2, 5 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1314,7 +1311,7 @@ set_quants_hi: sub rax, 6 - 5 movd xmm9, eax imul rax, 16 - lea rdx, [rel cwa0] + lea rdx, [lsym(cwa0)] add rdx, rax movdqa xmm8, [rdx] ret @@ -1323,7 +1320,7 @@ set_quants_lo: sub rax, 6 - 5 movd xmm11, eax imul rax, 16 - lea rdx, [rel cwa0] + lea rdx, [lsym(cwa0)] add rdx, rax movdqa xmm10, [rdx] ret @@ -1487,5 +1484,4 @@ PROC rfxcodec_encode_dwt_shift_amd64_sse2 pop rdx pop rbx ret - align 16 - +END_OF_FILE diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm index 2b19f81..da176e7 100644 --- a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm +++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm @@ -21,8 +21,7 @@ %include "common.asm" -section .data - align 16 +PREPARE_RODATA cw128 times 8 dw 128 cdFFFF times 4 dd 65535 ; these are 1 << (factor - 1) 0 to 15 is factor @@ -43,8 +42,6 @@ section .data cwa8192 times 8 dw 8192 ; 14 cwa16384 times 8 dw 16384 ; 15 -section .text - ;****************************************************************************** ; source 16 bit signed, 16 pixel width rfx_dwt_2d_encode_block_horiz_16_16: @@ -55,15 +52,15 @@ loop1a: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -77,8 +74,8 @@ loop1a: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -235,15 +232,15 @@ loop1c: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -257,8 +254,8 @@ loop1c: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -304,15 +301,15 @@ loop1c: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -326,8 +323,8 @@ loop1c: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -387,15 +384,15 @@ loop1c1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -409,8 +406,8 @@ loop1c1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -453,15 +450,15 @@ loop1c1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -475,8 +472,8 @@ loop1c1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -630,15 +627,15 @@ loop1e: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -652,8 +649,8 @@ loop1e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -702,15 +699,15 @@ loop2e: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -724,8 +721,8 @@ loop2e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -773,15 +770,15 @@ loop2e: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -795,8 +792,8 @@ loop2e: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -856,15 +853,15 @@ loop1e1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -878,8 +875,8 @@ loop1e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -925,15 +922,15 @@ loop2e1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -947,8 +944,8 @@ loop2e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -993,15 +990,15 @@ loop2e1: movdqa xmm2, [rsi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [rel cdFFFF] - pand xmm2, [rel cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [rel cdFFFF] - pand xmm3, [rel cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -1015,8 +1012,8 @@ loop2e1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [rel cdFFFF] - pand xmm4, [rel cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1075,9 +1072,9 @@ loop1f: punpcklbw xmm1, xmm0 punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm1, [rel cw128] - psubw xmm2, [rel cw128] - psubw xmm3, [rel cw128] + psubw xmm1, [lsym(cw128)] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm1, 5 psllw xmm2, 5 psllw xmm3, 5 @@ -1109,8 +1106,8 @@ loop2f: movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2] punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm2, [rel cw128] - psubw xmm3, [rel cw128] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm2, 5 psllw xmm3, 5 movdqa xmm4, xmm1 @@ -1142,7 +1139,7 @@ loop2f: movdqa xmm1, xmm3 ; src[2n] movq xmm2, [rsi + 64 * 1] ; src[2n + 1] punpcklbw xmm2, xmm0 - psubw xmm2, [rel cw128] + psubw xmm2, [lsym(cw128)] psllw xmm2, 5 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1182,7 +1179,7 @@ set_quants_hi: sub rax, 6 - 5 movd xmm9, eax imul rax, 16 - lea rdx, [rel cwa0] + lea rdx, [lsym(cwa0)] add rdx, rax movdqa xmm8, [rdx] ret @@ -1191,7 +1188,7 @@ set_quants_lo: sub rax, 6 - 5 movd xmm11, eax imul rax, 16 - lea rdx, [rel cwa0] + lea rdx, [lsym(cwa0)] add rdx, rax movdqa xmm10, [rdx] ret @@ -1355,5 +1352,4 @@ PROC rfxcodec_encode_dwt_shift_amd64_sse41 pop rdx pop rbx ret - align 16 - +END_OF_FILE diff --git a/src/common.asm b/src/common.asm index 7e2b84b..cf7102d 100644 --- a/src/common.asm +++ b/src/common.asm @@ -1,5 +1,6 @@ ; ;Copyright 2017 Pavel Roskin +;Copyright 2017 mirabilos ; ;Permission to use, copy, modify, distribute, and sell this software and its ;documentation for any purpose is hereby granted without fee, provided that @@ -33,6 +34,19 @@ %define is_elf 1 %endif +; Detect Mach-O formats +%ifidn __OUTPUT_FORMAT__,macho +%define is_macho 1 +%endif + +%ifidn __OUTPUT_FORMAT__,macho32 +%define is_macho 1 +%endif + +%ifidn __OUTPUT_FORMAT__,macho64 +%define is_macho 1 +%endif + ; Mark stack non-executable %ifdef is_elf section .note.GNU-stack noalloc noexec nowrite progbits @@ -49,3 +63,64 @@ section .note.GNU-stack noalloc noexec nowrite progbits _%1: %endif %endmacro + +; Macros for relative access to local data +%undef lsym + +%ifdef ASM_ARCH_AMD64 +; amd64; don't define or call RETRIEVE_RODATA +%define lsym(name) rel name +%endif + +%ifdef ASM_ARCH_I386 +%ifdef PIC +; i386 PIC + +%macro END_OF_FILE 0 +%ifdef I386_PIC_NEEDED +section .text +..@get_caller_address: + mov ebx, [esp] + ret +%endif +%ifdef is_macho +; see below + align 16 +%endif +%endmacro + +%macro RETRIEVE_RODATA 0 +%define I386_PIC_NEEDED 1 + call ..@get_caller_address +%%the_caller_address: + sub ebx, %%the_caller_address - ..@rodata_begin +%endmacro + +%define lsym(name) ebx + name - ..@rodata_begin +%else +; i386 non-PIC; default case for lsym and RETRIEVE_RODATA +%endif +%endif + +%ifndef lsym +%macro RETRIEVE_RODATA 0 +%endmacro +%define lsym(name) name +%endif + +%macro PREPARE_RODATA 0 +section .text + align 16 +..@rodata_begin: +%endmacro + +%ifnmacro END_OF_FILE 0 +%macro END_OF_FILE 0 +%ifdef is_macho +; cf. https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/simd/jccolext-mmx.asm#L474-L476 + align 16 +%endif +%endmacro +%endif + +section .text diff --git a/src/x86/Makefile.am b/src/x86/Makefile.am index 2d099e5..3c88cee 100644 --- a/src/x86/Makefile.am +++ b/src/x86/Makefile.am @@ -1,3 +1,5 @@ +NAFLAGS += -DASM_ARCH_I386 + X86_ASM = \ cpuid_x86.asm \ rfxcodec_encode_dwt_shift_x86_sse2.asm \ diff --git a/src/x86/cpuid_x86.asm b/src/x86/cpuid_x86.asm index 4ddb8a2..b666732 100644 --- a/src/x86/cpuid_x86.asm +++ b/src/x86/cpuid_x86.asm @@ -1,7 +1,5 @@ %include "common.asm" -section .text - ;int ;cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx) @@ -29,6 +27,5 @@ PROC cpuid_x86 pop edx pop ecx pop ebx - ret; - align 16 - + ret +END_OF_FILE diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm index fdfbae1..f05a705 100644 --- a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm +++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm @@ -1,5 +1,6 @@ ; ;Copyright 2016 Jay Sorg +;Copyright 2017 mirabilos ; ;Permission to use, copy, modify, distribute, and sell this software and its ;documentation for any purpose is hereby granted without fee, provided that @@ -21,8 +22,7 @@ %include "common.asm" -section .data - align 16 +PREPARE_RODATA cw128 times 8 dw 128 cdFFFF times 4 dd 65535 ; these are 1 << (factor - 1) 0 to 15 is factor @@ -43,8 +43,6 @@ section .data cwa8192 times 8 dw 8192 ; 14 cwa16384 times 8 dw 16384 ; 15 -section .text - %define LHI_ADD [esp + 1 * 16 + 4] %define LHI_SFT [esp + 2 * 16 + 4] %define LLO_ADD [esp + 3 * 16 + 4] @@ -60,8 +58,8 @@ loop1a: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -71,8 +69,8 @@ loop1a: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -90,8 +88,8 @@ loop1a: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -252,8 +250,8 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -263,8 +261,8 @@ loop1c: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -282,8 +280,8 @@ loop1c: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -316,7 +314,7 @@ loop1c: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa xmm6, xmm5 ; out lo paddw xmm6, LLO_ADD @@ -333,8 +331,8 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -344,8 +342,8 @@ loop1c: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -363,8 +361,8 @@ loop1c: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -387,7 +385,7 @@ loop1c: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -428,8 +426,8 @@ loop1c1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -439,8 +437,8 @@ loop1c1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -458,8 +456,8 @@ loop1c1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -492,7 +490,7 @@ loop1c1: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa [edx], xmm5 ; out lo @@ -506,8 +504,8 @@ loop1c1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -517,8 +515,8 @@ loop1c1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -536,8 +534,8 @@ loop1c1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -560,7 +558,7 @@ loop1c1: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -695,8 +693,8 @@ loop1e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -706,8 +704,8 @@ loop1e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -725,8 +723,8 @@ loop1e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -759,7 +757,7 @@ loop1e: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa xmm6, xmm5 ; out lo paddw xmm6, LLO_ADD @@ -779,8 +777,8 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -790,8 +788,8 @@ loop2e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -809,8 +807,8 @@ loop2e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -834,14 +832,14 @@ loop2e: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa xmm6, xmm5 ; out lo paddw xmm6, LLO_ADD @@ -862,8 +860,8 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -873,8 +871,8 @@ loop2e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -892,8 +890,8 @@ loop2e: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -916,7 +914,7 @@ loop2e: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -957,8 +955,8 @@ loop1e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -968,8 +966,8 @@ loop1e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -987,8 +985,8 @@ loop1e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1021,7 +1019,7 @@ loop1e1: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa [edx], xmm5 ; out lo @@ -1038,8 +1036,8 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -1049,8 +1047,8 @@ loop2e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -1068,8 +1066,8 @@ loop2e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1093,14 +1091,14 @@ loop2e1: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa [edx], xmm5 ; out lo @@ -1118,8 +1116,8 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -1129,8 +1127,8 @@ loop2e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -1148,8 +1146,8 @@ loop2e1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1172,7 +1170,7 @@ loop2e1: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -1212,9 +1210,9 @@ loop1f: punpcklbw xmm1, xmm0 punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm1, [cw128] - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm1, [lsym(cw128)] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm1, 5 psllw xmm2, 5 psllw xmm3, 5 @@ -1246,8 +1244,8 @@ loop2f: movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm2, 5 psllw xmm3, 5 movdqa xmm4, xmm1 @@ -1279,7 +1277,7 @@ loop2f: movdqa xmm1, xmm3 ; src[2n] movq xmm2, [esi + 64 * 1] ; src[2n + 1] punpcklbw xmm2, xmm0 - psubw xmm2, [cw128] + psubw xmm2, [lsym(cw128)] psllw xmm2, 5 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1320,7 +1318,7 @@ set_quants_hi: movd xmm1, eax movdqa LHI_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LHI_ADD, xmm1 @@ -1331,7 +1329,7 @@ set_quants_lo: movd xmm1, eax movdqa LLO_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LLO_ADD, xmm1 @@ -1363,6 +1361,7 @@ PROC rfxcodec_encode_dwt_shift_x86_sse2 movdqu [esp], xmm0 ; save registers push ebx + RETRIEVE_RODATA push esi push edi push ebp @@ -1517,5 +1516,4 @@ PROC rfxcodec_encode_dwt_shift_x86_sse2 ; return value mov eax, 0 ret - align 16 - +END_OF_FILE diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm index 501a0bc..00d4b1d 100644 --- a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm +++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm @@ -1,5 +1,6 @@ ; ;Copyright 2016 Jay Sorg +;Copyright 2017 mirabilos ; ;Permission to use, copy, modify, distribute, and sell this software and its ;documentation for any purpose is hereby granted without fee, provided that @@ -21,8 +22,7 @@ %include "common.asm" -section .data - align 16 +PREPARE_RODATA cw128 times 8 dw 128 cdFFFF times 4 dd 65535 ; these are 1 << (factor - 1) 0 to 15 is factor @@ -43,8 +43,6 @@ section .data cwa8192 times 8 dw 8192 ; 14 cwa16384 times 8 dw 16384 ; 15 -section .text - %define LHI_ADD [esp + 1 * 16 + 4] %define LHI_SFT [esp + 2 * 16 + 4] %define LLO_ADD [esp + 3 * 16 + 4] @@ -60,15 +58,15 @@ loop1a: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -82,8 +80,8 @@ loop1a: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -240,15 +238,15 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -262,8 +260,8 @@ loop1c: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -292,7 +290,7 @@ loop1c: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa xmm6, xmm5 ; out lo paddw xmm6, LLO_ADD @@ -309,15 +307,15 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -331,8 +329,8 @@ loop1c: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -351,7 +349,7 @@ loop1c: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -392,15 +390,15 @@ loop1c1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -414,8 +412,8 @@ loop1c1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -444,7 +442,7 @@ loop1c1: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa [edx], xmm5 ; out lo @@ -458,15 +456,15 @@ loop1c1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -480,8 +478,8 @@ loop1c1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -500,7 +498,7 @@ loop1c1: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -635,15 +633,15 @@ loop1e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -657,8 +655,8 @@ loop1e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -687,7 +685,7 @@ loop1e: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa xmm6, xmm5 ; out lo paddw xmm6, LLO_ADD @@ -707,15 +705,15 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -729,8 +727,8 @@ loop2e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -750,14 +748,14 @@ loop2e: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa xmm6, xmm5 ; out lo paddw xmm6, LLO_ADD @@ -778,15 +776,15 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -800,8 +798,8 @@ loop2e: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -820,7 +818,7 @@ loop2e: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -861,15 +859,15 @@ loop1e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -883,8 +881,8 @@ loop1e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -913,7 +911,7 @@ loop1e1: paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa [edx], xmm5 ; out lo @@ -930,15 +928,15 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -952,8 +950,8 @@ loop2e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -973,14 +971,14 @@ loop2e1: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 paddw xmm5, xmm1 psrldq xmm2, 14 - movd ebx, xmm2 ; save hi + movd ebp, xmm2 ; save hi movdqa [edx], xmm5 ; out lo @@ -998,15 +996,15 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -1020,8 +1018,8 @@ loop2e1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1040,7 +1038,7 @@ loop2e1: ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) movdqa xmm7, xmm5 pslldq xmm7, 2 - movd xmm6, ebx + movd xmm6, ebp por xmm7, xmm6 paddw xmm5, xmm7 psraw xmm5, 1 @@ -1080,9 +1078,9 @@ loop1f: punpcklbw xmm1, xmm0 punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm1, [cw128] - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm1, [lsym(cw128)] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm1, 5 psllw xmm2, 5 psllw xmm3, 5 @@ -1114,8 +1112,8 @@ loop2f: movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm2, 5 psllw xmm3, 5 movdqa xmm4, xmm1 @@ -1147,7 +1145,7 @@ loop2f: movdqa xmm1, xmm3 ; src[2n] movq xmm2, [esi + 64 * 1] ; src[2n + 1] punpcklbw xmm2, xmm0 - psubw xmm2, [cw128] + psubw xmm2, [lsym(cw128)] psllw xmm2, 5 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1188,7 +1186,7 @@ set_quants_hi: movd xmm1, eax movdqa LHI_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LHI_ADD, xmm1 @@ -1199,7 +1197,7 @@ set_quants_lo: movd xmm1, eax movdqa LLO_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LLO_ADD, xmm1 @@ -1231,6 +1229,7 @@ PROC rfxcodec_encode_dwt_shift_x86_sse41 movdqu [esp], xmm0 ; save registers push ebx + RETRIEVE_RODATA push esi push edi push ebp @@ -1385,5 +1384,4 @@ PROC rfxcodec_encode_dwt_shift_x86_sse41 ; return value mov eax, 0 ret - align 16 - +END_OF_FILE