/*
	synth_stereo_x86_64_float: SSE optimized synth for x86-64 (stereo specific, float output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#ifdef _WIN64
/* real *window; */
#define WINDOW %r10
/* real *b0l; */
#define B0L %rdx
/* real *b0r; */
#define B0R %r8
/* real *samples; */
#define SAMPLES %r9
#else
/* real *window; */
#define WINDOW %rdi
/* real *b0l; */
#define B0L %rsi
/* real *b0r; */
#define B0R %rdx
/* real *samples; */
#define SAMPLES %r9
#endif
#define XMMREG_SCALE (%r11) /* {1/32768.0, 1/32768.0, 1/32768.0, 1/32768.0} */

/*
	int synth_1to1_real_s_x86_64_asm(real *window, real *b0l, real *b0r, real *samples, int bo1);
	return value: number of clipped samples (0)
*/

#ifndef __APPLE__
	.section .rodata
#else
	.data
#endif
	ALIGN32
ASM_NAME(scale_x86_64):
	.long 939524096
	.long 939524096
	.long 939524096
	.long 939524096

	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_real_s_x86_64_asm)
ASM_NAME(synth_1to1_real_s_x86_64_asm):
#ifdef _WIN64 /* should save xmm6-15 */
	movl 40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
	subq $168, %rsp /* stack alignment + 10 xmm registers */
	movaps %xmm6, (%rsp)
	movaps %xmm7, 16(%rsp)
	movaps %xmm8, 32(%rsp)
	movaps %xmm9, 48(%rsp)
	movaps %xmm10, 64(%rsp)
	movaps %xmm11, 80(%rsp)
	movaps %xmm12, 96(%rsp)
	movaps %xmm13, 112(%rsp)
	movaps %xmm14, 128(%rsp)
	movaps %xmm15, 144(%rsp)
#endif
	leaq ASM_NAME(scale_x86_64)(%rip), %r11
#ifdef _WIN64
	shlq $32, %rax /* zero-extend bo1 and scale to a byte offset (bo1*4) */
	shrq $30, %rax
	movq %rcx, %r10
#else
	movq %r8, %rax
	shlq $32, %rax /* zero-extend bo1 and scale to a byte offset (bo1*4) */
	shrq $30, %rax
	movq %rcx, %r9
#endif
	leaq 64(WINDOW), WINDOW
	subq %rax, WINDOW
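
/*
	For orientation: each of the 32 stereo output frames produced below is,
	in rough scalar C terms (an illustrative sketch; suml, sumr and i are
	made-up names, and the real pointer stepping is folded into the SSE
	code),

		float suml = 0.0f, sumr = 0.0f;
		for(i = 0; i < 16; i++)
		{
			suml += window[i] * b0l[i];
			sumr += window[i] * b0r[i];
		}
		*samples++ = suml * (1.0f/32768.0f);
		*samples++ = sumr * (1.0f/32768.0f);

	The SSE code evaluates several such 16-tap dot products at once; the
	first loop folds its two register halves with a subtraction (subps),
	the second loop with an addition (addps), matching the symmetry of the
	synthesis window.
*/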
	movl $4, %ecx
	ALIGN16
1: /* first loop: 4 iterations, 16 stereo frames */
	movups (WINDOW), %xmm0
	movups 16(WINDOW), %xmm1
	movups 32(WINDOW), %xmm2
	movups 48(WINDOW), %xmm3
	add $128, WINDOW
	movaps %xmm0, %xmm4
	movaps %xmm1, %xmm5
	movaps %xmm2, %xmm6
	movaps %xmm3, %xmm7
	mulps (B0L), %xmm0
	mulps 16(B0L), %xmm1
	mulps 32(B0L), %xmm2
	mulps 48(B0L), %xmm3
	mulps (B0R), %xmm4
	mulps 16(B0R), %xmm5
	mulps 32(B0R), %xmm6
	mulps 48(B0R), %xmm7
	addps %xmm1, %xmm0
	addps %xmm3, %xmm2
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	movups (WINDOW), %xmm1
	movups 16(WINDOW), %xmm3
	movups 32(WINDOW), %xmm5
	movups 48(WINDOW), %xmm7
	add $128, WINDOW
	movaps %xmm1, %xmm8
	movaps %xmm3, %xmm9
	movaps %xmm5, %xmm10
	movaps %xmm7, %xmm11
	mulps 64(B0L), %xmm1
	mulps 80(B0L), %xmm3
	mulps 96(B0L), %xmm5
	mulps 112(B0L), %xmm7
	mulps 64(B0R), %xmm8
	mulps 80(B0R), %xmm9
	mulps 96(B0R), %xmm10
	mulps 112(B0R), %xmm11
	addps %xmm3, %xmm1
	addps %xmm7, %xmm5
	addps %xmm9, %xmm8
	addps %xmm11, %xmm10
	addps %xmm2, %xmm0
	addps %xmm6, %xmm4
	addps %xmm5, %xmm1
	addps %xmm10, %xmm8
	/* horizontal reduction of the four partial sums; result is L/R interleaved */
	movaps %xmm0, %xmm2
	movaps %xmm1, %xmm3
	unpcklps %xmm4, %xmm0
	unpckhps %xmm4, %xmm2
	unpcklps %xmm8, %xmm1
	unpckhps %xmm8, %xmm3
	addps %xmm2, %xmm0
	addps %xmm3, %xmm1
	movaps %xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2
	subps %xmm2, %xmm0
	mulps XMMREG_SCALE, %xmm0
	movups %xmm0, (SAMPLES)
	add $128, B0L
	add $128, B0R

	movups (WINDOW), %xmm0
	movups 16(WINDOW), %xmm1
	movups 32(WINDOW), %xmm2
	movups 48(WINDOW), %xmm3
	add $128, WINDOW
	movaps %xmm0, %xmm4
	movaps %xmm1, %xmm5
	movaps %xmm2, %xmm6
	movaps %xmm3, %xmm7
	mulps (B0L), %xmm0
	mulps 16(B0L), %xmm1
	mulps 32(B0L), %xmm2
	mulps 48(B0L), %xmm3
	mulps (B0R), %xmm4
	mulps 16(B0R), %xmm5
	mulps 32(B0R), %xmm6
	mulps 48(B0R), %xmm7
	addps %xmm1, %xmm0
	addps %xmm3, %xmm2
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	movups (WINDOW), %xmm1
	movups 16(WINDOW), %xmm3
	movups 32(WINDOW), %xmm5
	movups 48(WINDOW), %xmm7
	add $128, WINDOW
	movaps %xmm1, %xmm8
	movaps %xmm3, %xmm9
	movaps %xmm5, %xmm10
	movaps %xmm7, %xmm11
	mulps 64(B0L), %xmm1
	mulps 80(B0L), %xmm3
	mulps 96(B0L), %xmm5
	mulps 112(B0L), %xmm7
	mulps 64(B0R), %xmm8
	mulps 80(B0R), %xmm9
	mulps 96(B0R), %xmm10
	mulps 112(B0R), %xmm11
	addps %xmm3, %xmm1
	addps %xmm7, %xmm5
	addps %xmm9, %xmm8
	addps %xmm11, %xmm10
	addps %xmm2, %xmm0
	addps %xmm6, %xmm4
	addps %xmm5, %xmm1
	addps %xmm10, %xmm8
	movaps %xmm0, %xmm2
	movaps %xmm1, %xmm3
	unpcklps %xmm4, %xmm0
	unpckhps %xmm4, %xmm2
	unpcklps %xmm8, %xmm1
	unpckhps %xmm8, %xmm3
	addps %xmm2, %xmm0
	addps %xmm3, %xmm1
	movaps %xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2
	subps %xmm2, %xmm0
	mulps XMMREG_SCALE, %xmm0
	movups %xmm0, 16(SAMPLES)
	add $128, B0L
	add $128, B0R
	add $32, SAMPLES
	decl %ecx
	jnz 1b

	movl $4, %ecx
	ALIGN16
1: /* second loop: b0 pointers step backwards; symmetric half adds instead of subtracts */
	movups (WINDOW), %xmm0
	movups 16(WINDOW), %xmm1
	movups 32(WINDOW), %xmm2
	movups 48(WINDOW), %xmm3
	add $128, WINDOW
	movaps %xmm0, %xmm4
	movaps %xmm1, %xmm5
	movaps %xmm2, %xmm6
	movaps %xmm3, %xmm7
	mulps (B0L), %xmm0
	mulps 16(B0L), %xmm1
	mulps 32(B0L), %xmm2
	mulps 48(B0L), %xmm3
	mulps (B0R), %xmm4
	mulps 16(B0R), %xmm5
	mulps 32(B0R), %xmm6
	mulps 48(B0R), %xmm7
	addps %xmm1, %xmm0
	addps %xmm3, %xmm2
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	movups (WINDOW), %xmm1
	movups 16(WINDOW), %xmm3
	movups 32(WINDOW), %xmm5
	movups 48(WINDOW), %xmm7
	add $128, WINDOW
	movaps %xmm1, %xmm8
	movaps %xmm3, %xmm9
	movaps %xmm5, %xmm10
	movaps %xmm7, %xmm11
	mulps -64(B0L), %xmm1
	mulps -48(B0L), %xmm3
	mulps -32(B0L), %xmm5
	mulps -16(B0L), %xmm7
	mulps -64(B0R), %xmm8
	mulps -48(B0R), %xmm9
	mulps -32(B0R), %xmm10
	mulps -16(B0R), %xmm11
	addps %xmm3, %xmm1
	addps %xmm7, %xmm5
	addps %xmm9, %xmm8
	addps %xmm11, %xmm10
	addps %xmm2, %xmm0
	addps %xmm6, %xmm4
	addps %xmm5, %xmm1
	addps %xmm10, %xmm8
	/* horizontal reduction of the four partial sums; result is L/R interleaved */
	movaps %xmm0, %xmm2
	movaps %xmm1, %xmm3
	unpcklps %xmm4, %xmm0
	unpckhps %xmm4, %xmm2
	unpcklps %xmm8, %xmm1
	unpckhps %xmm8, %xmm3
	addps %xmm2, %xmm0
	addps %xmm3, %xmm1
	movaps %xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2
	addps %xmm2, %xmm0
	mulps XMMREG_SCALE, %xmm0
	movups %xmm0, (SAMPLES)
	add $-128, B0L
	add $-128, B0R

	movups (WINDOW), %xmm0
	movups 16(WINDOW), %xmm1
	movups 32(WINDOW), %xmm2
	movups 48(WINDOW), %xmm3
	add $128, WINDOW
	movaps %xmm0, %xmm4
	movaps %xmm1, %xmm5
	movaps %xmm2, %xmm6
	movaps %xmm3, %xmm7
	mulps (B0L), %xmm0
	mulps 16(B0L), %xmm1
	mulps 32(B0L), %xmm2
	mulps 48(B0L), %xmm3
	mulps (B0R), %xmm4
	mulps 16(B0R), %xmm5
	mulps 32(B0R), %xmm6
	mulps 48(B0R), %xmm7
	addps %xmm1, %xmm0
	addps %xmm3, %xmm2
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	movups (WINDOW), %xmm1
	movups 16(WINDOW), %xmm3
	movups 32(WINDOW), %xmm5
	movups 48(WINDOW), %xmm7
	add $128, WINDOW
	movaps %xmm1, %xmm8
	movaps %xmm3, %xmm9
	movaps %xmm5, %xmm10
	movaps %xmm7, %xmm11
	mulps -64(B0L), %xmm1
	mulps -48(B0L), %xmm3
	mulps -32(B0L), %xmm5
	mulps -16(B0L), %xmm7
	mulps -64(B0R), %xmm8
	mulps -48(B0R), %xmm9
	mulps -32(B0R), %xmm10
	mulps -16(B0R), %xmm11
	addps %xmm3, %xmm1
	addps %xmm7, %xmm5
	addps %xmm9, %xmm8
	addps %xmm11, %xmm10
	addps %xmm2, %xmm0
	addps %xmm6, %xmm4
	addps %xmm5, %xmm1
	addps %xmm10, %xmm8
	movaps %xmm0, %xmm2
	movaps %xmm1, %xmm3
	unpcklps %xmm4, %xmm0
	unpckhps %xmm4, %xmm2
	unpcklps %xmm8, %xmm1
	unpckhps %xmm8, %xmm3
	addps %xmm2, %xmm0
	addps %xmm3, %xmm1
	movaps %xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2
	addps %xmm2, %xmm0
	mulps XMMREG_SCALE, %xmm0
	movups %xmm0, 16(SAMPLES)
	add $-128, B0L
	add $-128, B0R
	add $32, SAMPLES
	decl %ecx
	jnz 1b

	xorl %eax, %eax /* return 0: no clipped samples in float output */
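/* epilogue: xmm6-xmm15 are callee-saved in the Windows x64 ABI, restore them before returning */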
#ifdef _WIN64
	movaps (%rsp), %xmm6
	movaps 16(%rsp), %xmm7
	movaps 32(%rsp), %xmm8
	movaps 48(%rsp), %xmm9
	movaps 64(%rsp), %xmm10
	movaps 80(%rsp), %xmm11
	movaps 96(%rsp), %xmm12
	movaps 112(%rsp), %xmm13
	movaps 128(%rsp), %xmm14
	movaps 144(%rsp), %xmm15
	addq $168, %rsp
#endif
	ret

NONEXEC_STACK