; AMD64 mpn_sub_n ; Copyright 2016 Alexander Kruppa ; This file is part of the MPIR Library. ; The MPIR Library is free software; you can redistribute it and/or modify ; it under the terms of the GNU Lesser General Public License as published ; by the Free Software Foundation; either version 2.1 of the License, or (at ; your option) any later version. ; The MPIR Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. ; You should have received a copy of the GNU Lesser General Public License ; along with the MPIR Library; see the file COPYING.LIB. If not, write ; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ; Boston, MA 02110-1301, USA. ; (rdi,rcx) = (rsi,rcx)-(rdx,rcx) ; rax = borrow %include 'yasm_mac.inc' %define USE_WIN64 %ifdef USE_WIN64 %define SumP rcx %define Inp1P rdx %define Inp2P r8 %define Size r9 %define SizeRest r11 %define LIMB1 rax %define LIMB2 r10 %else %define SumP rdi %define Inp1P rsi %define Inp2P rdx %define Size rcx %define SizeRest r11 %define LIMB1 rax %define LIMB2 r8 %endif %define ADDSUB sub %define ADCSBB sbb ; Skylake has problems sustaining 2 read and 1 write per clock cycle. ; It sometimes gets into a "mode" (for the lack of a better word) where ; it does not fully utilize port 7, causing store uops to compete with ; the reads for ports 2,3. We try to alleviate the problem by turning ; some of the 64-bit writes into 128-bit writes, reducing the number of ; write instructions. Unfortunately, SSE2/AVX2 do not have particularly ; good instructions for assembling an SSE2 128-bit word from two GPR ; 64-bit words, so the instruction count is greatly inflated. %macro STORE 1 mov [SumP %1], LIMB1 mov [SumP %1 + 8], LIMB2 %endmacro %macro SSESTORE 1 movq xmm0, LIMB1 movq xmm1, LIMB2 vpermilpd xmm1, xmm1, 0 pblendw xmm0, xmm1, 0xf0 movaps [SumP %1], xmm0 %endmacro BITS 64 LEAF_PROC mpn_sub_n ; Make dest 16-bytes aligned test SumP, 8 jz .aligned dec Size mov SizeRest, Size and SizeRest, 7 shr Size, 3 ; Unaligned and Size > 8: do one limb separately, then the normal loop jnz .unaligned ; Unaligned and Size <= 8: do all with .rest loop inc SizeRest clc jmp .rest ;ajs:notshortform .aligned: mov SizeRest, Size and SizeRest, 7 shr Size, 3 clc jz .rest ;ajs:notshortform jmp .loop1 .unaligned: mov LIMB1, [Inp1P] ADDSUB LIMB1, [Inp2P] mov [SumP], LIMB1 lea Inp1P, [Inp1P+8] lea Inp2P, [Inp2P+8] lea SumP, [SumP+8] align 16 .loop1: mov LIMB1, [Inp1P] mov LIMB2, [Inp1P+8] ADCSBB LIMB1, [Inp2P] ADCSBB LIMB2, [Inp2P+8] SSESTORE +0 mov LIMB1, [Inp1P+16] mov LIMB2, [Inp1P+24] ADCSBB LIMB1, [Inp2P+16] ADCSBB LIMB2, [Inp2P+24] STORE +16 mov LIMB1, [Inp1P+32] mov LIMB2, [Inp1P+40] ADCSBB LIMB1, [Inp2P+32] ADCSBB LIMB2, [Inp2P+40] STORE +32 mov LIMB1, [Inp1P+48] mov LIMB2, [Inp1P+56] ADCSBB LIMB1, [Inp2P+48] ADCSBB LIMB2, [Inp2P+56] STORE +48 lea Inp1P, [Inp1P+64] lea Inp2P, [Inp2P+64] lea SumP, [SumP+64] dec Size jnz .loop1 inc SizeRest dec SizeRest jz .end .rest: mov LIMB1, [Inp1P] ADCSBB LIMB1, [Inp2P] mov [SumP], LIMB1 dec SizeRest jz .end mov LIMB1, [Inp1P+8] ADCSBB LIMB1, [Inp2P+8] mov [SumP+8], LIMB1 dec SizeRest jz .end mov LIMB1, [Inp1P+16] ADCSBB LIMB1, [Inp2P+16] mov [SumP+16], LIMB1 dec SizeRest jz .end mov LIMB1, [Inp1P+24] ADCSBB LIMB1, [Inp2P+24] mov [SumP+24], LIMB1 dec SizeRest jz .end lea Inp1P, [Inp1P+32] lea Inp2P, [Inp2P+32] lea SumP, [SumP+32] jmp .rest .end: mov eax, 0 adc eax, eax ret