New asm functions mpn_add_n mpn_sub_n for Core2/penryn/nehalem
parent 0c3c909910
commit b07549802a
@ -1,183 +1,98 @@
; **************************************************************************
; x86_64 mpn_add_n -- Add two limb vectors of the same length > 0 and store
; sum in a third limb vector.
;
; Copyright (C) 2006 Jason Worth Martin <jason.worth.martin@gmail.com>
;
; This program is free software; you can redistribute it and/or modify

; mpn_add_n

; Copyright 2009 Jason Moxham

; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with this program; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;
; **************************************************************************
;
;
; CREDITS
;
; This code is based largely on Pierrick Gaudry's excellent assembly
; support for the AMD64 architecture. (Note that Intel64 and AMD64,
; while using the same instruction set, have very different
; microarchitectures. So, this code performs very poorly on AMD64
; machines even though it is near-optimal on Intel64.)
;
; Roger Golliver works for Intel and provided insightful improvements
; particularly in using the "lea" instruction to perform additions
; and register-to-register moves.
;
; Eric Bainville has a brilliant exposition of optimizing arithmetic for
; AMD64 (http://www.bealto.it). I adapted many of the ideas he
; describes to Intel64.
;
; Agner Fog is a demigod in the x86 world. If you are reading assembly
; code files and you haven't heard of Agner Fog, then take a minute to
; look over his software optimization manuals (http://www.agner.org/).
; They are superb.
;
; *********************************************************************
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.

%include 'yasm_mac.inc'

;
; If YASM supports the lahf and sahf instructions, then we'll get rid
; of this.
;
%define save_CF_to_reg_a db 0x9f ; emits lahf (opcode 0x9f): copy flags, including CF, into ah
%define get_CF_from_reg_a db 0x9e ; emits sahf (opcode 0x9e): restore flags, including CF, from ah
; rax = mpn_add_n(mp_ptr rdi, mp_ptr rsi, mp_ptr rdx, mp_size_t rcx)
; (rdi,rcx) = (rsi,rcx) + (rdx,rcx); returns rax = carry
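
For reference, this is what the entry point computes, sketched in C with
64-bit limbs. The names here (ref_add_n, mp_limb_t as a plain uint64_t) are
illustrative assumptions, not MPIR's shipped implementation:

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb_t;

/* (rp,n) = (up,n) + (vp,n); returns the carry out of the top limb (0 or 1). */
static mp_limb_t ref_add_n(mp_limb_t *rp, const mp_limb_t *up,
                           const mp_limb_t *vp, size_t n)
{
    mp_limb_t carry = 0;
    for (size_t i = 0; i < n; i++) {
        mp_limb_t u = up[i];
        mp_limb_t s = u + vp[i];   /* may wrap; the wrap is a carry */
        mp_limb_t c = s < u;
        rp[i] = s + carry;         /* fold in the incoming carry */
        carry = c | (rp[i] < s);   /* at most one of the two carries fires */
    }
    return carry;
}

The asm gets the same effect from the adc instruction, which is why the carry
flag has to survive from one limb to the next.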


; cycles/limb
; Hammer:    2.5 (for 1024 limbs)
; Woodcrest: 2.6 (for 1024 limbs)

; INPUT PARAMETERS
; rp    rdi
; up    rsi
; vp    rdx
; n     rcx
BITS 64

GLOBAL_FUNC mpn_add_n
    push rbp ; Save off callee-save registers
    push rbx
    push r12
    push r13
    push r14
    push r15

    xor r15,r15 ; r15 will be our index, so
                ; I'll call it i hereafter
    save_CF_to_reg_a ; Save CF

    mov r9,rcx
    sub r9,4 ; r9 = n-(i+4)

    align 16 ; aligning for loop
L_mpn_add_n_main_loop:
    ; The goal of our main unrolled loop is to keep all the
    ; execution units as busy as possible. Since
    ; there are three ALUs, we try to perform three
    ; adds at a time. Of course, we will have the
    ; carry dependency, so there is at least one
    ; clock cycle between each adc. However, we'll
    ; try to keep the other execution units busy
    ; with loads and stores at the same time so that
    ; our net throughput is close to one add per clock
    ; cycle. Hopefully this function will have asymptotic
    ; behavior of taking 3*n clock cycles, where n is the
    ; number of limbs to add.
    ;
    ; Note that I'm using FOUR adds at a time; this is just
    ; because I wanted to use up all available registers, since
    ; I'm hoping the out-of-order and loop-pipeline logic in
    ; the Xeon will help us out.

    ; See if we are still looping (flags are from the last sub into r9)
    jle L_mpn_add_n_loop_done

    get_CF_from_reg_a ; recover CF

    ; Load inputs into rbx and r8,
    ; add with carry, and put result in r8,
    ; then store r8 to output.
    mov rbx,[rsi+r15*8]
    mov r8,[rdx+r15*8]
    adc r8,rbx
    mov [rdi+r15*8],r8

    ; Load inputs into r9 and r10,
    ; add with carry, and put result in r10,
    ; then store r10 to output.
    mov r9,[8+rsi+r15*8]
    mov r10,[8+rdx+r15*8]
    adc r10,r9
    mov [8+rdi+r15*8],r10

    ; Load inputs into r11 and r12,
    ; add with carry, and put result in r12,
    ; then store r12 to output.
    mov r11,[16+rsi+r15*8]
    mov r12,[16+rdx+r15*8]
    adc r12,r11
    mov [16+rdi+r15*8],r12

    ; Load inputs into r13 and r14,
    ; add with carry, and put result in r14,
    ; then store r14 to output.
    mov r13,[24+rsi+r15*8]
    mov r14,[24+rdx+r15*8]
    adc r14,r13
    mov [24+rdi+r15*8],r14

    save_CF_to_reg_a ; save CF (the index arithmetic below clobbers it)

    mov r10,r15
    add r10,8
    add r15,4 ; increment by 4.

    mov r9,rcx
    sub r9,r10 ; r9 = n-(i+4)
    jmp L_mpn_add_n_main_loop

L_mpn_add_n_loop_done:
    mov r15,rcx
    sub r15,r9 ; r15 = n-(n-(i+4))=i+4
    sub r15,4 ; r15 = i
    cmp r15,rcx
L_mpn_add_n_post_loop:
    je L_mpn_add_n_exit
    get_CF_from_reg_a ; recover CF

    ; Load inputs into rbx and r8,
    ; add with carry, and put result in r8,
    ; then store r8 to output.
    mov rbx,[rsi+r15*8]
    mov r8,[rdx+r15*8]
    adc r8,rbx
    mov [rdi+r15*8],r8
    save_CF_to_reg_a ; save CF
    add r15,1
    cmp r15,rcx
    jmp L_mpn_add_n_post_loop

L_mpn_add_n_exit:
    xor rcx,rcx
    get_CF_from_reg_a ; recover the CF
    mov rax,rcx ; Clears rax without affecting carry flag
    adc rax,rax ; returns carry status.

    pop r15 ; restore callee-save registers
    pop r14
    pop r13
    pop r12
    pop rbx
    pop rbp

GLOBAL_FUNC mpn_add_n
    mov rax, rcx
    and rax, 3            ; rax = n mod 4
    sub rcx, rax          ; rcx = n rounded down to a multiple of 4
    lea rdi, [rdi+rcx*8]  ; advance the three pointers past the
    lea rsi, [rsi+rcx*8]  ; block the main loop will handle
    lea rdx, [rdx+rcx*8]
    neg rcx               ; rcx = negative index, counting up to zero
    cmp rcx, 0            ; clears CF, so the first adc adds no carry
    jz L_skiplp
    align 16
L_lp:
    mov r8, [rsi+rcx*8]
    mov r10, [rsi+rcx*8+16]
    adc r8, [rdx+rcx*8]
    mov [rdi+rcx*8], r8
    mov r9, [rsi+rcx*8+8]
    adc r9, [rdx+rcx*8+8]
    mov [rdi+rcx*8+8], r9
    lea rcx, [rcx+4]      ; lea bumps the index without touching CF
    mov r11, [rsi+rcx*8-8]
    adc r10, [rdx+rcx*8-16]
    adc r11, [rdx+rcx*8-8]
    mov [rdi+rcx*8-16], r10
    mov [rdi+rcx*8-8], r11
    jrcxz L_exitlp        ; jrcxz also leaves CF alone
    jmp L_lp
L_exitlp:
    sbb rcx, rcx          ; park the carry: rcx = -CF
L_skiplp:
    cmp rax, 2            ; dispatch on n mod 4
    ja L_case3            ; 3
    jz L_case2            ; 2
    jp L_case1            ; 1 (1-2 = ...ff has even parity; 0-2 = ...fe does not)
L_case0:
    sub rax, rcx          ; rax = 0 - (-carry) = carry
    ret
    align 16
L_case1:
    add rcx, rcx          ; regenerate CF from the parked carry
    mov r8, [rsi]
    adc r8, [rdx]
    mov [rdi], r8
    sbb rax, rax
    neg rax               ; rax = final carry
    ret
    align 16
L_case3:
    add rcx, rcx
    mov r8, [rsi]
    mov r10, [rsi+16]
    adc r8, [rdx]
    mov [rdi], r8
    mov r9, [rsi+8]
    adc r9, [rdx+8]
    mov [rdi+8], r9
    adc r10, [rdx+16]
    mov [rdi+16], r10
    sbb rax, rax
    neg rax
    ret
    align 16
L_case2:
    add rcx, rcx
    mov r8, [rsi]
    adc r8, [rdx]
    mov [rdi], r8
    mov r9, [rsi+8]
    adc r9, [rdx+8]
    mov [rdi+8], r9
    sbb rax, rax
    neg rax
    ret
    end
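
The control structure of the replacement routine, modeled in C under stated
assumptions (illustrative names, mp_limb_t as a plain uint64_t; a sketch of
the scheme, not the shipped code): the count is split into a multiple of four
plus a remainder, the pointers are advanced past the four-at-a-time region,
and a negative index counts up to zero, which is what lets the asm test for
loop exit with jrcxz instead of a flag-clobbering compare.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb_t;

static mp_limb_t add_n_structured(mp_limb_t *rp, const mp_limb_t *up,
                                  const mp_limb_t *vp, size_t n)
{
    size_t rem = n & 3;                 /* and rax, 3 */
    size_t big = n - rem;               /* sub rcx, rax */
    rp += big; up += big; vp += big;    /* the three lea instructions */

    mp_limb_t carry = 0;
    for (ptrdiff_t i = -(ptrdiff_t)big; i != 0; i += 4)  /* neg rcx ... jrcxz */
        for (int j = 0; j < 4; j++) {
            mp_limb_t u = up[i + j];
            mp_limb_t s = u + vp[i + j];
            mp_limb_t c = s < u;
            rp[i + j] = s + carry;
            carry = c | (rp[i + j] < s);
        }

    for (size_t k = 0; k < rem; k++) {  /* L_case0..L_case3, unrolled in asm */
        mp_limb_t u = up[k];
        mp_limb_t s = u + vp[k];
        mp_limb_t c = s < u;
        rp[k] = s + carry;
        carry = c | (rp[k] < s);
    }
    return carry;
}

Inside L_lp the carry never leaves the flags: mov, lea, jrcxz and jmp all
preserve CF, which is the whole point of this structure.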

@ -1,183 +1,98 @@
; x86_64 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
; store difference in a third limb vector.
;
; Copyright (C) 2006 Jason Worth Martin <jason.worth.martin@gmail.com>
;
; This program is free software; you can redistribute it and/or modify

; mpn_sub_n

; Copyright 2009 Jason Moxham

; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with this program; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;
; **************************************************************************
;
;
; CREDITS
;
; This code is based largely on Pierrick Gaudry's excellent assembly
; support for the AMD64 architecture. (Note that Intel64 and AMD64,
; while using the same instruction set, have very different
; microarchitectures. So, this code performs very poorly on AMD64
; machines even though it is near-optimal on Intel64.)
;
; Roger Golliver works for Intel and provided insightful improvements
; particularly in using the "lea" instruction to perform additions
; and register-to-register moves.
;
; Eric Bainville has a brilliant exposition of optimizing arithmetic for
; AMD64 (http://www.bealto.it). I adapted many of the ideas he
; describes to Intel64.
;
; Agner Fog is a demigod in the x86 world. If you are reading assembly
; code files and you haven't heard of Agner Fog, then take a minute to
; look over his software optimization manuals (http://www.agner.org/).
; They are superb.
;
; *********************************************************************
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.

%include 'yasm_mac.inc'

;
; If YASM supports the lahf and sahf instructions, then we'll get rid
; of this.
;
%define save_CF_to_reg_a db 0x9f ; emits lahf (opcode 0x9f): copy flags, including CF, into ah
%define get_CF_from_reg_a db 0x9e ; emits sahf (opcode 0x9e): restore flags, including CF, from ah
; rax = mpn_sub_n(mp_ptr rdi, mp_ptr rsi, mp_ptr rdx, mp_size_t rcx)
; (rdi,rcx) = (rsi,rcx) - (rdx,rcx); returns rax = borrow
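
And the matching C model for the subtraction entry point, under the same
assumptions as the ref_add_n sketch above (illustrative names, 64-bit limbs,
not the shipped implementation):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb_t;

/* (rp,n) = (up,n) - (vp,n); returns the borrow out of the top limb (0 or 1). */
static mp_limb_t ref_sub_n(mp_limb_t *rp, const mp_limb_t *up,
                           const mp_limb_t *vp, size_t n)
{
    mp_limb_t borrow = 0;
    for (size_t i = 0; i < n; i++) {
        mp_limb_t u = up[i], v = vp[i];
        mp_limb_t d = u - v;        /* wraps when u < v: that wrap is a borrow */
        mp_limb_t b = u < v;
        rp[i] = d - borrow;         /* take off the incoming borrow */
        borrow = b | (d < borrow);  /* at most one of the two borrows fires */
    }
    return borrow;
}

The sbb chain in the asm below is the flag-register form of the same recurrence.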


; cycles/limb
; Hammer:    2.5 (for 1024 limbs)
; Woodcrest: 2.6 (for 1024 limbs)

; INPUT PARAMETERS
; rp    rdi
; up    rsi
; vp    rdx
; n     rcx

BITS 64

GLOBAL_FUNC mpn_sub_n
    push rbp ; Save off callee-save registers
    push rbx
    push r12
    push r13
    push r14
    push r15

    xor r15,r15 ; r15 will be our index, so
                ; I'll call it i hereafter
    save_CF_to_reg_a ; Save CF

    mov r9,rcx
    sub r9,4 ; r9 = n-(i+4)

    align 16 ; aligning for loop
L_mpn_sub_n_main_loop:
    ; The goal of our main unrolled loop is to keep all the
    ; execution units as busy as possible. Since
    ; there are three ALUs, we try to perform three
    ; subtracts at a time. Of course, we will have the
    ; borrow dependency, so there is at least one
    ; clock cycle between each sbb. However, we'll
    ; try to keep the other execution units busy
    ; with loads and stores at the same time so that
    ; our net throughput is close to one sbb per clock
    ; cycle. Hopefully this function will have asymptotic
    ; behavior of taking 3*n clock cycles, where n is the
    ; number of limbs to subtract.
    ;
    ; Note that I'm using FOUR sbbs at a time; this is just
    ; because I wanted to use up all available registers, since
    ; I'm hoping the out-of-order and loop-pipeline logic in
    ; the Xeon will help us out.

    ; See if we are still looping (flags are from the last sub into r9)
    jle L_mpn_sub_n_loop_done

    get_CF_from_reg_a ; recover CF

    ; Load inputs into rbx and r8,
    ; subtract with borrow, and put result in r8,
    ; then store r8 to output.
    mov rbx,[rdx+r15*8]
    mov r8,[rsi+r15*8]
    sbb r8,rbx
    mov [rdi+r15*8],r8

    ; Load inputs into r9 and r10,
    ; subtract with borrow, and put result in r10,
    ; then store r10 to output.
    mov r9,[8+rdx+r15*8]
    mov r10,[8+rsi+r15*8]
    sbb r10,r9
    mov [8+rdi+r15*8],r10

    ; Load inputs into r11 and r12,
    ; subtract with borrow, and put result in r12,
    ; then store r12 to output.
    mov r11,[16+rdx+r15*8]
    mov r12,[16+rsi+r15*8]
    sbb r12,r11
    mov [16+rdi+r15*8],r12

    ; Load inputs into r13 and r14,
    ; subtract with borrow, and put result in r14,
    ; then store r14 to output.
    mov r13,[24+rdx+r15*8]
    mov r14,[24+rsi+r15*8]
    sbb r14,r13
    mov [24+rdi+r15*8],r14

    save_CF_to_reg_a ; save CF (the index arithmetic below clobbers it)

    mov r10,r15
    add r10,8
    add r15,4 ; increment by 4.

    mov r9,rcx
    sub r9,r10 ; r9 = n-(i+4)
    jmp L_mpn_sub_n_main_loop

L_mpn_sub_n_loop_done:
    mov r15,rcx
    sub r15,r9 ; r15 = n-(n-(i+4))=i+4
    sub r15,4 ; r15 = i
    cmp r15,rcx
L_mpn_sub_n_post_loop:
    je L_mpn_sub_n_exit
    get_CF_from_reg_a ; recover CF

    ; Load inputs into rbx and r8,
    ; subtract with borrow, and put result in r8,
    ; then store r8 to output.
    mov rbx,[rdx+r15*8]
    mov r8,[rsi+r15*8]
    sbb r8,rbx
    mov [rdi+r15*8],r8
    save_CF_to_reg_a ; save CF
    add r15,1
    cmp r15,rcx
    jmp L_mpn_sub_n_post_loop

L_mpn_sub_n_exit:
    xor rcx,rcx
    get_CF_from_reg_a ; recover the CF
    mov rax,rcx ; Clears rax without affecting carry flag
    adc rax,rax ; returns borrow status.

    pop r15 ; restore callee-save registers
    pop r14
    pop r13
    pop r12
    pop rbx
    pop rbp

GLOBAL_FUNC mpn_sub_n
    mov rax, rcx
    and rax, 3            ; rax = n mod 4
    sub rcx, rax          ; rcx = n rounded down to a multiple of 4
    lea rdi, [rdi+rcx*8]  ; advance the three pointers past the
    lea rsi, [rsi+rcx*8]  ; block the main loop will handle
    lea rdx, [rdx+rcx*8]
    neg rcx               ; rcx = negative index, counting up to zero
    cmp rcx, 0            ; clears CF, so the first sbb subtracts no borrow
    jz L_skiplp
    align 16
L_lp:
    mov r8, [rsi+rcx*8]
    mov r10, [rsi+rcx*8+16]
    sbb r8, [rdx+rcx*8]
    mov [rdi+rcx*8], r8
    mov r9, [rsi+rcx*8+8]
    sbb r9, [rdx+rcx*8+8]
    mov [rdi+rcx*8+8], r9
    lea rcx, [rcx+4]      ; lea bumps the index without touching CF
    mov r11, [rsi+rcx*8-8]
    sbb r10, [rdx+rcx*8-16]
    sbb r11, [rdx+rcx*8-8]
    mov [rdi+rcx*8-16], r10
    mov [rdi+rcx*8-8], r11
    jrcxz L_exitlp        ; jrcxz also leaves CF alone
    jmp L_lp
L_exitlp:
    sbb rcx, rcx          ; park the borrow: rcx = -CF
L_skiplp:
    cmp rax, 2            ; dispatch on n mod 4
    ja L_case3            ; 3
    jz L_case2            ; 2
    jp L_case1            ; 1 (1-2 = ...ff has even parity; 0-2 = ...fe does not)
L_case0:
    sub rax, rcx          ; rax = 0 - (-borrow) = borrow
    ret
    align 16
L_case1:
    add rcx, rcx          ; regenerate CF from the parked borrow
    mov r8, [rsi]
    sbb r8, [rdx]
    mov [rdi], r8
    sbb rax, rax
    neg rax               ; rax = final borrow
    ret
    align 16
L_case3:
    add rcx, rcx
    mov r8, [rsi]
    mov r10, [rsi+16]
    sbb r8, [rdx]
    mov [rdi], r8
    mov r9, [rsi+8]
    sbb r9, [rdx+8]
    mov [rdi+8], r9
    sbb r10, [rdx+16]
    mov [rdi+16], r10
    sbb rax, rax
    neg rax
    ret
    align 16
L_case2:
    add rcx, rcx
    mov r8, [rsi]
    sbb r8, [rdx]
    mov [rdi], r8
    mov r9, [rsi+8]
    sbb r9, [rdx+8]
    mov [rdi+8], r9
    sbb rax, rax
    neg rax
    ret
    end
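
The subtraction file uses the same skeleton with sbb in place of adc. The one
hand-off worth spelling out is how the borrow crosses from the main loop to
the tail: sbb rcx, rcx turns CF into rcx = 0 or -1 (parking it in a register
so the cmp-based dispatch can clobber the flags), and the tail's add rcx, rcx
regenerates CF, since doubling -1 carries out and doubling 0 does not. A small
C model of that hand-off (names invented here):

#include <stdint.h>

/* sbb rcx, rcx : park CF as 0 or all-ones */
static uint64_t park_flag(unsigned cf) { return 0ull - cf; }

/* add rcx, rcx : CF comes back as the bit shifted out of the top */
static unsigned unpark_flag(uint64_t parked) { return parked != 0; }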
tune/aligntest (new executable file, 24 lines)
@ -0,0 +1,24 @@
#!/bin/bash
if [ $# -ne 3 ] ; then
	echo "$0 mpn_fn max_time size"
	exit 1
fi
# Sweep all 8x8x8x8 source/destination alignment combinations understood by
# the speed program, re-timing each one until its cycle count drops below
# max_time; measurements at or above max_time are printed as "x y w W cycles".
for x in 0 1 2 3 4 5 6 7
do
	for y in 0 1 2 3 4 5 6 7
	do
		for w in 0 1 2 3 4 5 6 7
		do
			for W in 0 1 2 3 4 5 6 7
			do
				while true
				do
					c=$(./speed -x $x -y $y -w $w -W $W -c -s $3 $1 | tail -n 1 | tr -s " " | cut -d " " -f 2 | cut -d . -f 1)
					if [ $c -lt $2 ] ; then break ; fi
					echo "$x $y $w $W $c"
				done
			done
		done
	done
done
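
A hypothetical invocation, assuming the script sits in the tune directory next
to a built speed binary: ./aligntest mpn_add_n 250 1024 would time mpn_add_n
on 1024-limb operands across all 4096 alignment combinations and print every
"x y w W cycles" line that measures at or above 250 cycles.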