diff --git a/mpn/x86_64/core2/add_n.as b/mpn/x86_64/core2/add_n.as
new file mode 100644
index 00000000..5f52099a
--- /dev/null
+++ b/mpn/x86_64/core2/add_n.as
@@ -0,0 +1,189 @@
+;; **************************************************************************
+;; x86_64 mpn_add_n -- Add two limb vectors of the same length > 0 and store
+;; sum in a third limb vector.
+;;
+;; Copyright (C) 2006 Jason Worth Martin
+;;
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2 of the License, or
+;; (at your option) any later version.
+;;
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License along
+;; with this program; if not, write to the Free Software Foundation, Inc.,
+;; 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;;
+;; **************************************************************************
+;;
+;;
+;; CREDITS
+;;
+;; This code is based largely on Pierrick Gaudry's excellent assembly
+;; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
+;; while using the same instruction set, have very different
+;; microarchitectures, so this code performs very poorly on AMD64
+;; machines even though it is near-optimal on Intel64.)
+;;
+;; Roger Golliver works for Intel and provided insightful improvements,
+;; particularly in using the "lea" instruction to perform additions
+;; and register-to-register moves.
+;;
+;; Eric Bainville has a brilliant exposition of optimizing arithmetic for
+;; AMD64 (http://www.bealto.it).  I adapted many of the ideas he
+;; describes to Intel64.
+;;
+;; Agner Fog is a demigod in the x86 world.  If you are reading assembly
+;; code files and you haven't heard of Agner Fog, then take a minute to
+;; look over his software optimization manuals (http://www.agner.org/).
+;; They are superb.
+;;
+;; *********************************************************************
+
+%ifdef __JWM_Test_Code__
+%include 'yasm_mac.inc'
+%else
+%include '../yasm_mac.inc'
+%endif
+
+;;
+;; Once YASM supports the lahf and sahf mnemonics, we'll get rid of
+;; this.  0x9f is the opcode for lahf (load the flags, including CF,
+;; into ah); 0x9e is the opcode for sahf (store ah back into the
+;; flags).
+;;
+%define save_CF_to_reg_a db 0x9f
+%define get_CF_from_reg_a db 0x9e
+;; define(`save_CF_to_reg_a',`setc %al')
+;; define(`get_CF_from_reg_a',`bt `$'0x0,%rax')
+
+
+;;              cycles/limb
+;; Hammer:      2.5 (for 1024 limbs)
+;; Woodcrest:   2.6 (for 1024 limbs)
+
+;; INPUT PARAMETERS
+;; rp    rdi
+;; up    rsi
+;; vp    rdx
+;; n     rcx
+        BITS 64
+GLOBAL_FUNC mpn_add_n
+        push rbp                ;; Save off callee-save registers
+        push rbx
+        push r12
+        push r13
+        push r14
+        push r15
+
+        xor r15,r15             ;; r15 will be our index, so I'll
+                                ;; call it i hereafter.
+        save_CF_to_reg_a        ;; Save CF (the xor just cleared it,
+                                ;; so the carry-in starts at zero)
+
+        mov r9,rcx
+        sub r9,4                ;; r9 = n-(i+4)
+
+        align 16                ;; align for the loop
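+        ;; Loop-control convention: r9 carries the signed count
+        ;; n-(i+4), recomputed at the bottom of every pass.  The jle
+        ;; below leaves the unrolled loop once four or fewer limbs
+        ;; remain (n-(i+4) <= 0); any leftover limbs are then handled
+        ;; one at a time in L_mpn_add_n_post_loop.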
+L_mpn_add_n_main_loop:
+        ;; The goal of our main unrolled loop is to keep all the
+        ;; execution units as busy as possible.  Since there are
+        ;; three ALUs, we try to perform three adds at a time.  Of
+        ;; course, there is the carry dependency, so there is at
+        ;; least one clock cycle between each adc.  However, we'll
+        ;; try to keep the other execution units busy with loads and
+        ;; stores at the same time, so that our net throughput is
+        ;; close to one add per clock cycle.  Hopefully this function
+        ;; will asymptotically take 3*n clock cycles, where n is the
+        ;; number of limbs to add.
+        ;;
+        ;; Note that I'm actually using FOUR adds at a time.  That is
+        ;; just because I wanted to use up all the available
+        ;; registers; I'm hoping the out-of-order and loop-pipelining
+        ;; logic in the Xeon will help us out.
+
+        ;; See if we are still looping
+        jle L_mpn_add_n_loop_done
+
+        get_CF_from_reg_a       ;; recover CF
+
+        ;; Load inputs into rbx and r8, add with carry, put the
+        ;; result in r8, then store r8 to the output.
+        mov rbx,[rsi+r15*8]
+        mov r8,[rdx+r15*8]
+        adc r8,rbx
+        mov [rdi+r15*8],r8
+
+        ;; Load inputs into r9 and r10, add with carry, put the
+        ;; result in r10, then store r10 to the output.
+        mov r9,[8+rsi+r15*8]
+        mov r10,[8+rdx+r15*8]
+        adc r10,r9
+        mov [8+rdi+r15*8],r10
+
+        ;; Load inputs into r11 and r12, add with carry, put the
+        ;; result in r12, then store r12 to the output.
+        mov r11,[16+rsi+r15*8]
+        mov r12,[16+rdx+r15*8]
+        adc r12,r11
+        mov [16+rdi+r15*8],r12
+
+        ;; Load inputs into r13 and r14, add with carry, put the
+        ;; result in r14, then store r14 to the output.
+        mov r13,[24+rsi+r15*8]
+        mov r14,[24+rdx+r15*8]
+        adc r14,r13
+        mov [24+rdi+r15*8],r14
+
+        save_CF_to_reg_a        ;; save CF
+
+        mov r10,r15
+        add r10,8               ;; r10 = i+8 = (new i)+4
+        add r15,4               ;; increment i by 4
+
+        mov r9,rcx
+        sub r9,r10              ;; r9 = n-(i+4)
+        jmp L_mpn_add_n_main_loop
+
+L_mpn_add_n_loop_done:
+        mov r15,rcx
+        sub r15,r9              ;; r15 = n-(n-(i+4)) = i+4
+        sub r15,4               ;; r15 = i
+        cmp r15,rcx
+L_mpn_add_n_post_loop:
+        je L_mpn_add_n_exit
+        get_CF_from_reg_a       ;; recover CF
+
+        ;; Load inputs into rbx and r8, add with carry, put the
+        ;; result in r8, then store r8 to the output.
+        mov rbx,[rsi+r15*8]
+        mov r8,[rdx+r15*8]
+        adc r8,rbx
+        mov [rdi+r15*8],r8
+        save_CF_to_reg_a        ;; save CF before the add clobbers it
+        add r15,1
+        cmp r15,rcx
+        jmp L_mpn_add_n_post_loop
+
+
+L_mpn_add_n_exit:
+        xor rcx,rcx             ;; zero rcx (this clobbers CF, which is
+                                ;; restored on the next line)
+        get_CF_from_reg_a       ;; recover the CF
+        mov rax,rcx             ;; clears rax without affecting the carry flag
+        adc rax,rax             ;; return the carry status
+
+        pop r15                 ;; restore callee-save registers
+        pop r14
+        pop r13
+        pop r12
+        pop rbx
+        pop rbp
+        ret
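
For anyone checking the carry bookkeeping above, here is a minimal C sketch of the operation this routine implements: rp[i] = up[i] + vp[i] with carry propagation, returning the final carry. This is an illustration only, not GMP's generic code: it assumes a 64-bit limb, uses uint64_t in place of mp_limb_t, and the name ref_mpn_add_n is invented for this note. The parameter names mirror the INPUT PARAMETERS table.

    #include <stdint.h>
    #include <stddef.h>

    /* Reference sketch of mpn_add_n semantics: rp[] = up[] + vp[], n limbs,
     * returning the carry out of the top limb (0 or 1).  The `carry`
     * variable plays the role of CF in the assembly, where lahf/sahf park
     * CF in ah across the loop bookkeeping (whose add/sub/cmp instructions
     * would otherwise clobber it). */
    uint64_t ref_mpn_add_n(uint64_t *rp, const uint64_t *up,
                           const uint64_t *vp, size_t n)
    {
        uint64_t carry = 0;             /* carry-in is 0, like CF after the xor */
        for (size_t i = 0; i < n; i++) {
            uint64_t u = up[i];
            uint64_t s = u + vp[i];     /* the add itself */
            uint64_t c = s < u;         /* carry out of u + vp[i] */
            s += carry;                 /* fold in the carry from limb i-1 */
            c += s < carry;             /* ...which may itself carry out */
            rp[i] = s;                  /* store, like mov [rdi+r15*8],r8 */
            carry = c;                  /* becomes CF for the next adc */
        }
        return carry;                   /* what adc rax,rax materializes */
    }

The unrolled loop performs four such iterations per pass with the loads, adcs, and stores interleaved; the point of the lahf/sahf dance is that the single CF bit is the only state threaded from one pass to the next.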