; Note this file has been converted to linux/gcc calling conventions, but the PIC ; friendly code does not work. The stack unwinding code has not been removed. ; Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software ; Foundation, Inc. ; ; This file is part of the GNU MP Library. ; ; The GNU MP Library is free software; you can redistribute it and/or ; modify it under the terms of the GNU Lesser General Public License as ; published by the Free Software Foundation; either version 2.1 of the ; License, or (at your option) any later version. ; ; The GNU MP Library is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ; Lesser General Public License for more details. ; ; You should have received a copy of the GNU Lesser General Public ; License along with the GNU MP Library; see the file COPYING.LIB. If ; not, write to the Free Software Foundation, Inc., 51 Franklin Street, ; Fifth Floor, Boston, MA 02110-1301, USA. ; ; cycles/limb ; Hammer: 10 ; Prescott/Nocona: 33 ; mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, ; mp_limb_t divisor); ; mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, ; mp_limb_t divisor, mp_limb_t carry); ; ; The dependent chain in the main loop is ; ; cycles ; subq %r8, %rax 1 ; imulq %r9, %rax 4 ; mulq %rsi 5 ; ---- ; total 10 ; ; The movq load from src seems to need to be scheduled back before the jz to ; achieve this speed, out-of-order execution apparently can't completely ; hide the latency otherwise. ; ; The l=src[i]-cbit step is rotated back too, since that allows us to avoid ; it for the first iteration (where there's no cbit). ; ; The code alignment used (32-byte) for the loop also seems necessary. ; Without that the non-PIC case has adcq crossing the 0x60 offset, ; apparently making it run at 11 cycles instead of 10. 
;
;  Not done:
;
;  divq for size==1 was measured at about 79 cycles, compared to the inverse
;  at about 25 cycles (both including function call overheads), so that's not
;  used.
;
;  Enhancements:
;
;  For PIC, we shouldn't really need the GOT fetch for modlimb_invert_table,
;  it'll be in rodata or text in libgmp.so and can be accessed directly %rip
;  relative.  This would be for small model only (something we don't
;  presently detect, but which is all that gcc 3.3.3 supports), since 8-byte
;  PC-relative relocations are apparently not available.  Some rough
;  experiments with binutils 2.13 looked worryingly like it might come out
;  with an unwanted text segment relocation though, even with ".protected".

;  AMD64 mpn_modexact_1_odd -- exact division style remainder.
;
;  Windows x64 argument registers:
;
;  mp_limb_t mpn_modexact_1_odd (
;     mp_srcptr src,           rcx
;     mp_size_t size,          rdx
;     mp_limb_t divisor         r8
;  );
;  mp_limb_t mpn_modexact_1c_odd (
;     mp_srcptr src,           rcx
;     mp_size_t size,          rdx
;     mp_limb_t divisor,        r8
;     mp_limb_t carry           r9
;  );
;
;  System V AMD64 (linux/gcc) argument registers:
;
;  mp_limb_t mpn_modexact_1_odd (
;     mp_srcptr src,           rdi
;     mp_size_t size,          rsi
;     mp_limb_t divisor        rdx
;  );
;  mp_limb_t mpn_modexact_1c_odd (
;     mp_srcptr src,           rdi
;     mp_size_t size,          rsi
;     mp_limb_t divisor,       rdx
;     mp_limb_t carry          rcx
;  );
;
;  This is an SEH Frame Function with a leaf prologue
;
;  Register roles in the shared body (macro names, see the %defines below):
;
;     src / srcd / srcl   source pointer on entry; reused as the scratch for
;                         the inverse computation and then as the borrow bit
;                         (cbit) accumulator in the loop
;     dv                  the divisor; must stay intact for every "mul dv"
;     dv2 / dvd2          always rdx/edx: scratch for the inverse, then the
;                         carry limb, i.e. the high half written by "mul"
;     cy                  initial carry on entry; later the end-of-src pointer
;     rdi                 a second copy of the divisor (edi feeds the 32-bit
;                         imul steps)
;     r10                 table address, then the 64-bit inverse
;     r11                 pointer one past the last source limb
;     rsi                 size on entry, then the running source pointer

%ifdef _WIN64_ABI

%define src     rcx
%define srcd    ecx
%define srcl    cl
%define dv2     rdx
%define dvd2    edx
%define cy      r9
%define dv      r8

%else

;  "mul" always writes its high half to rdx, so the divisor cannot live in
;  rdx across the loop and the carry limb must be read back from rdx.  The
;  divisor is therefore copied from rdx to r8 in the prologue and the same
;  dv/dv2 assignment as the Windows path is used.

%define src     rbx
%define srcd    ebx
%define srcl    bl
%define dv2     rdx
%define dvd2    edx
%define cy      rcx
%define dv      r8

%endif

    bits    64
    section .text
    align   32

    global  __gmpn_modexact_1_odd:function
    global  __gmpn_modexact_1c_odd:function
    extern  __gmp_modlimb_invert_table

%ifdef DLL
    export  __gmpn_modexact_1_odd
    export  __gmpn_modexact_1c_odd
%endif

;  mpn_modexact_1_odd is mpn_modexact_1c_odd with a zero initial carry; it
;  sets the carry register and falls through into the 1c entry point.

__gmpn_modexact_1_odd:
    mov     cy, 0                   ; carry

%ifdef _WIN64_ABI

PROC_FRAME  __gmpn_modexact_1c_odd
    push_reg    rsi
    push_reg    rdi
    alloc_stack 8                   ; align to 16 byte boundary
END_PROLOGUE
    mov     rsi, rdx                ; size -> rsi, frees rdx for dv2

%else

__gmpn_modexact_1c_odd:
    push    rbx                     ; rbx is callee-saved and is used as src
    mov     src, rdi
    mov     dv, rdx                 ; divisor -> r8 so that mul cannot clobber it

%endif

    mov     dv2, dv                 ; (already equal on the non-Windows path)
    shr     dvd2, 1                 ; div / 2

%ifdef _WIN64_ABI
    lea     r10, [rel __gmp_modlimb_invert_table]
%else
    mov     r10, [__gmp_modlimb_invert_table wrt rip wrt ..gotpcrel]
%endif

    and     dvd2, 127               ; table index = (div / 2) mod 128
    movzx   dvd2, byte [dv2+r10]    ; inv -> dv2 (8-bit approx)
    mov     rax, [src]              ; first limb, loaded before src is reused
    lea     r11, [src+rsi*8]        ; pointer to top of src
    mov     rdi, dv                 ; save divisor

;  Each step below computes inv = 2*inv - inv*inv*div, alternating the
;  result between src and dv2 and widening the precision as commented.

    lea     srcd, [dv2+dv2]
    imul    dvd2, dvd2
    neg     rsi                     ; limb offset from top of src
    imul    dvd2, edi
    sub     srcd, dvd2              ; inv -> src (16-bit approx)
    lea     dvd2, [src+src]
    imul    srcd, srcd
    imul    srcd, edi
    sub     dvd2, srcd              ; inv -> dv2 (32-bit approx)
    xor     srcd, srcd              ; src now becomes the cbit accumulator (0)
    lea     r10, [dv2+dv2]
    imul    dv2, dv2
    imul    dv2, dv
    sub     r10, dv2                ; inv -> r10 (64-bit approx)

    mov     dv2, cy                 ; initial carry -> dv2 (rdx)
    inc     rsi                     ; adjust limb offset
    jz      .1                      ; only one limb: skip the loop
    mov     cy, r11                 ; cy is free now: keep the end pointer in it
    lea     rsi, [r11+rsi*8]        ; rsi -> second limb, for lodsq

    align   16
.0:
    sub     rax, dv2                ; l = (src[i] - cbit) - carry limb
    adc     src, 0                  ; collect the borrow from that subtract
    imul    rax, r10                ; q = l * inverse
    mul     dv                      ; rdx (dv2) = high(q * divisor) = carry limb
    lodsq                           ; rax = next limb, rsi += 8
    sub     rax, src                ; next l = src[i+1] - cbit
    setc    srcl                    ; new cbit
    cmp     cy, rsi                 ; reached the end pointer?
    jne     .0

.1:
    sub     rax, dv2                ; final limb, same step as the loop head
    adc     src, 0
    imul    rax, r10
    mul     dv
    lea     rax, [src+dv2]          ; result = cbit + carry limb

%ifdef _WIN64_ABI
    add     rsp, 8
    pop     rdi
    pop     rsi
    ret
ENDPROC_FRAME
%else
    pop     rbx
    ret
%endif

    end