;  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software
;  Foundation, Inc.
;
;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
;  Fifth Floor, Boston, MA 02110-1301, USA.
;
;                      cycles/limb
;  Hammer:                 10
;  Prescott/Nocona:        33
;
;  mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
;                                mp_limb_t divisor);
;  mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
;                                 mp_limb_t divisor, mp_limb_t carry);
;
;  The dependent chain in the main loop is
;
;                              cycles
;      sub     rax, rdx          1
;      imul    rax, r9           4
;      mul     r8                5
;                              ----
;      total                    10
;
;  The mov load from src seems to need to be scheduled back before the jz
;  to achieve this speed; out-of-order execution apparently can't
;  completely hide the latency otherwise.
;
;  The l=src[i]-cbit step is rotated back too, since that allows us to avoid
;  it for the first iteration (where there's no cbit).
;
;  The code alignment used (32-byte) for the loop also seems necessary.
;  Without that the non-PIC case has adc crossing the 0x60 offset,
;  apparently making it run at 11 cycles instead of 10.
;
;  Not done:
;
;  divq for size==1 was measured at about 79 cycles, compared to the inverse
;  at about 25 cycles (both including function call overheads), so that's not
;  used.
;
;  Enhancements:
;
;  For PIC, we shouldn't really need the GOT fetch for modlimb_invert_table,
;  it'll be in rodata or text in libgmp.so and can be accessed directly %rip
;  relative.  This would be for small model only (something we don't
;  presently detect, but which is all that gcc 3.3.3 supports), since 8-byte
;  PC-relative relocations are apparently not available.  Some rough
;  experiments with binutils 2.13 looked worryingly like it might come out
;  with an unwanted text segment relocation though, even with ".protected".

;-----------------------------------------------------------------------
;  AMD64 mpn_modexact_1_odd -- exact division style remainder.
;
;  mp_limb_t mpn_modexact_1_odd (
;       mp_srcptr src,          rdi
;       mp_size_t size,         rsi
;       mp_limb_t divisor       rdx
;  );
;  mp_limb_t mpn_modexact_1c_odd (
;       mp_srcptr src,          rdi
;       mp_size_t size,         rsi
;       mp_limb_t divisor,      rdx
;       mp_limb_t carry         rcx
;  );
;
;  In:     rdi = src, rsi = size (in limbs), rdx = divisor,
;          rcx = carry (the _1c_ entry point only; the plain entry point
;          sets it to zero and falls through)
;  Out:    rax = final borrow bit + high half of the last q*d product
;  Writes: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags
;  Stack:  not used
;
;  src[0] is loaded unconditionally and the loop counter starts at
;  1 - size, so size == 0 is not handled here.
;  NOTE(review): the "_odd" in the names indicates the divisor is expected
;  to be odd (its low bit is dropped when indexing the table) -- confirm
;  against the callers.
;-----------------------------------------------------------------------

        bits    64
        section .text

        align   32

        global  __gmpn_modexact_1_odd:function
        global  __gmpn_modexact_1c_odd:function
        extern  __gmp_modlimb_invert_table

__gmpn_modexact_1_odd:
        mov     ecx, 0                  ; carry = 0, then fall through

__gmpn_modexact_1c_odd:
        mov     r8, rdx                 ; r8 = d (divisor), kept for the loop
        shr     edx, 1                  ; d / 2

%ifdef PIC
        mov     r9, [__gmp_modlimb_invert_table wrt rip wrt ..gotpcrel]
%else
        lea     r9, [__gmp_modlimb_invert_table wrt rip]
%endif

        and     edx, 127                ; table index = (d / 2) mod 128
        mov     r10, rcx                ; park the incoming carry
        movzx   edx, byte [rdx+r9]      ; starting inverse from the table
                                        ; (presumably 1/d mod 2^8 -- the
                                        ; table is defined elsewhere)

        mov     rax, [rdi]              ; rax = src[0]
        lea     r11, [rdi+rsi*8]        ; r11 = src + size limbs (end ptr)
        mov     rdi, r8                 ; d again, so edi can feed 32-bit imul

        ; Refinement step 1:  inv = 2*inv - inv*inv*d   (32-bit result)
        lea     ecx, [rdx+rdx]          ; 2*inv
        imul    rdx, rdx                ; inv*inv
        neg     rsi                     ; rsi = -size
        imul    edx, edi                ; inv*inv*d
        sub     ecx, edx                ; ecx = refined inverse

        ; Refinement step 2:  same formula, 32-bit arithmetic
        lea     edx, [rcx+rcx]          ; 2*inv
        imul    ecx, ecx                ; inv*inv
        imul    ecx, edi                ; inv*inv*d
        sub     edx, ecx                ; edx = refined inverse

        xor     ecx, ecx                ; cbit = 0 (borrow bit for the loop)

        ; Refinement step 3:  same formula, full 64-bit arithmetic
        lea     r9, [rdx+rdx]           ; 2*inv
        imul    rdx, rdx                ; inv*inv
        imul    rdx, r8                 ; inv*inv*d
        sub     r9, rdx                 ; r9 = inverse used by the loop

        mov     rdx, r10                ; climb = incoming carry

;       According to Brian Gladman, the following three lines are the
;       remnant of a "dead" assert (it checked d*inv == 1) and so can be
;       omitted:
;
;       mov     r10, r8
;       imul    r10, r9
;       cmp     r10, 1

        inc     rsi                     ; rsi = 1 - size
        jz      .one                    ; size == 1: only the final step

        align   16
.top:
        ; Loop state:
        ;   rax = l, the current limb with cbit already subtracted
        ;         (plain src[0] on the first pass, where cbit is 0)
        ;   rcx = cbit, borrow carried between limbs
        ;   rdx = climb, high half of the previous product
        ;         (the incoming carry on the first pass)
        ;   rsi = negative limb counter, counts up to zero
        ;   r8  = d,  r9 = inverse,  r11 = src end pointer
        sub     rax, rdx                ; l -= climb
        adc     rcx, 0                  ; fold that borrow into cbit
        imul    rax, r9                 ; q = l * inverse
        mul     r8                      ; rdx = high half of q * d
        mov     rax, [r11+rsi*8]        ; fetch the next source limb
        sub     rax, rcx                ; next l = limb - cbit
        setc    cl                      ; cbit = borrow from that subtract
        inc     rsi
        jnz     .top

.one:
        ; Final limb: same step, but the result is returned, not looped.
        sub     rax, rdx                ; l -= climb
        adc     rcx, 0                  ; fold that borrow into cbit
        imul    rax, r9                 ; q = l * inverse
        mul     r8                      ; rdx = high half of q * d
        lea     rax, [rcx+rdx]          ; return cbit + climb
        ret

; NOTE(review): the bare "end" below is retained from the original; the
; assembler presumably accepts it as a label -- confirm before removing.
        end