; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; Copyright 2008 Brian Gladman
;
; This file is part of the MPIR Library.
;
; The MPIR Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The MPIR Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the MPIR Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 51 Franklin Street,
; Fifth Floor, Boston, MA 02110-1301, USA.
;
; AMD64 mpn_divexact_1 -- mpn by limb exact division
;
; Calling interface:
;
; void mpn_divexact_1(
;     mp_ptr dst,           rcx
;     mp_srcptr src,        rdx
;     mp_size_t size,        r8
;     mp_limb_t divisor      r9
; )
;
; Since the inverse takes a while to set up, plain division is used for
; small sizes. Multiplying works out faster for size>=3 when the divisor
; is odd or size>=4 when the divisor is even.
;
; This is an SEH Frame Function with a leaf prologue
;
; Path selection: when size + (divisor & 1) < 4 the limbs are divided
; directly with div, most significant limb first.  Otherwise the divisor
; is stripped of its trailing zero bits, an inverse multiplier is built
; by Newton iteration, and each quotient limb is produced by a multiply.
;
; Register roles on the multiply-by-inverse path:
;   rsi = src + size*8 (one past the last source limb)
;   rdi = dst + size*8 (one past the last destination limb)
;   r8  = negative limb index, counted up towards zero
;   r9  = divisor shifted right by cl
;   rcx = bit index of the lowest set bit of the divisor (bsf); cl = shift
;   r10 = inverse multiplier
;   r11 = 0 or -1, the borrow left by the previous limb's subtraction
;
; LEAF_PROC, FRAME_PROC and END_PROC are macros from yasm_mac.inc.
; NOTE(review): presumably FRAME_PROC/END_PROC save and restore the
; registers in reg_save_list and END_PROC emits the return -- confirm
; against yasm_mac.inc.

%include "yasm_mac.inc"

%define reg_save_list rsi, rdi

    BITS 64

    extern  __gmp_modlimb_invert_table

    LEAF_PROC mpn_divexact_1
    movsxd  r8, r8d                 ; sign-extend the low 32 bits of size
    mov     r10, rdx                ; keep src in r10; div needs rdx
    mov     rax, r9
    and     rax, byte 1
    add     rax, r8                 ; rax = size + (divisor & 1)
    cmp     rax, byte 4
    jae     .use_inverse            ; large enough: multiply by the inverse

; Plain division, from the most significant limb down.  rdx starts at
; zero and afterwards holds the remainder left by the previous div, which
; is the high half of the next dividend.

    xor     edx, edx                ; 32-bit write clears all of rdx
.div_loop:
    mov     rax, [r10+r8*8-8]
    div     r9
    mov     [rcx+r8*8-8], rax
    sub     r8, 1
    jnz     .div_loop
    ret                             ; NOTE(review): the old comment said
                                    ; "avoid single byte return", but this
                                    ; is a plain one-byte ret -- confirm
                                    ; whether a padded return was intended

.use_inverse: FRAME_PROC mul_by_inverse, 0, reg_save_list
    mov     rsi, rdx                ; src pointer
    mov     rdi, rcx                ; dst pointer
    bsf     rcx, r9                 ; remove powers of two
    shr     r9, cl
    mov     rax, r9
    shr     rax, 1
    and     rax, 127                ; table index = (divisor >> 1) & 127
    lea     rdx, [rel __gmp_modlimb_invert_table]
    movzx   rax, byte [rdx+rax]     ; 8-bit starting value from the table

; If f(x) = 0, then x[n+1] = x[n] - f(x) / f'(x) is Newton's iteration for a
; root. With f(x) = 1/x - v we obtain x[n + 1] = 2 * x[n] - v * x[n] * x[n]
; as an iteration for x = 1 / v. This provides quadratic convergence so
; that the number of bits of precision doubles on each iteration. The
; iteration starts with 8-bit precision.

    lea     edx, [rax+rax]          ; 2 * x
    imul    eax, eax                ; x * x
    imul    eax, r9d                ; v * x * x
    sub     edx, eax                ; inv -> edx (16-bit approx)

    lea     eax, [rdx+rdx]
    imul    edx, edx
    imul    edx, r9d
    sub     eax, edx                ; inv -> eax (32-bit approx)

    lea     rdx, [rax+rax]
    imul    rax, rax
    imul    rax, r9
    sub     rdx, rax                ; inv -> rdx (64-bit approx)

    lea     rsi, [rsi+r8*8]         ; point past the end of src
    lea     rdi, [rdi+r8*8]         ; point past the end of dst
    neg     r8                      ; index runs from -size up to 0
    mov     r10, rdx                ; inverse multiplier -> r10
    xor     r11, r11                ; no borrow pending
    mov     rax, [rsi+r8*8]         ; lowest source limb
    test    rcx, rcx                ; shift count zero?  (flags only)
    mov     rdx, [rsi+r8*8+8]       ; next limb; mov leaves the flags alone
    jz      .odd_store              ; if divisor is odd
    shrd    rax, rdx, cl            ; lowest limb of src >> cl
    add     r8, 1
    jmp     .even_store

; Odd divisor.  On entry to the loop rax is the quotient limb just stored,
; so mul leaves the high half of (quotient limb * divisor) in rdx.

    alignb  16, nop
.odd_loop:
    mul     r9                      ; divisor is odd
    mov     rax, [rsi+r8*8]
    sub     rdx, r11                ; r11 is 0 or -1: adds the old borrow
    sub     rax, rdx
    sbb     r11, r11                ; r11 = -1 if that subtraction borrowed
.odd_store:
    imul    rax, r10                ; quotient limb = value * inverse
    mov     [rdi+r8*8], rax
    add     r8, 1
    jnz     .odd_loop
    jmp     .done

; Even divisor.  Same recurrence, but every source limb is first combined
; with the limb above it and shifted right by cl.  The store index trails
; r8 by one limb, and the top limb is handled after the loop with a plain
; shr because there is no limb above it.

    alignb  16, nop
.even_loop:
    mul     r9                      ; divisor is even
    sub     rdx, r11                ; fold in the old borrow before r11
                                    ; is reused as a scratch register
    mov     rax, [rsi+r8*8-8]
    mov     r11, [rsi+r8*8]
    shrd    rax, r11, cl
    sub     rax, rdx
    sbb     r11, r11
.even_store:
    imul    rax, r10
    mov     [rdi+r8*8-8],rax
    add     r8, 1
    jnz     .even_loop
    mul     r9
    mov     rax, [rsi-8]            ; most significant source limb
    sub     rdx, r11
    shr     rax, cl
    sub     rax, rdx
    imul    rax, r10
    mov     [rdi-8], rax
.done: END_PROC reg_save_list

    end