mpir/mpn/x86_64w/dive_1.asm


;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  Copyright 2008 Brian Gladman
;
;  This file is part of the MPIR Library.
;
;  The MPIR Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The MPIR Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the MPIR Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
;  Fifth Floor, Boston, MA 02110-1301, USA.
;
; Since the inverse takes a while to set up, plain division is used for small
; sizes. Multiplying works out faster for size>=3 when the divisor is odd or
; size>=4 when the divisor is even.
;
;  void mpn_divexact_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
;  SysV AMD64 regs:       rdi     rsi        rdx        rcx
;  Microsoft x64 regs:    rcx     rdx        r8d         r9   (used by this file)

%include "yasm_mac.inc"

%define reg_save_list       rsi, rdi

    BITS 64

    extern  __gmp_modlimb_invert_table

    ; mpn_divexact_1 (register roles as listed in the file header):
    ;   rcx = destination limb pointer, rdx = source limb pointer,
    ;   r8d = limb count (sign-extended to 64 bits below), r9 = divisor limb.
    ;
    ; Two code paths:
    ;   * (count + (divisor & 1)) < 4: one hardware `div` per limb, walking
    ;     from the highest-addressed limb down with the remainder carried
    ;     in rdx.
    ;   * otherwise: strip the divisor's trailing zero bits, build the inverse
    ;     of the remaining divisor modulo 2^64 by Newton iteration, and produce
    ;     quotient limbs from the lowest-addressed limb up using multiplies.
    ; No remainder is returned; the routine name suggests the caller must
    ; guarantee that the division is exact -- NOTE(review): confirm.
    LEAF_PROC mpn_divexact_1
    movsxd  r8, r8d             ; widen the 32-bit limb count to 64 bits
    mov     r10, rdx            ; r10 = src pointer (rdx is needed by div)
    mov     rax, r9
    and     rax, byte 1         ; rax = 1 if the divisor is odd, else 0
    add     rax, r8             ; rax = count + (1 if divisor odd)
    cmp     rax, byte 4
    jae     .2                  ; big enough: take the multiply-by-inverse path
    xor     rdx,rdx             ; no remainder feeds into the first division
.1:                             ; plain division, highest limb first
	mov     rax, [r10+r8*8-8]   ; rdx:rax = previous remainder : src[r8-1]
    div     r9                  ; rax = quotient limb, rdx = new remainder
    mov     [rcx+r8*8-8], rax   ; dst[r8-1] = quotient limb
    sub     r8, 1
    jnz     .1
    ; NOTE(review): the comment here used to read "avoid single byte return",
    ; yet the instruction is a bare `ret` -- check whether padding was intended.
    ret                     ; end of the small-size path
.2:
    ; NOTE(review): FRAME_PROC / END_PROC are macros from yasm_mac.inc (not
    ; visible here); presumably they save / restore the registers named in
    ; reg_save_list (rsi, rdi -- callee-saved in the Microsoft x64 ABI) and
    ; emit the return.  At this point rcx, rdx, r8 and r9 still hold the
    ; incoming values (r8 already sign-extended).
    FRAME_PROC ?mpn_divexact, 0, reg_save_list
    mov     rsi, rdx        ; src pointer
    mov     rdi, rcx        ; dst pointer
    bsf     rcx, r9         ; remove powers of two: cl = index of lowest set bit
    shr     r9, cl          ; r9 = divisor with its trailing zero bits removed
    mov     rax, r9
    shr     rax, 1
    and     rax, 127        ; table index = bits 1..7 of the reduced divisor
    lea     rdx, [rel __gmp_modlimb_invert_table]
    movzx   rax, byte [rdx+rax] ; starting approximation of the inverse

; If f(x) = 0, then x[n+1] = x[n] - f(x) / f'(x) is Newton's iteration for a
; root. With f(x) = 1/x - v we obtain x[n + 1] = 2 * x[n] - v * x[n] * x[n]
; as an iteration for x = 1 / v.  This provides quadratic convergence so
; that the number of bits of precision doubles on each iteration.  The
; iteration starts with 8-bit precision.
;
; Each step below is: lea = 2*x, imul = x*x, imul = v*x*x, sub = 2*x - v*x*x.
; The result alternates between edx/rdx and eax.

    lea     edx, [rax+rax]
    imul    eax, eax
    imul    eax, r9d
    sub     edx, eax            ; inv -> edx (16-bit approx)

    lea     eax, [rdx+rdx]
    imul    edx, edx
    imul    edx, r9d
    sub     eax, edx            ; inv -> eax (32-bit approx)

    lea     rdx, [rax+rax]
    imul    rax, rax
    imul    rax, r9
    sub     rdx, rax            ; inv -> rdx (64-bit approx)

    lea     rsi, [rsi+r8*8]     ; rsi = one past the last src limb
    lea     rdi, [rdi+r8*8]     ; rdi = one past the last dst limb
    neg     r8                  ; r8 = -count, counts up towards zero

    mov     r10, rdx            ; inverse multiplier -> r10
    xor     r11, r11            ; r11 = borrow mask (0 or -1), none yet
    mov     rax, [rsi+r8*8]     ; rax = src[0]
    or      rcx, rcx            ; ZF = 1 when the shift count is zero
    mov     rdx, [rsi+r8*8+8]   ; rdx = src[1] (mov leaves the flags alone)
    jz      .4                  ; if divisor is odd
    shrd    rax, rdx, cl        ; rax = low limb of (src[1]:src[0]) >> cl
    add     r8, 1               ; the even loop runs with the index biased by 1
    jmp     .6

    xalign  16
.3:                             ; rax = quotient limb stored by the last pass
	mul     r9                  ; divisor is odd; rdx = high half of q * divisor
    mov     rax, [rsi+r8*8]     ; rax = next src limb
    sub     rdx, r11            ; r11 is 0 or -1, so this adds the old borrow
    sub     rax, rdx            ; subtract what the lower limbs already used
    sbb     r11, r11            ; r11 = -1 if that subtraction borrowed, else 0
.4:                             ; first pass enters here with rax = src[0]
	imul    rax, r10            ; quotient limb = rax * inverse (low 64 bits)
    mov     [rdi+r8*8], rax
    add     r8, 1
    jnz     .3
    jmp     .7                  ; all limbs done

    xalign  16
.5:                             ; rax = quotient limb stored by the last pass
	mul     r9                  ; divisor is even; rdx = high half of q * divisor
    sub     rdx, r11            ; add the old borrow (r11 is 0 or -1)
    mov     rax, [rsi+r8*8-8]   ; rax = current src limb (index biased by 1)
    mov     r11, [rsi+r8*8]     ; r11 = the src limb above it (borrow was used)
    shrd    rax, r11, cl        ; shift the pair right by cl, keep the low limb
    sub     rax, rdx
    sbb     r11, r11            ; r11 = -1 if that subtraction borrowed, else 0
.6:                             ; first pass enters here with rax pre-shifted
	imul    rax, r10            ; quotient limb = rax * inverse (low 64 bits)
    mov     [rdi+r8*8-8],rax
    add     r8, 1
    jnz     .5

    ; Last limb of the even case: there is no limb above it to shift in, so
    ; a plain shr replaces the shrd used inside the loop.
    mul     r9
    mov     rax, [rsi-8]        ; rax = final (highest-addressed) src limb
    sub     rdx, r11
    shr     rax, cl
    sub     rax, rdx
    imul    rax, r10
    mov     [rdi-8], rax        ; store the final quotient limb
.7:                             ; common exit for the odd and even paths
	END_PROC reg_save_list

    end
Update copyright notices in Windows assembler code 2009-01-10 10:15:37 -05:00
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.`
			`;`
Update copyright notices in Windows assembler code 2009-01-10 10:15:37 -05:00			`; Copyright 2008 Brian Gladman`
			`;`
			`; This file is part of the MPIR Library.`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`;`
Update copyright notices in Windows assembler code 2009-01-10 10:15:37 -05:00			`; The MPIR Library is free software; you can redistribute it and/or`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License, or (at your option) any later version.`
			`;`
Update copyright notices in Windows assembler code 2009-01-10 10:15:37 -05:00			`; The MPIR Library is distributed in the hope that it will be useful,`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
Update copyright notices in Windows assembler code 2009-01-10 10:15:37 -05:00			`; License along with the MPIR Library; see the file COPYING.LIB. If`
			`; not, write to the Free Software Foundation, Inc., 51 Franklin Street,`
			`; Fifth Floor, Boston, MA 02110-1301, USA.`
Bring Windows up to date on new trunk 2009-02-22 16:03:08 -05:00			`;`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`; since the inverse takes a while to setup,plain division is used for small`
			`; Multiplying works out faster for size>=3 when the divisor is odd or size>=4`
			`; when the divisor is even.`
			`;`
1. Add new x64 assembler functions to the Windows build 2009-09-02 07:41:43 -04:00			`; void mpn_divexact_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)`
			`; rdi rsi rdx rcx`
1. update Core2 x64 build 2. Improve assembler interface descriptions in AMD64 headers 2009-09-02 09:35:23 -04:00			`; rcx rdx r8d r9`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
Major tidy up of Windows x86_64 assembler code 2009-03-07 10:00:35 -05:00			`%include "yasm_mac.inc"`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`%define reg_save_list rsi, rdi`

Major tidy up of Windows x86_64 assembler code 2009-03-07 10:00:35 -05:00			`BITS 64`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`extern __gmp_modlimb_invert_table`
Adjust Windows assembler files to assist planned automation of aspects of the Windows build 2009-04-03 05:07:52 -04:00
Major tidy up of Windows x86_64 assembler code 2009-03-07 10:00:35 -05:00			`LEAF_PROC mpn_divexact_1`
correct 32 to 64 bit sign extension in Windows assembler code 2009-03-09 17:27:31 -04:00			`movsxd r8, r8d`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`mov r10, rdx`
			`mov rax, r9`
			`and rax, byte 1`
			`add rax, r8`
			`cmp rax, byte 4`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`jae .2`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`xor rdx,rdx`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`.1:`
			`mov rax, [r10+r8*8-8]`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`div r9`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`mov [rcx+r8*8-8], rax`
			`sub r8, 1`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`jnz .1`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`ret ; avoid single byte return`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`.2:`
Adjust Windows assembler files to assist planned automation of aspects of the Windows build 2009-04-03 05:07:52 -04:00			`FRAME_PROC ?mpn_divexact, 0, reg_save_list`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`mov rsi, rdx ; src pointer`
			`mov rdi, rcx ; dst pointer`
			`bsf rcx, r9 ; remove powers of two`
			`shr r9, cl`
			`mov rax, r9`
			`shr rax, 1`
			`and rax, 127`
			`lea rdx, [rel __gmp_modlimb_invert_table]`
			`movzx rax, byte [rdx+rax]`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`; If f(x) = 0, then x[n+1] = x[n] - f(x) / f'(x) is Newton's iteration for a`
			`; root. With f(x) = 1/x - v we obtain x[n + 1] = 2 * x[n] - v * x[n] * x[n]`
			`; as an iteration for x = 1 / v. This provides quadratic convergence so`
			`; that the number of bits of precision doubles on each iteration. The`
			`; iteration starts with 8-bit precision.`

			`lea edx, [rax+rax]`
			`imul eax, eax`
			`imul eax, r9d`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`sub edx, eax ; inv -> rdx (16-bit approx)`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`lea eax, [rdx+rdx]`
			`imul edx, edx`
			`imul edx, r9d`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`sub eax, edx ; inv -> rdx (32-bit approx)`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`lea rdx, [rax+rax]`
			`imul rax, rax`
			`imul rax, r9`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`sub rdx, rax ; inv -> rdx (64-bit approx)`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`lea rsi, [rsi+r8*8]`
			`lea rdi, [rdi+r8*8]`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`neg r8`

Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`mov r10, rdx ; inverse multiplier -> r10`
			`xor r11, r11`
			`mov rax, [rsi+r8*8]`
			`or rcx, rcx`
			`mov rdx, [rsi+r8*8+8]`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`jz .4 ; if divisor is odd`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`shrd rax, rdx, cl`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`add r8, 1`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`jmp .6`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00
1. Update Windows version of getrusage 2. Use YASM's new alignment padding feature in the Windows assembler code 2009-03-24 10:40:39 -04:00			`xalign 16`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`.3:`
			`mul r9 ; divisor is odd`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`mov rax, [rsi+r8*8]`
			`sub rdx, r11`
			`sub rax, rdx`
			`sbb r11, r11`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`.4:`
			`imul rax, r10`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`mov [rdi+r8*8], rax`
			`add r8, 1`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`jnz .3`
			`jmp .7`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00
1. Update Windows version of getrusage 2. Use YASM's new alignment padding feature in the Windows assembler code 2009-03-24 10:40:39 -04:00			`xalign 16`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`.5:`
			`mul r9 ; divisor is even`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`sub rdx, r11`
			`mov rax, [rsi+r8*8-8]`
			`mov r11, [rsi+r8*8]`
			`shrd rax, r11, cl`
			`sub rax, rdx`
			`sbb r11, r11`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`.6:`
			`imul rax, r10`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`mov [rdi+r8*8-8],rax`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`add r8, 1`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`jnz .5`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`mul r9`
Major Windows Commit: 1. Add tune/speed build capability 2. Add JM's Core2 code 2009-02-27 09:24:25 -05:00			`mov rax, [rsi-8]`
			`sub rdx, r11`
			`shr rax, cl`
			`sub rax, rdx`
			`imul rax, r10`
			`mov [rdi-8], rax`
1. Add Jason's new assembler code to the Windows builds 2. Tidy up assembler to prepare for Windows nehalem build 2009-12-02 11:24:00 -05:00			`.7:`
			`END_PROC reg_save_list`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`end`