;  mpir/mpn/x86_64/divrem_2.as


;  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

;  Copyright 2007, 2008 Free Software Foundation, Inc.

;  Copyright Brian Gladman 2010 (Conversion to yasm format)

;  This file is part of the GNU MP Library.

;  The GNU MP Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 3 of the License, or (at
;  your option) any later version.

;  The GNU MP Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

;		norm	frac
; k8		20	20
; netburst	73	73
; core2	37	37
; nehalem	33	33

; INPUT PARAMETERS
; qp 	rdi
; fn 	rsi
; np	rdx
; nn 	rcx
; dp	r8
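; dinv r9	(not read on entry: r9 is overwritten with the top dividend
;		 limb before its first use, and the reciprocal is computed
;		 inside the routine)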

%include 'yasm_mac.inc'
    BITS 64
	TEXT

%define reg_save_list   rbx, rbp, rsi, rdi, r12, r13, r14, r15

    align  16
;-----------------------------------------------------------------------
; mpn_divrem_2
;
; Divides the nn-limb number at np, extended by fn zero limbs, by the
; two-limb divisor at dp.
;
; In (registers as listed under INPUT PARAMETERS at the top of the file):
;   rdi = qp   quotient limbs are stored at qp[nn+fn-3] down to qp[0]
;   rsi = fn   quotient positions with index below fn consume a zero
;              limb instead of a dividend limb
;   rdx = np   dividend; np[nn-1] and np[nn-2] are read first, the lower
;              limbs are read one per loop iteration
;   rcx = nn   number of dividend limbs
;   r8  = dp   divisor: dp[0] = low limb (d0), dp[1] = high limb (d1)
; Out:
;   rax = 1 if the top two dividend limbs were >= the divisor (one
;         subtraction is done up front), otherwise 0
;   the two remainder limbs are written through r12 at exit (np[0] and
;   np[1] when fn >= 0, see L_6)
; Saves and restores rbx, rbp, r12, r13, r14, r15.  All other registers
; used below are left modified.
;
; The `div` below only avoids a divide fault when ~d1 < d1, i.e. when d1
; has its top bit set -- the "normalized" requirement in the file title.
;
; Two loop bodies are provided; the one under %ifdef NEW is assembled
; only when NEW is defined, otherwise the %else body is used.
;-----------------------------------------------------------------------
GLOBAL_FUNC mpn_divrem_2

	push    r15
	lea     rax, [rdx+rcx*8]        ; rax = np + nn limbs (one past the top limb)
	push    r14
	push    r13
	mov     r13, rsi                ; r13 = fn
	push    r12
	lea     r12, [rax-24]           ; r12 -> np[nn-3], next dividend limb to consume
	push    rbp
	mov     rbp, rdi                ; rbp = qp
	push    rbx
	mov     r11, [r8+8]             ; r11 = d1 = dp[1]
	mov     r9, [rax-8]             ; r9  = n1 = np[nn-1]
	mov     r8, [r8]                ; r8  = d0 = dp[0]
	mov     r10, [rax-16]           ; r10 = n0 = np[nn-2]
	xor     r15d, r15d              ; r15 = return value, stays 0 unless L_23 runs
	cmp     r11, r9
	ja      L_2                     ; d1 > n1: n1:n0 is already below d1:d0
	setb    dl                      ; dl = (d1 < n1)
	cmp     r8, r10
	setbe   al                      ; al = (d0 <= n0)
	or      dl, al
	jne     L_23                    ; n1:n0 >= d1:d0: subtract d once, return 1
L_2:
	lea     rbx, [rcx+r13-3]        ; rbx = nn + fn - 3 = index of the top stored quotient limb
	test    rbx, rbx
	js      L_6                     ; negative: no quotient limbs to produce

	; Build the reciprocal used by the loop in rdi.  First divide the
	; 128-bit value (~d1 : all-ones) by d1 ...
	mov     rdx, r11
	mov     rax, -1
	not     rdx                     ; rdx:rax = ~d1 : 0xFFFFFFFFFFFFFFFF
	div     r11
	mov     rdx, r11
	mov     rdi, rax                ; rdi = v, candidate reciprocal
	imul    rdx, rax                ; rdx = low limb of d1 * v
	mov     r14, rdx
	mul     r8                      ; rdx:rax = v * d0 (rax still holds v here)
	mov     rcx, rdx                ; rcx = high limb of v * d0
	; ... then lower v while the two-limb accumulator rdx:r14
	; (high limb starts at -1; low limb = d1*v + d0 + high(v*d0))
	; is non-negative, so that v also accounts for d0.
	mov     rdx, -1
	add     r14, r8
	adc     rdx, 0
	add     r14, rcx
	adc     rdx, 0
	js      L_8                     ; already negative: v needs no adjustment
L_18:
	dec     rdi                     ; v -= 1 (dec leaves CF alone; sub below sets it)
	sub     r14, r11                ; accumulator -= d1
	sbb     rdx, 0
	jns     L_18                    ; repeat until the accumulator goes negative
L_8:

%ifdef NEW
	; Register roles in this loop:
	;   rbx:r14 = running remainder (high:low)   r9  = reciprocal v
	;   rsi     = -d1                            rdi = quotient limb
	;   rcx     = quotient index, counts down    rbp -> qp[rcx]
	;   r10     = low limb of the quotient estimate
	lea     rbp, [rbp+rbx*8]        ; rbp -> qp[nn+fn-3]
	mov     rcx, rbx
	mov     rbx, r9                 ; rbx = n1
	mov     r9, rdi                 ; r9  = v
	mov     r14, r10                ; r14 = n0
	mov     rsi, r11
	neg     rsi                     ; rsi = -d1
	align  16
L_loop:
	mov     rax, r9
	mul     rbx                     ; rdx:rax = v * r1
	add     rax, r14
	mov     r10, rax                ; r10 = q0, low limb of the estimate
	adc     rdx, rbx                ; rdx = q1 = high(v*r1) + r1 + carry
	mov     rdi, rdx                ; rdi = q1
	imul    rdx, rsi                ; rdx = -(d1 * q1), low limb
	mov     rax, r8
	lea     rbx, [rdx+r14]          ; rbx = r0 - d1*q1 (lea keeps flags untouched)
	mul     rdi                     ; rdx:rax = d0 * q1
	xor     r14d, r14d              ; default next limb: 0
	cmp     r13, rcx
	jg      L_19                    ; fn > index (signed): keep the zero limb
	mov     r14, [r12]              ; otherwise take the next dividend limb
	sub     r12, 8
L_19:
	sub	 r14, r8
	sbb     rbx, r11                ; rbx:r14 -= d1:d0
	sub     r14, rax
	sbb     rbx, rdx                ; rbx:r14 -= q1 * d0
	inc     rdi                     ; candidate quotient = q1 + 1
	xor     edx, edx
	cmp     rbx, r10                ; CF = (high remainder < q0)
	mov     rax, r8
	adc     rdx, -1                 ; rdx = 0 if CF was set, else all ones (mask)
	add     rdi, rdx                ; quotient -= 1 when the mask is set
	and     rax, rdx                ; d0 & mask
	and     rdx, r11                ; d1 & mask
	add     r14, rax
	adc     rbx, rdx                ; add d back when the mask is set
	cmp     rbx, r11
	jae     L_fix                   ; high remainder >= d1: may need one more step
L_bck:
	mov	 [rbp], rdi                 ; store this quotient limb
	sub     rbp, 8
	dec     rcx
	jns     L_loop

	mov     r10, r14                ; hand the remainder to L_6 in r9:r10
	mov     r9, rbx

%else

	; Register roles in this loop:
	;   rax:rsi = running remainder (high:low) at the top of each pass
	;   rdi     = reciprocal v        r10 = quotient limb
	;   rcx     = quotient index, counts down    rbp -> qp[rcx]
	;   rbx     = low limb of the quotient estimate, then scratch
	lea     rbp, [rbp+rbx*8]        ; rbp -> qp[nn+fn-3]
	mov     rcx, rbx
	mov     rax, r9                 ; rax = n1
	mov     rsi, r10                ; rsi = n0
	align  16
L_loop:
	mov     r14, rax                ; r14 = r1 (saved, mul overwrites rax)
	mul     rdi                     ; rdx:rax = v * r1
	mov     r9, r11
	add     rax, rsi
	mov     rbx, rax                ; rbx = q0, low limb of the estimate
	adc     rdx, r14                ; rdx = q1 = high(v*r1) + r1 + carry
	lea     r10, [rdx+1]            ; r10 = candidate quotient q1 + 1
	mov     rax, rdx
	imul    r9, rdx                 ; r9 = d1 * q1, low limb
	sub     rsi, r9                 ; rsi = r0 - d1*q1
	xor     r9d, r9d                ; default next limb: 0
	mul     r8                      ; rdx:rax = q1 * d0
	cmp     r13, rcx
	jg      L_13                    ; fn > index (signed): keep the zero limb
	mov     r9, [r12]               ; otherwise take the next dividend limb
	sub     r12, 8
L_13:
	sub	 r9, r8
	sbb     rsi, r11                ; rsi:r9 -= d1:d0
	sub     r9, rax
	sbb     rsi, rdx                ; rsi:r9 -= q1 * d0
	cmp     rsi, rbx                ; CF = (high remainder < q0)
	sbb     rax, rax                ; rax = all ones if CF, else 0
	not     rax                     ; mask: all ones when high remainder >= q0
	add     r10, rax                ; quotient -= 1 when the mask is set
	mov     rbx, r8
	and     rbx, rax                ; d0 & mask
	and     rax, r11                ; d1 & mask
	add     r9, rbx
	adc     rax, rsi                ; rax:r9 = remainder, d added back when masked
	cmp     r11, rax
	jbe     L_fix                   ; d1 <= high remainder: may need one more step
L_bck:
	mov	 [rbp], r10                 ; store this quotient limb
	sub     rbp, 8
	mov     rsi, r9                 ; low remainder limb for the next pass
	dec     rcx
	jns     L_loop

	mov     r10, rsi                ; hand the remainder to L_6 in r9:r10
	mov     r9, rax

%endif

L_6:
	; r12 started at np[nn-3] and dropped one limb per dividend limb
	; consumed; with fn >= 0 that leaves r12 = np - 8 here (also when the
	; loop is skipped for nn = 2), so these stores hit np[0] and np[1].
	mov     [r12+8], r10            ; low remainder limb
	mov     [r12+16], r9            ; high remainder limb
	pop     rbx
	pop     rbp
	pop     r12
	pop     r13
	pop     r14
	mov     rax, r15                ; return the most significant quotient limb
	pop     r15
	ret

L_23:
	; Top two dividend limbs were >= the divisor: take the divisor off
	; once and report it through the return value.
	inc     r15d
	sub     r10, r8
	sbb     r9, r11
	jmp     L_2

%ifdef NEW

L_fix:
	; Entered with flags from `cmp rbx, r11` (rbx >= d1).
	seta	 dl                         ; dl = (high remainder > d1)
	cmp     r14, r8
	setae   al                      ; al = (low remainder >= d0)
	; Plain `or`: this file is in yasm/NASM syntax, where the operand size
	; comes from the registers.  The AT&T-style `orb` that stood here is
	; not an instruction name in this syntax.
	or      al, dl
	je      L_bck                   ; neither condition holds: nothing to fix
	inc     rdi                     ; quotient += 1
	sub     r14, r8
	sbb     rbx, r11                ; remainder -= d1:d0
	jmp     L_bck

%else
L_fix:
	; Entered with flags from `cmp r11, rax` (d1 <= high remainder).
	jb	 L_88                       ; d1 < high remainder: always adjust
	cmp     r9, r8
	jb      L_bck                   ; high limbs equal and low < d0: no adjust
L_88:
	inc	 r10                        ; quotient += 1
	sub     r9, r8
	sbb     rax, r11                ; remainder -= d1:d0
	jmp     L_bck

%endif
Convert divrem_2.asm to yasm format. 2010-02-07 09:02:39 -05:00
			`; x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.`

			`; Copyright 2007, 2008 Free Software Foundation, Inc.`

Added a clarification to the copyright. 2010-02-07 09:07:42 -05:00			`; Copyright Brian Gladman 2010 (Conversion to yasm format)`
Convert divrem_2.asm to yasm format. 2010-02-07 09:02:39 -05:00
			`; This file is part of the GNU MP Library.`

			`; The GNU MP Library is free software; you can redistribute it and/or modify`
			`; it under the terms of the GNU Lesser General Public License as published`
			`; by the Free Software Foundation; either version 3 of the License, or (at`
			`; your option) any later version.`

			`; The GNU MP Library is distributed in the hope that it will be useful, but`
			`; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`; License for more details.`

			`; You should have received a copy of the GNU Lesser General Public License`

			`; norm frac`
			`; k8 20 20`
			`; netburst 73 73`
			`; core2 37 37`
			`; nehalem 33 33`

			`; INPUT PARAMETERS`
			`; qp rdi`
			`; fn rsi`
			`; np rdx`
			`; nn rcx`
			`; dp r8`
			`; dinv r9`

			`%include 'yasm_mac.inc'`
			`BITS 64`
			`TEXT`

			`%define reg_save_list rbx, rbp, rsi, rdi, r12, r13, r14, r15`

			`align 16`
			`GLOBAL_FUNC mpn_divrem_2`

			`push r15`
			`lea rax, [rdx+rcx*8]`
			`push r14`
			`push r13`
			`mov r13, rsi`
			`push r12`
			`lea r12, [rax-24]`
			`push rbp`
			`mov rbp, rdi`
			`push rbx`
			`mov r11, [r8+8]`
			`mov r9, [rax-8]`
			`mov r8, [r8]`
			`mov r10, [rax-16]`
			`xor r15d, r15d`
			`cmp r11, r9`
			`ja L_2`
			`setb dl`
			`cmp r8, r10`
			`setbe al`
			`or dl, al`
			`jne L_23`
			`L_2:`
			`lea rbx, [rcx+r13-3]`
			`test rbx, rbx`
			`js L_6`
			`mov rdx, r11`
			`mov rax, -1`
			`not rdx`
			`div r11`
			`mov rdx, r11`
			`mov rdi, rax`
			`imul rdx, rax`
			`mov r14, rdx`
			`mul r8`
			`mov rcx, rdx`
			`mov rdx, -1`
			`add r14, r8`
			`adc rdx, 0`
			`add r14, rcx`
			`adc rdx, 0`
			`js L_8`
			`L_18:`
			`dec rdi`
			`sub r14, r11`
			`sbb rdx, 0`
			`jns L_18`
			`L_8:`

			`%ifdef NEW`
			`lea rbp, [rbp+rbx*8]`
			`mov rcx, rbx`
			`mov rbx, r9`
			`mov r9, rdi`
			`mov r14, r10`
			`mov rsi, r11`
			`neg rsi`
			`align 16`
			`L_loop:`
			`mov rax, r9`
			`mul rbx`
			`add rax, r14`
			`mov r10, rax`
			`adc rdx, rbx`
			`mov rdi, rdx`
			`imul rdx, rsi`
			`mov rax, r8`
			`lea rbx, [rdx+r14]`
			`mul rdi`
			`xor r14d, r14d`
			`cmp r13, rcx`
			`jg L_19`
			`mov r14, [r12]`
			`sub r12, 8`
			`L_19:`
			`sub r14, r8`
			`sbb rbx, r11`
			`sub r14, rax`
			`sbb rbx, rdx`
			`inc rdi`
			`xor edx, edx`
			`cmp rbx, r10`
			`mov rax, r8`
			`adc rdx, -1`
			`add rdi, rdx`
			`and rax, rdx`
			`and rdx, r11`
			`add r14, rax`
			`adc rbx, rdx`
			`cmp rbx, r11`
			`jae L_fix`
			`L_bck:`
			`mov [rbp], rdi`
			`sub rbp, 8`
			`dec rcx`
			`jns L_loop`

			`mov r10, r14`
			`mov r9, rbx`

			`%else`

			`lea rbp, [rbp+rbx*8]`
			`mov rcx, rbx`
			`mov rax, r9`
			`mov rsi, r10`
			`align 16`
			`L_loop:`
			`mov r14, rax`
			`mul rdi`
			`mov r9, r11`
			`add rax, rsi`
			`mov rbx, rax`
			`adc rdx, r14`
			`lea r10, [rdx+1]`
			`mov rax, rdx`
			`imul r9, rdx`
			`sub rsi, r9`
			`xor r9d, r9d`
			`mul r8`
			`cmp r13, rcx`
			`jg L_13`
			`mov r9, [r12]`
			`sub r12, 8`
			`L_13:`
			`sub r9, r8`
			`sbb rsi, r11`
			`sub r9, rax`
			`sbb rsi, rdx`
			`cmp rsi, rbx`
			`sbb rax, rax`
			`not rax`
			`add r10, rax`
			`mov rbx, r8`
			`and rbx, rax`
			`and rax, r11`
			`add r9, rbx`
			`adc rax, rsi`
			`cmp r11, rax`
			`jbe L_fix`
			`L_bck:`
			`mov [rbp], r10`
			`sub rbp, 8`
			`mov rsi, r9`
			`dec rcx`
			`jns L_loop`

			`mov r10, rsi`
			`mov r9, rax`

			`%endif`

			`L_6:`
			`mov [r12+8], r10`
			`mov [r12+16], r9`
			`pop rbx`
			`pop rbp`
			`pop r12`
			`pop r13`
			`pop r14`
			`mov rax, r15`
			`pop r15`
			`ret`

			`L_23:`
			`inc r15d`
			`sub r10, r8`
			`sbb r9, r11`
			`jmp L_2`

			`%ifdef NEW`

			`L_fix:`
			`seta dl`
			`cmp r14, r8`
			`setae al`
			`orb al, dl`
			`je L_bck`
			`inc rdi`
			`sub r14, r8`
			`sbb rbx, r11`
			`jmp L_bck`

			`%else`
			`L_fix:`
			`jb L_88`
			`cmp r9, r8`
			`jb L_bck`
			`L_88:`
			`inc r10`
			`sub r9, r8`
			`sbb rax, r11`
			`jmp L_bck`

			`%endif`