mpir/mpn/x86_64w/amd64/rsh_divrem_hensel_qr_1.asm


;  Copyright 2009 Jason Moxham
;
;  Windows Conversion Copyright 2008 Brian Gladman
;
;  This file is part of the MPIR Library.
;
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.
;
;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.
;
;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.
;
;  mp_limb_t  mpn_rsh_divrem_hensel_qr_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t, mp_uint)
;  rax                                      rdi     rsi        rdx        rcx       r8    <- System V AMD64 argument registers
;  rax                                      rcx     rdx        r8d         r9 [rsp+40]    <- Microsoft x64 argument locations (used by this file)

%include "..\yasm_mac.inc"

    CPU  Athlon64
    BITS 64

%define reg_save_list rsi, rdi

	; ---------------------------------------------------------------------
	; mpn_rsh_divrem_hensel_qr_1 (Microsoft x64 entry)
	;
	; Multiplies each source limb (less a running carry) by a 64-bit Newton
	; inverse of the fourth argument, and stores the resulting quotient limb
	; stream logically shifted right by the fifth argument.
	;
	; In:   rcx        = destination limb pointer
	;       rdx        = source limb pointer
	;       r8d        = limb count n (only the low 32 bits are read, sign-extended)
	;       r9         = divisor d
	;                    NOTE(review): presumably d must be odd, since the Newton
	;                    iteration below is seeded with d itself - confirm with callers
	;       [rsp+40]   = shift count s, read as a sign-extended dword
	;                    NOTE(review): the 64-s companion shift suggests 0 <= s < 64 - confirm
	; Out:  rax        = high limb of the last q*d product plus the final borrow
	;
	; Register roles after setup:
	;       rdi = destination + 8*n, rsi = source + 8*n
	;       r9  = negative limb index, runs from 1-n up to 0
	;       rcx = d, r11 = inverse of d, rdx = high limb of the previous q*d
	;       r8  = borrow parked across the flag-clobbering multiplies
	;       mm0 = s, mm1 = 64-s, mm4 = previous quotient limb >> s
	;
	; NOTE(review): FRAME_PROC / END_PROC / stack_use / xalign come from
	; yasm_mac.inc (not visible here); presumably FRAME_PROC saves the
	; registers in reg_save_list (rsi, rdi) and stack_use compensates the
	; stack-argument offset for those saves - confirm against the macro file.
	; ---------------------------------------------------------------------
	FRAME_PROC mpn_rsh_divrem_hensel_qr_1, 0, reg_save_list
    movsxd  rax, r8d                        ; rax = n (sign-extended from 32 bits)
	lea     rdi, [rcx+rax*8]                ; rdi = one past the last destination limb
	lea     rsi, [rdx+rax*8]                ; rsi = one past the last source limb
    mov     rcx, r9                         ; rcx = d (rdx is needed for mul results)
    movsxd  r8, dword[rsp+stack_use+40]     ; r8 = s, the fifth (stack) argument

	mov     r9, 1
	sub     r9, rax                         ; r9 = 1-n, so [base+r9*8-8] addresses limb 0
     
	mov     rdx, rcx                        ; seed for the inverse: x0 = d
	
	; Newton step x <- x*(2 - d*x), alternating between rdx and r11.
	; The first four steps use 32-bit multiplies (only the low 32 bits
	; of the estimate are kept); the fifth step is done in full 64 bits.
	mov     rax, rdx
	imul    edx, ecx                        ; edx = d*x0 (low 32 bits)
	mov     r11, 2
	sub     r11, rdx
	imul    r11d, eax                       ; r11d = x1 = x0*(2 - d*x0)

	mov     rax, r11
	imul    r11d, ecx
	mov     rdx, 2
	sub     rdx, r11
	imul    edx, eax                        ; edx = x2

	mov     rax, rdx
	imul    edx, ecx
	mov     r11, 2
	sub     r11, rdx
	imul    r11d, eax                       ; r11d = x3

	mov     rax, r11
	imul    r11d, ecx
	mov     rdx, 2
	sub     rdx, r11
	imul    edx, eax                        ; edx = x4

	mov     rax, rdx
	imul    rdx, rcx                        ; 64-bit multiply from here on
	mov     r11, 2
	sub     r11, rdx
	imul    r11, rax                        ; r11 = x5, the 64-bit inverse used below

	mov     rax, 64
	sub     rax, r8                         ; rax = 64 - s
	movq    mm0, r8                         ; mm0 = right-shift count s
	movq    mm1, rax                        ; mm1 = left-shift count 64-s (a count of 64 yields 0)
	mov     rax, [rsi+r9*8-8]               ; rax = source limb 0
	imul    rax, r11                        ; q0 = limb0 * inverse (low 64 bits)
	movq    mm4, rax
	movq    mm5, mm4
	psrlq   mm4, mm0                        ; mm4 = q0 >> s, completed by the next limb
	; mm5 = low s bits of q0; mm5 is not read again anywhere in this routine
	psllq   mm5, mm1
	psrlq   mm5, mm1
	mul     rcx                             ; rdx:rax = q0*d; only rdx (high limb) is used
	; r8 is rewritten by "sbb r8, r8" in the loop without this value being read
	mov     r8, 1
; cmp below clears carry (subtracting 0 never borrows), so the first sbb in
; the loop and the adc after L_skiploop start with no borrow
	cmp     r9, 0
	jz      L_skiploop                      ; n == 1: only the final store remains
	xalign  16
L_loop:
	movq    mm2, mm4                        ; mm2 = previous quotient limb >> s
	mov     rax, [rsi+r9*8]                 ; next source limb
	sbb     rax, rdx                        ; minus previous high product and borrow (CF)
	sbb     r8, r8                          ; r8 = -CF: park the new borrow, imul/mul clobber CF
	imul    rax, r11                        ; q = (limb - carry) * inverse
	movq    mm3, rax
	movq    mm4, mm3
	psllq   mm3, mm1                        ; low s bits of q moved to the top of the limb
	psrlq   mm4, mm0                        ; q >> s, kept for the next iteration
	por     mm2, mm3                        ; merge into one shifted output limb
	movq    [rdi+r9*8-8], mm2               ; store the output limb one index behind the load
	mul     rcx                             ; rdx = high limb of q*d, carried into the next limb
	add     r8, 1                           ; -1+1 wraps and sets CF, 0+1 leaves it clear: borrow restored
	inc     r9                              ; inc does not modify CF
	jnz     L_loop
L_skiploop:
	movq    [rdi+r9*8-8], mm4               ; r9 == 0 here: store the top output limb
	mov     rax, 0                          ; mov keeps CF intact (xor would clear it)
	adc     rax, rdx                        ; return value = last high product + final borrow
	emms                                    ; leave MMX state so x87 code can run afterwards
	END_PROC reg_save_list
	end
Add new assembler, C and test files to Windows builds 2009-09-30 12:21:46 -04:00
			`; Copyright 2009 Jason Moxham`
			`;`
			`; Windows Conversion Copyright 2008 Brian Gladman`
			`;`
			`; This file is part of the MPIR Library.`
			`;`
			`; The MPIR Library is free software; you can redistribute it and/or modify`
			`; it under the terms of the GNU Lesser General Public License as published`
			`; by the Free Software Foundation; either version 2.1 of the License, or (at`
			`; your option) any later version.`
			`;`
			`; The MPIR Library is distributed in the hope that it will be useful, but`
			`; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`; License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public License`
			`; along with the MPIR Library; see the file COPYING.LIB. If not, write`
			`; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`; Boston, MA 02110-1301, USA.`
			`;`
			`; mp_limb_t mpn_rsh_divrem_hensel_qr_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t, mp_uint)`
			`; rax rdi rsi rdx rcx r8`
			`; rax rcx rdx r8d r9 [rsp+40]`

			`%include "..\yasm_mac.inc"`

			`CPU Athlon64`
			`BITS 64`

			`%define reg_save_list rsi, rdi`

			`FRAME_PROC mpn_rsh_divrem_hensel_qr_1, 0, reg_save_list`
			`movsxd rax, r8d`
			`lea rdi, [rcx+rax*8]`
			`lea rsi, [rdx+rax*8]`
			`mov rcx, r9`
			`movsxd r8, dword[rsp+stack_use+40]`

			`mov r9, 1`
			`sub r9, rax`

			`mov rdx, rcx`

			`mov rax, rdx`
			`imul edx, ecx`
			`mov r11, 2`
			`sub r11, rdx`
			`imul r11d, eax`

			`mov rax, r11`
			`imul r11d, ecx`
			`mov rdx, 2`
			`sub rdx, r11`
			`imul edx, eax`

			`mov rax, rdx`
			`imul edx, ecx`
			`mov r11, 2`
			`sub r11, rdx`
			`imul r11d, eax`

			`mov rax, r11`
			`imul r11d, ecx`
			`mov rdx, 2`
			`sub rdx, r11`
			`imul edx, eax`

			`mov rax, rdx`
			`imul rdx, rcx`
			`mov r11, 2`
			`sub r11, rdx`
			`imul r11, rax`

			`mov rax, 64`
			`sub rax, r8`
			`movq mm0, r8`
			`movq mm1, rax`
			`mov rax, [rsi+r9*8-8]`
			`imul rax, r11`
			`movq mm4, rax`
			`movq mm5, mm4`
			`psrlq mm4, mm0`
			`psllq mm5, mm1`
			`psrlq mm5, mm1`
			`mul rcx`
			`mov r8, 1`
			`; cmp below clears carry`
			`cmp r9, 0`
			`jz L_skiploop`
			`xalign 16`
			`L_loop:`
			`movq mm2, mm4`
			`mov rax, [rsi+r9*8]`
			`sbb rax, rdx`
			`sbb r8, r8`
			`imul rax, r11`
			`movq mm3, rax`
			`movq mm4, mm3`
			`psllq mm3, mm1`
			`psrlq mm4, mm0`
			`por mm2, mm3`
			`movq [rdi+r9*8-8], mm2`
			`mul rcx`
			`add r8, 1`
			`inc r9`
			`jnz L_loop`
			`L_skiploop:`
			`movq [rdi+r9*8-8], mm4`
			`mov rax, 0`
			`adc rax, rdx`
			`emms`
			`END_PROC reg_save_list`
			`end`