mpir/mpn/x86_64w/core2/divebyff.asm

; core2 mpn_divexact_byff
; Copyright 2009 Jason Moxham
; Windows Conversion Copyright 2008 Brian Gladman
;
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
;
; (rcx, r8) = (rdx, r8) / 0xFFFFFFFFFFFFFFFF
; rax = "remainder"
; where (rdx, r8) = (rcx, r8)*(B - 1) - rax*B^r8 and 0 <= rax < B - 1,
; with B - 1 = 0xFFFFFFFFFFFFFFFF (i.e. B = 2^64)
;
; this is good but suffers from an alignment slowdown; we don't seem to have
; much freedom to re-arrange the instructions to avoid it. I suppose we could
; detect the alignment at the start and have different routines for different
; alignments
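;
; For reference, a minimal C sketch of the borrow chain this routine implements
; (the name divexact_byff_ref and the dst/src/n parameters are illustrative only,
; not part of MPIR's API); with B = 2^64 the divisor is B - 1:
;
;   #include <stdint.h>
;   #include <stddef.h>
;
;   uint64_t divexact_byff_ref(uint64_t *dst, const uint64_t *src, size_t n)
;   {
;       uint64_t acc = 0;       /* running value of rax              */
;       unsigned borrow = 0;    /* carry flag of the sbb chain       */
;       for (size_t i = 0; i < n; i++) {
;           uint64_t t = acc - src[i] - borrow;
;           borrow = (acc < src[i]) || (acc - src[i] < borrow);
;           acc = t;
;           dst[i] = t;         /* mirrors "sbb rax, [rdx]" + "mov [rcx], rax" */
;       }
;       return acc - borrow;    /* mirrors the final "sbb rax, 0"    */
;   }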
%include "..\yasm_mac.inc"
CPU Core2
BITS 64
    LEAF_PROC mpn_divexact_byff
    movsxd  r8, r8d                 ; r8 = limb count n (sign-extended from 32 bits)
    xor     eax, eax                ; rax = 0, the running borrow accumulator
    mov     r9, r8
    and     r9, 3                   ; r9 = n mod 4, limbs handled outside the main loop
    shr     r8, 2                   ; r8 = n / 4, iterations of the unrolled loop
    cmp     r8, 0
; carry flag is clear here (a compare against zero cannot borrow)
    jnz     loop1
; fewer than four limbs: handle the 1 to 3 limbs directly
    sbb     rax, [rdx]              ; rax -= src[0] + borrow; dec below leaves CF intact
    mov     [rcx], rax
    dec     r9
    jz      end1
    sbb     rax, [rdx+8]
    mov     [rcx+8], rax
    dec     r9
    jz      end1
    sbb     rax, [rdx+16]
    mov     [rcx+16], rax
    dec     r9
end1:
    sbb     rax, 0                  ; fold the final borrow into the return value
    ret
    xalign  16
; main loop: four limbs per iteration; dec and lea do not modify the carry
; flag, so the sbb borrow chain runs uninterrupted across iterations
loop1:
    sbb     rax, [rdx]
    mov     [rcx], rax
    sbb     rax, [rdx+8]
    mov     [rcx+8], rax
    sbb     rax, [rdx+16]
    mov     [rcx+16], rax
    sbb     rax, [rdx+24]
    mov     [rcx+24], rax
    lea     rdx, [rdx+32]           ; advance src pointer without touching flags
    dec     r8
    lea     rcx, [rcx+32]           ; advance dst pointer without touching flags
    jnz     loop1
    inc     r9                      ; test r9 for zero with inc/dec so CF is preserved
    dec     r9
    jz      end
    sbb     rax, [rdx]              ; up to three tail limbs left over from the main loop
    mov     [rcx], rax
    dec     r9
    jz      end
    sbb     rax, [rdx+8]
    mov     [rcx+8], rax
    dec     r9
    jz      end
    sbb     rax, [rdx+16]
    mov     [rcx+16], rax
    dec     r9
end:
    sbb     rax, 0                  ; fold the final borrow into the return value
    ret