Conversion of Jason Moxham's redc_basecase to yasm format.

wbhart 2009-03-04 19:38:45 +00:00
parent 5ceb500330
commit 661b1673c9
2 changed files with 420 additions and 424 deletions

@@ -0,0 +1,420 @@
; AMD64 mpn_redc_basecase
; Copyright 2009 Jason Moxham
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; Version 1.0.4
; (rdi,rcx)=(rsi,rcx)+(rdx,rcx) with the carry flag set for the carry
; this is the usual mpn_add_n with the final dec rax; adc rax, rax; ret removed,
; and a jump where the standalone routine would have two rets
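; as an illustrative sketch (not part of the build), the macro amounts to
;   cy = 0
;   for i = 0 .. rcx-1 : rdi[i] = rsi[i] + rdx[i] + cy, cy = carry out
; with the final carry left in the CPU carry flag rather than returned in rax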
%include '../yasm_mac.inc'
BITS 64
%macro mpn_add 0
mov rax, rcx
and rax, 3
shr rcx, 2
cmp rcx, 0
; carry flag is clear here
jnz %%1
mov r11, [rsi]
add r11, [rdx]
mov [rdi], r11
dec rax
jz %%2
mov r11, [rsi+8]
adc r11, [rdx+8]
mov [rdi+8], r11
dec rax
jz %%2
mov r11, [rsi+16]
adc r11, [rdx+16]
mov [rdi+16], r11
jmp %%2
align 16
%%1:
mov r11, [rsi]
mov r8, [rsi+8]
lea rsi, [rsi+32]
adc r11, [rdx]
adc r8, [rdx+8]
lea rdx, [rdx+32]
mov [rdi], r11
mov [rdi+8], r8
lea rdi, [rdi+32]
mov r9, [rsi-16]
mov r10, [rsi-8]
adc r9, [rdx-16]
adc r10, [rdx-8]
mov [rdi-16], r9
dec rcx
mov [rdi-8], r10
jnz %%1
inc rax
dec rax
jz %%2
mov r11, [rsi]
adc r11, [rdx]
mov [rdi], r11
dec rax
jz %%2
mov r11, [rsi+8]
adc r11, [rdx+8]
mov [rdi+8], r11
dec rax
jz %%2
mov r11, [rsi+16]
adc r11, [rdx+16]
mov [rdi+16], r11
%%2:
%endmacro
; (rbx, rbp) = (rsi, rbp) - (rdx, rbp)
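; same structure as mpn_add above but with sub/sbb, so the final borrow is left
; in the carry flag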
%macro mpn_sub 0
mov rax, rbp
and rax, 3
shr rbp, 2
cmp rbp, 0
; carry flag is clear here
jnz %%1
mov r11, [rsi]
sub r11, [rdx]
mov [rbx], r11
dec rax
jz %%2
mov r11, [rsi+8]
sbb r11, [rdx+8]
mov [rbx+8], r11
dec rax
jz %%2
mov r11, [rsi+16]
sbb r11, [rdx+16]
mov [rbx+16], r11
jmp %%2
align 16
%%1:
mov r11, [rsi]
mov r8, [rsi+8]
lea rsi, [rsi+32]
sbb r11, [rdx]
sbb r8, [rdx+8]
lea rdx, [rdx+32]
mov [rbx], r11
mov [rbx+8], r8
lea rbx, [rbx+32]
mov r9, [rsi-16]
mov r10, [rsi-8]
sbb r9, [rdx-16]
sbb r10, [rdx-8]
mov [rbx-16], r9
dec rbp
mov [rbx-8], r10
jnz %%1
inc rax
dec rax
jz %%2
mov r11, [rsi]
sbb r11, [rdx]
mov [rbx], r11
dec rax
jz %%2
mov r11, [rsi+8]
sbb r11, [rdx+8]
mov [rbx+8], r11
dec rax
jz %%2
mov r11, [rsi+16]
sbb r11, [rdx+16]
mov [rbx+16], r11
%%2:
%endmacro
; changes from standard addmul
; change r8 to r12, rcx to r13 and rdi to r8
; remove the ret and write the last limb, but to the beginning
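; roughly (illustrative): an mpn_addmul_1 style inner loop, r8[i] += rsi[i]*r13
; with the carry chain kept in registers, unrolled four limbs per pass; the
; db 0x26 bytes emit ES segment override prefixes, harmless in 64-bit mode and
; apparently present only as instruction padding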
%macro addmulloop 1
align 16
%%1:
mov r10, 0
mul r13
add [r8+r11*8], r12
adc r9, rax
db 0x26
adc r10, rdx
mov rax, [rsi+r11*8+16]
mul r13
add [r8+r11*8+8], r9
adc r10, rax
mov ebx, 0
adc rbx, rdx
mov rax, [rsi+r11*8+24]
mov r12, 0
mov r9, 0
mul r13
add [r8+r11*8+16], r10
db 0x26
adc rbx, rax
db 0x26
adc r12, rdx
mov rax, [rsi+r11*8+32]
mul r13
add [r8+r11*8+24], rbx
db 0x26
adc r12, rax
db 0x26
adc r9, rdx
add r11, 4
mov rax, [rsi+r11*8+8]
jnc %%1
%endmacro
%macro addmulpropro0 0
imul r13, rcx
lea r8, [r8-8]
%endmacro
%macro addmulpro0 0
mov r11, r14
lea r8, [r8+8]
mov rax, [rsi+r14*8]
mul r13
mov r12, rax
mov rax, [rsi+r14*8+8]
mov r9, rdx
cmp r14, 0
%endmacro
%macro addmulnext0 0
mov r10d, 0
mul r13
add [r8+r11*8], r12
adc r9, rax
adc r10, rdx
mov rax, [rsi+r11*8+16]
mul r13
add [r8+r11*8+8], r9
adc r10, rax
mov ebx, 0
adc rbx, rdx
mov rax, [rsi+r11*8+24]
mov r12d, 0
mov r9d, 0
mul r13
add [r8+r11*8+16], r10
adc rbx, rax
adc r12, rdx
mov rax, [rsi+r11*8+32]
mul r13
add [r8+r11*8+24], rbx
mov r13, [r8+r14*8+8]
adc r12, rax
adc r9, rdx
imul r13, rcx
add [r8+r11*8+32], r12
adc r9, 0
dec r15
mov [r8+r14*8], r9
%endmacro
%macro addmulpropro1 0
%endmacro
%macro addmulpro1 0
imul r13, rcx
mov rax, [rsi+r14*8]
mov r11, r14
mul r13
mov r12, rax
mov rax, [rsi+r14*8+8]
mov r9, rdx
cmp r14, 0
%endmacro
%macro addmulnext1 0
mov r10d, 0
mul r13
add [r8+r11*8], r12
adc r9, rax
adc r10, rdx
mov rax, [rsi+r11*8+16]
mul r13
add [r8+r11*8+8], r9
adc r10, rax
mov ebx, 0
adc rbx, rdx
mov rax, [rsi+r11*8+24]
mov r12d, 0
mul r13
add [r8+r11*8+16], r10
adc rbx, rax
adc r12, rdx
add [r8+r11*8+24], rbx
mov r13, [r8+r14*8+8]
adc r12, 0
dec r15
mov [r8+r14*8], r12
lea r8, [r8+8]
%endmacro
%macro addmulpropro2 0
%endmacro
%macro addmulpro2 0
imul r13, rcx
mov rax, [rsi+r14*8]
mov r11, r14
mul r13
mov r12, rax
mov rax, [rsi+r14*8+8]
mov r9, rdx
cmp r14, 0
%endmacro
%macro addmulnext2 0
mul r13
add [r8+r11*8], r12
adc r9, rax
mov r10d, 0
adc r10, rdx
mov rax, [rsi+r11*8+16]
mul r13
add [r8+r11*8+8], r9
adc r10, rax
mov ebx, 0
adc rbx, rdx
mov r13, [r8+r14*8+8]
add [r8+r11*8+16], r10
adc rbx, 0
mov [r8+r14*8], rbx
dec r15
lea r8, [r8+8]
%endmacro
%macro addmulpropro3 0
%endmacro
%macro addmulpro3 0
imul r13, rcx
mov rax, [rsi+r14*8]
mov r11, r14
mul r13
mov r12, rax
mov rax, [rsi+r14*8+8]
mov r9, rdx
cmp r14, 0
%endmacro
%macro addmulnext3 0
mul r13
add [r8+r11*8], r12
adc r9, rax
mov r10d, 0
adc r10, rdx
add [r8+r11*8+8], r9
adc r10, 0
mov r13, [r8+r14*8+8]
mov [r8+r14*8], r10
lea r8, [r8+8]
dec r15
%endmacro
; change r8 to r12
; write top limb rax straight to memory, don't return it (NOTE: we WRITE, NOT ADD)
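; structure (illustrative): the addmulpro* macro sets up one outer pass,
; addmulloop runs the unrolled inner loop, and addmulnext* finishes the pass
; and picks up the next multiplier limb in r13; r15 counts the remaining
; outer passes before falling through to end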
%macro mpn_addmul_1_int 1
addmulpropro%1
align 16
%%1:
addmulpro%1
jge %%2
addmulloop %1
%%2:
addmulnext%1
jnz %%1
jmp end
%endmacro
GLOBAL_FUNC mpn_redc_basecase
cmp rdx, 1
je one
push r13
push r14
push rbx
push r12
push r15
push rbp
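; (illustrative) r14 = 5 - n acts as a negative style index so the unrolled
; loops can count upwards towards zero; r8 and rsi are advanced accordingly,
; rbp and r15 keep the size, and the branches below select one of the four
; unrolling cases from the size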
mov r14, 5
sub r14, rdx
; store copies
push rsi
push r8
lea r8, [r8+rdx*8-40]
lea rsi, [rsi+rdx*8-40]
mov rbp, rdx
mov r15, rdx
mov rax, r14
and rax, 3
mov r13, [r8+r14*8]
je case0
jp case3
cmp rax, 1
je case1
case2:
mpn_addmul_1_int 2
align 16
case0:
mpn_addmul_1_int 0
align 16
case1:
mpn_addmul_1_int 1
align 16
case3:
mpn_addmul_1_int 3
align 16
end:
mov rcx, rbp
pop rdx
lea rsi, [rdx+rbp*8]
mov rbx, rdi
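; (illustrative) add the carry limbs accumulated in the low half of the operand
; to its high half, writing the result to rdi; if that addition carries out,
; subtract the modulus once below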
mpn_add
; mpn_add(rdi,rsi,rdx,rcx)
pop rdx
jnc skip
mov rsi, rbx
mpn_sub
; mpn_sub_n(rbx,rsi,rdx,rbp); we can certainly improve this subtraction
skip:
pop rbp
pop r15
pop r12
pop rbx
pop r14
pop r13
ret
align 16
one:
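; single limb case (illustrative reading, taking [r8] as the two-limb operand x
; and [rsi] as the modulus m): q = x[0]*inverse mod 2^64, then the result is
; x[1] + high(q*m) plus the carry from the low limb, with one subtraction of m
; if that addition carries out; cmovnc zeroes r11 (rax is zero at that point)
; so the sub becomes a no-op when there is no carry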
mov r9, [r8]
mov r11, [rsi]
imul rcx, r9
mov rax, rcx
mul r11
add rax, r9
; rax is zero here
adc rdx, [r8+8]
cmovnc r11, rax
sub rdx, r11
mov [rdi], rdx
ret

@@ -1,424 +0,0 @@
dnl AMD64 mpn_redc_basecase
dnl Copyright 2009 Jason Moxham
dnl This file is part of the MPIR Library.
dnl The MPIR Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The MPIR Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the MPIR Library; see the file COPYING.LIB. If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
include(`../config.m4')
C Version 1.0.4
C (rdi,rcx)=(rsi,rcx)+(rdx,rcx) with the carry flag set for the carry
C this is the usual mpn_add_n with the final dec rax;adc rax,rax;ret removed
C and a jump where we have two rets
define(`MPN_ADD',`
mov %rcx,%rax
and `$'3,%rax
shr `$'2,%rcx
cmp `$'0,%rcx
C carry flag is clear here
jnz addloop
mov (%rsi),%r11
add (%rdx),%r11
mov %r11,(%rdi)
dec %rax
jz addend
mov 8(%rsi),%r11
adc 8(%rdx),%r11
mov %r11,8(%rdi)
dec %rax
jz addend
mov 16(%rsi),%r11
adc 16(%rdx),%r11
mov %r11,16(%rdi)
jmp addend
ALIGN(16)
addloop:
mov (%rsi),%r11
mov 8(%rsi),%r8
lea 32(%rsi),%rsi
adc (%rdx),%r11
adc 8(%rdx),%r8
lea 32(%rdx),%rdx
mov %r11,(%rdi)
mov %r8,8(%rdi)
lea 32(%rdi),%rdi
mov -16(%rsi),%r9
mov -8(%rsi),%r10
adc -16(%rdx),%r9
adc -8(%rdx),%r10
mov %r9,-16(%rdi)
dec %rcx
mov %r10,-8(%rdi)
jnz addloop
inc %rax
dec %rax
jz addend
mov (%rsi),%r11
adc (%rdx),%r11
mov %r11,(%rdi)
dec %rax
jz addend
mov 8(%rsi),%r11
adc 8(%rdx),%r11
mov %r11,8(%rdi)
dec %rax
jz addend
mov 16(%rsi),%r11
adc 16(%rdx),%r11
mov %r11,16(%rdi)
addend:
')
C (rbx,rbp)=(rsi,rbp)-(rdx,rbp)
define(`MPN_SUB',`
mov %rbp,%rax
and `$'3,%rax
shr `$'2,%rbp
cmp `$'0,%rbp
C carry flag is clear here
jnz subloop
mov (%rsi),%r11
sub (%rdx),%r11
mov %r11,(%rbx)
dec %rax
jz subend
mov 8(%rsi),%r11
sbb 8(%rdx),%r11
mov %r11,8(%rbx)
dec %rax
jz subend
mov 16(%rsi),%r11
sbb 16(%rdx),%r11
mov %r11,16(%rbx)
jmp subend
ALIGN(16)
subloop:
mov (%rsi),%r11
mov 8(%rsi),%r8
lea 32(%rsi),%rsi
sbb (%rdx),%r11
sbb 8(%rdx),%r8
lea 32(%rdx),%rdx
mov %r11,(%rbx)
mov %r8,8(%rbx)
lea 32(%rbx),%rbx
mov -16(%rsi),%r9
mov -8(%rsi),%r10
sbb -16(%rdx),%r9
sbb -8(%rdx),%r10
mov %r9,-16(%rbx)
dec %rbp
mov %r10,-8(%rbx)
jnz subloop
inc %rax
dec %rax
jz subend
mov (%rsi),%r11
sbb (%rdx),%r11
mov %r11,(%rbx)
dec %rax
jz subend
mov 8(%rsi),%r11
sbb 8(%rdx),%r11
mov %r11,8(%rbx)
dec %rax
jz subend
mov 16(%rsi),%r11
sbb 16(%rdx),%r11
mov %r11,16(%rbx)
subend:
')
C changes from standard addmul
C change r8 to r12, rcx to r13 and rdi to r8
C remove the ret and write the last limb, but to the beginning
define(`ADDMULLOOP',`
ALIGN(16)
addmulloop$1:
mov `$'0,%r10
mul %r13
add %r12,(%r8,%r11,8)
adc %rax,%r9
.byte 0x26
adc %rdx,%r10
mov 16(%rsi,%r11,8),%rax
mul %r13
add %r9,8(%r8,%r11,8)
adc %rax,%r10
mov `$'0,%ebx
adc %rdx,%rbx
mov 24(%rsi,%r11,8),%rax
mov `$'0,%r12
mov `$'0,%r9
mul %r13
add %r10,16(%r8,%r11,8)
.byte 0x26
adc %rax,%rbx
.byte 0x26
adc %rdx,%r12
mov 32(%rsi,%r11,8),%rax
mul %r13
add %rbx,24(%r8,%r11,8)
.byte 0x26
adc %rax,%r12
.byte 0x26
adc %rdx,%r9
add `$'4,%r11
mov 8(%rsi,%r11,8),%rax
jnc addmulloop$1
')
define(`ADDMULPROPRO0',`
imul %rcx,%r13
lea -8(%r8),%r8
')
define(`ADDMULPRO0',`
mov %r14,%r11
lea 8(%r8),%r8
mov (%rsi,%r14,8),%rax
mul %r13
mov %rax,%r12
mov 8(%rsi,%r14,8),%rax
mov %rdx,%r9
cmp `$'0,%r14
')
define(`ADDMULNEXT0',`
mov `$'0,%r10d
mul %r13
add %r12,(%r8,%r11,8)
adc %rax,%r9
adc %rdx,%r10
mov 16(%rsi,%r11,8),%rax
mul %r13
add %r9,8(%r8,%r11,8)
adc %rax,%r10
mov `$'0,%ebx
adc %rdx,%rbx
mov 24(%rsi,%r11,8),%rax
mov `$'0,%r12d
mov `$'0,%r9d
mul %r13
add %r10,16(%r8,%r11,8)
adc %rax,%rbx
adc %rdx,%r12
mov 32(%rsi,%r11,8),%rax
mul %r13
add %rbx,24(%r8,%r11,8)
mov 8(%r8,%r14,8),%r13
adc %rax,%r12
adc %rdx,%r9
imul %rcx,%r13
add %r12,32(%r8,%r11,8)
adc `$'0,%r9
dec %r15
mov %r9,(%r8,%r14,8)
')
define(`ADDMULPROPRO1',`
')
define(`ADDMULPRO1',`
imul %rcx,%r13
mov (%rsi,%r14,8),%rax
mov %r14,%r11
mul %r13
mov %rax,%r12
mov 8(%rsi,%r14,8),%rax
mov %rdx,%r9
cmp `$'0,%r14
')
define(`ADDMULNEXT1',`
mov `$'0,%r10d
mul %r13
add %r12,(%r8,%r11,8)
adc %rax,%r9
adc %rdx,%r10
mov 16(%rsi,%r11,8),%rax
mul %r13
add %r9,8(%r8,%r11,8)
adc %rax,%r10
mov `$'0,%ebx
adc %rdx,%rbx
mov 24(%rsi,%r11,8),%rax
mov `$'0,%r12d
mul %r13
add %r10,16(%r8,%r11,8)
adc %rax,%rbx
adc %rdx,%r12
add %rbx,24(%r8,%r11,8)
mov 8(%r8,%r14,8),%r13
adc `$'0,%r12
dec %r15
mov %r12,(%r8,%r14,8)
lea 8(%r8),%r8
')
define(`ADDMULPROPRO2',`
')
define(`ADDMULPRO2',`
imul %rcx,%r13
mov (%rsi,%r14,8),%rax
mov %r14,%r11
mul %r13
mov %rax,%r12
mov 8(%rsi,%r14,8),%rax
mov %rdx,%r9
cmp `$'0,%r14
')
define(`ADDMULNEXT2',`
mul %r13
add %r12,(%r8,%r11,8)
adc %rax,%r9
mov `$'0,%r10d
adc %rdx,%r10
mov 16(%rsi,%r11,8),%rax
mul %r13
add %r9,8(%r8,%r11,8)
adc %rax,%r10
mov `$'0,%ebx
adc %rdx,%rbx
mov 8(%r8,%r14,8),%r13
add %r10,16(%r8,%r11,8)
adc `$'0,%rbx
mov %rbx,(%r8,%r14,8)
dec %r15
lea 8(%r8),%r8
')
define(`ADDMULPROPRO3',`
')
define(`ADDMULPRO3',`
imul %rcx,%r13
mov (%rsi,%r14,8),%rax
mov %r14,%r11
mul %r13
mov %rax,%r12
mov 8(%rsi,%r14,8),%rax
mov %rdx,%r9
cmp `$'0,%r14
')
define(`ADDMULNEXT3',`
mul %r13
add %r12,(%r8,%r11,8)
adc %rax,%r9
mov `$'0,%r10d
adc %rdx,%r10
add %r9,8(%r8,%r11,8)
adc `$'0,%r10
mov 8(%r8,%r14,8),%r13
mov %r10,(%r8,%r14,8)
lea 8(%r8),%r8
dec %r15
')
C change r8 to r12
C write top limb rax straight to memory, don't return it (NOTE: we WRITE, NOT ADD)
define(`MPN_ADDMUL_1_INT',`
ADDMULPROPRO$1
ALIGN(16)
loopaddmul$1:
ADDMULPRO$1
jge addmulskiploop$1
ADDMULLOOP($1)
addmulskiploop$1:
ADDMULNEXT$1
jnz loopaddmul$1
jmp end
')
ASM_START()
PROLOGUE(mpn_redc_basecase)
cmp $1,%rdx
je one
push %r13
push %r14
push %rbx
push %r12
push %r15
push %rbp
mov $5,%r14
sub %rdx,%r14
C store copies
push %rsi
push %r8
lea -40(%r8,%rdx,8),%r8
lea -40(%rsi,%rdx,8),%rsi
mov %rdx,%rbp
mov %rdx,%r15
mov %r14,%rax
and $3,%rax
mov (%r8,%r14,8),%r13
je case0
jp case3
cmp $1,%rax
je case1
case2:
MPN_ADDMUL_1_INT(2)
ALIGN(16)
case0:
MPN_ADDMUL_1_INT(0)
ALIGN(16)
case1:
MPN_ADDMUL_1_INT(1)
ALIGN(16)
case3:
MPN_ADDMUL_1_INT(3)
ALIGN(16)
end:
mov %rbp,%rcx
pop %rdx
lea (%rdx,%rbp,8),%rsi
mov %rdi,%rbx
MPN_ADD()
C mpn_add(rdi,rsi,rdx,rcx)
pop %rdx
jnc skip
mov %rbx,%rsi
MPN_SUB()
C mpn_sub_n(rbx,rsi,rdx,rbp) we can certainly improve this sub
skip:
pop %rbp
pop %r15
pop %r12
pop %rbx
pop %r14
pop %r13
ret
ALIGN(16)
one:
mov (%r8),%r9
mov (%rsi),%r11
imul %r9,%rcx
mov %rcx,%rax
mul %r11
add %r9,%rax
C rax is zero here
adc 8(%r8),%rdx
cmovnc %rax,%r11
sub %r11,%rdx
mov %rdx,(%rdi)
ret
EPILOGUE()