190 lines
4.1 KiB
NASM
190 lines
4.1 KiB
NASM
|
; Copyright 2001, 2002 Free Software Foundation, Inc.
|
||
|
;
|
||
|
; This file is part of the GNU MP Library.
|
||
|
;
|
||
|
; The GNU MP Library is free software; you can redistribute it and/or
|
||
|
; modify it under the terms of the GNU Lesser General Public License as
|
||
|
; published by the Free Software Foundation; either version 2.1 of the
|
||
|
; License, or (at your option) any later version.
|
||
|
;
|
||
|
; The GNU MP Library is distributed in the hope that it will be useful,
|
||
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
; Lesser General Public License for more details.
|
||
|
;
|
||
|
; You should have received a copy of the GNU Lesser General Public
|
||
|
; License along with the GNU MP Library; see the file COPYING.LIB. If
|
||
|
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
|
||
|
; Suite 330, Boston, MA 02111-1307, USA.
|
||
|
;
|
||
|
; Translation of AT&T syntax code by Brian Gladman
|
||
|
|
||
|
section .text
|
||
|
|
||
|
global ___gmpn_sqr_basecase
|
||
|
%ifdef DLL
|
||
|
export ___gmpn_sqr_basecase
|
||
|
%endif
|
||
|
|
||
|
align 8
|
||
|
___gmpn_sqr_basecase:
|
||
|
mov edx,[12+esp]
|
||
|
mov eax,[8+esp]
|
||
|
mov ecx,[4+esp]
|
||
|
cmp edx,2
|
||
|
je two_limbs
|
||
|
ja three_or_more
|
||
|
mov eax,[eax]
|
||
|
mul eax
|
||
|
mov [ecx],eax
|
||
|
mov [4+ecx],edx
|
||
|
ret
|
||
|
|
||
|
two_limbs:
|
||
|
movd mm1,[eax]
|
||
|
movd mm0,[4+eax]
|
||
|
pmuludq mm0,mm1
|
||
|
pmuludq mm1,mm1
|
||
|
movd mm2,[4+eax]
|
||
|
pmuludq mm2,mm2
|
||
|
movd [ecx],mm1
|
||
|
psrlq mm1,32
|
||
|
pcmpeqd mm3,mm3
|
||
|
psrlq mm3,32
|
||
|
pand mm3,mm0
|
||
|
psrlq mm0,32
|
||
|
psllq mm3,1
|
||
|
paddq mm1,mm3
|
||
|
movd [4+ecx],mm1
|
||
|
pcmpeqd mm4,mm4
|
||
|
psrlq mm4,32
|
||
|
pand mm4,mm2
|
||
|
psrlq mm2,32
|
||
|
psllq mm0,1
|
||
|
psrlq mm1,32
|
||
|
paddq mm0,mm1
|
||
|
paddq mm0,mm4
|
||
|
movd [8+ecx],mm0
|
||
|
psrlq mm0,32
|
||
|
paddq mm0,mm2
|
||
|
movd [12+ecx],mm0
|
||
|
emms
|
||
|
ret
|
||
|
|
||
|
three_or_more:
|
||
|
sub esp,12
|
||
|
pxor mm0,mm0
|
||
|
movd mm7,[eax]
|
||
|
mov [8+esp],esi
|
||
|
mov [4+esp],edi
|
||
|
mov [esp],ebp
|
||
|
mov esi,eax
|
||
|
mov edi,ecx
|
||
|
sub edx,1
|
||
|
|
||
|
mul1:
|
||
|
movd mm1,[4+eax]
|
||
|
add eax,4
|
||
|
pmuludq mm1,mm7
|
||
|
paddq mm0,mm1
|
||
|
movd [4+ecx],mm0
|
||
|
add ecx,4
|
||
|
psrlq mm0,32
|
||
|
sub edx,1
|
||
|
jnz mul1
|
||
|
mov ebp,[24+esp]
|
||
|
sub ebp,3
|
||
|
jz corner
|
||
|
|
||
|
outer:
|
||
|
movd mm7,[4+esi]
|
||
|
movd [4+ecx],mm0
|
||
|
lea eax,[8+esi]
|
||
|
add esi,4
|
||
|
lea ecx,[8+edi]
|
||
|
add edi,8
|
||
|
lea edx,[1+ebp]
|
||
|
pxor mm0,mm0
|
||
|
|
||
|
inner:
|
||
|
movd mm1,[eax]
|
||
|
lea eax,[4+eax]
|
||
|
movd mm2,[4+ecx]
|
||
|
pmuludq mm1,mm7
|
||
|
paddq mm1,mm2
|
||
|
paddq mm0,mm1
|
||
|
sub edx,1
|
||
|
movd [4+ecx],mm0
|
||
|
psrlq mm0,32
|
||
|
lea ecx,[4+ecx]
|
||
|
jnz inner
|
||
|
sub ebp,1
|
||
|
jnz outer
|
||
|
|
||
|
corner:
|
||
|
movd mm1,[4+esi]
|
||
|
movd mm2,[8+esi]
|
||
|
pmuludq mm1,mm2
|
||
|
mov eax,[20+esp]
|
||
|
movd mm2,[eax]
|
||
|
pmuludq mm2,mm2
|
||
|
pcmpeqd mm7,mm7
|
||
|
psrlq mm7,32
|
||
|
mov edx,[16+esp]
|
||
|
movd mm3,[4+edx]
|
||
|
paddq mm0,mm1
|
||
|
movd [12+edi],mm0
|
||
|
psrlq mm0,32
|
||
|
movd [16+edi],mm0
|
||
|
movd [edx],mm2
|
||
|
psrlq mm2,32
|
||
|
psllq mm3,1
|
||
|
paddq mm2,mm3
|
||
|
movd [4+edx],mm2
|
||
|
psrlq mm2,32
|
||
|
mov ecx,[24+esp]
|
||
|
sub ecx,2
|
||
|
|
||
|
diag:
|
||
|
movd mm0,[4+eax]
|
||
|
add eax,4
|
||
|
pmuludq mm0,mm0
|
||
|
movq mm1,mm7
|
||
|
pand mm1,mm0
|
||
|
psrlq mm0,32
|
||
|
movd mm3,[8+edx]
|
||
|
psllq mm3,1
|
||
|
paddq mm1,mm3
|
||
|
paddq mm2,mm1
|
||
|
movd [8+edx],mm2
|
||
|
psrlq mm2,32
|
||
|
movd mm3,[12+edx]
|
||
|
psllq mm3,1
|
||
|
paddq mm0,mm3
|
||
|
paddq mm2,mm0
|
||
|
movd [12+edx],mm2
|
||
|
add edx,8
|
||
|
psrlq mm2,32
|
||
|
sub ecx,1
|
||
|
jnz diag
|
||
|
movd mm0,[4+eax]
|
||
|
pmuludq mm0,mm0
|
||
|
pand mm7,mm0
|
||
|
psrlq mm0,32
|
||
|
movd mm3,[8+edx]
|
||
|
psllq mm3,1
|
||
|
paddq mm7,mm3
|
||
|
paddq mm2,mm7
|
||
|
movd [8+edx],mm2
|
||
|
psrlq mm2,32
|
||
|
paddq mm2,mm0
|
||
|
movd [12+edx],mm2
|
||
|
mov esi,[8+esp]
|
||
|
mov edi,[4+esp]
|
||
|
mov ebp,[esp]
|
||
|
add esp,12
|
||
|
emms
|
||
|
ret
|
||
|
|
||
|
end
|