; mpir/mpn/x86w/p4/sse2/sqr_basecase.asm
; 2012-11-25 22:33:07 +00:00
;
; 190 lines
; 3.9 KiB
; NASM

; Copyright 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
; ----------------------------------------------------------------------
; ___gmpn_sqr_basecase: square a multi-limb unsigned integer.
;
; Syntax/target: Intel operand order (NASM-style) source, 32-bit x86 code
; that uses the MMX registers with the SSE2 pmuludq/paddq instructions.
; Presumed C prototype (not visible in this file -- confirm against the
; library headers):
;     void __gmpn_sqr_basecase(mp_ptr dst, mp_srcptr src, mp_size_t size)
;
; Stack arguments on entry (return address at [esp]):
;   [4+esp]   dst   written with 2*size 32-bit limbs, dst[0..2*size-1]
;   [8+esp]   src   read as size 32-bit limbs, least significant first
;   [12+esp]  size  limb count; every size below 2 takes the one-limb
;                   path, so size 0 is not treated specially
;
; Dispatch: size == 2 -> two_limbs, size > 2 (unsigned) -> three_or_more,
; otherwise fall through to the one-limb code below.
;
; Modifies eax, ecx, edx and the flags. The paths at two_limbs and
; three_or_more also use MMX registers and execute emms before returning;
; three_or_more saves and restores esi, edi and ebp.
; ----------------------------------------------------------------------
section .text
global ___gmpn_sqr_basecase
%ifdef DLL
; When DLL is defined, additionally emit an export directive for the symbol.
export ___gmpn_sqr_basecase
%endif
; Align the entry point to an 8-byte boundary.
align 8
___gmpn_sqr_basecase:
mov edx,[12+esp]        ; edx = size (limb count)
mov eax,[8+esp]         ; eax = src pointer
mov ecx,[4+esp]         ; ecx = dst pointer
cmp edx,2
je two_limbs            ; exactly two limbs
ja three_or_more        ; unsigned compare: more than two limbs
; One-limb path (reached for any size below 2): dst[1]:dst[0] = src[0]^2.
mov eax,[eax]           ; eax = src[0]
mul eax                 ; edx:eax = src[0] * src[0]
mov [ecx],eax           ; dst[0] = low half of the square
mov [4+ecx],edx         ; dst[1] = high half of the square
ret
; ----------------------------------------------------------------------
; two_limbs: straight-line square of a 2-limb operand.
; On entry: eax = src pointer, ecx = dst pointer.
; Computes dst[0..3] = src[0]^2 + 2*src[0]*src[1]*2^32 + src[1]^2*2^64,
; one 32-bit limb at a time, keeping the running carry in the upper bits
; of a 64-bit MMX register. Uses mm0-mm4; no general registers are changed.
; ----------------------------------------------------------------------
two_limbs:
movd mm1,[eax]          ; mm1 = src[0] (zero-extended to 64 bits)
movd mm0,[4+eax]        ; mm0 = src[1]
pmuludq mm0,mm1         ; mm0 = src[0]*src[1]   (cross product, 64 bits)
pmuludq mm1,mm1         ; mm1 = src[0]^2
movd mm2,[4+eax]        ; mm2 = src[1]
pmuludq mm2,mm2         ; mm2 = src[1]^2
movd [ecx],mm1          ; dst[0] = low32(src[0]^2)
psrlq mm1,32            ; mm1 = high32(src[0]^2)
pcmpeqd mm3,mm3         ; mm3 = all ones ...
psrlq mm3,32            ; ... shifted down to 0x00000000FFFFFFFF (low-limb mask)
pand mm3,mm0            ; mm3 = low32(cross product)
psrlq mm0,32            ; mm0 = high32(cross product)
psllq mm3,1             ; mm3 = 2*low32(cross product)  (may be 33 bits)
paddq mm1,mm3           ; mm1 = high32(src[0]^2) + 2*low32(cross)
movd [4+ecx],mm1        ; dst[1] = low 32 bits of that sum
pcmpeqd mm4,mm4         ; build the low-limb mask again in mm4
psrlq mm4,32
pand mm4,mm2            ; mm4 = low32(src[1]^2)
psrlq mm2,32            ; mm2 = high32(src[1]^2)
psllq mm0,1             ; mm0 = 2*high32(cross product)
psrlq mm1,32            ; mm1 = carry out of dst[1]
paddq mm0,mm1           ; mm0 = 2*high32(cross) + carry
paddq mm0,mm4           ;       + low32(src[1]^2)
movd [8+ecx],mm0        ; dst[2] = low 32 bits of that sum
psrlq mm0,32            ; mm0 = carry out of dst[2]
paddq mm0,mm2           ; mm0 = carry + high32(src[1]^2)
movd [12+ecx],mm0       ; dst[3]
emms                    ; leave MMX state so x87 code can run afterwards
ret
; ----------------------------------------------------------------------
; three_or_more: general path, size >= 3.
; On entry: eax = src pointer, ecx = dst pointer, edx = size.
;
; The square is built in three phases:
;   1. mul1:         dst[1..size-1] = src[0] * src[1..size-1], carry kept
;                    in mm0.
;   2. outer/inner:  for n = 1..size-3 add src[n] * src[n+1..size-1] into
;                    dst starting at limb 2n+1; "corner" then adds the last
;                    cross product src[size-2]*src[size-1].
;   3. diag:         double every cross-product limb and add the squares
;                    src[i]^2 at limbs 2i and 2i+1, writing the final
;                    dst[0..2*size-1].
;
; Stack frame after "sub esp,12":
;   [esp] = saved ebp, [4+esp] = saved edi, [8+esp] = saved esi,
;   [12+esp] = return address, [16+esp] = dst, [20+esp] = src,
;   [24+esp] = size.
; ----------------------------------------------------------------------
three_or_more:
sub esp,12              ; room for the three saved registers
pxor mm0,mm0            ; mm0 = running carry, initially 0
movd mm7,[eax]          ; mm7 = src[0], the multiplier for phase 1
mov [8+esp],esi         ; save esi
mov [4+esp],edi         ; save edi
mov [esp],ebp           ; save ebp
mov esi,eax             ; esi = src (base for the outer loop)
mov edi,ecx             ; edi = dst (base for the outer loop)
sub edx,1               ; edx = size-1 iterations of mul1
; Phase 1. eax/ecx walk src/dst one limb per iteration; each iteration
; stores low32(src[0]*src[i] + carry) into dst[i] and keeps the high part
; in mm0 as the next carry.
mul1:
movd mm1,[4+eax]        ; mm1 = next source limb src[i]
add eax,4
pmuludq mm1,mm7         ; mm1 = src[i] * src[0]
paddq mm0,mm1           ; add incoming carry
movd [4+ecx],mm0        ; dst[i] = low 32 bits
add ecx,4
psrlq mm0,32            ; mm0 = carry for the next limb
sub edx,1
jnz mul1
; The final carry of phase 1 stays in mm0; [4+ecx] now addresses dst[size].
mov ebp,[24+esp]        ; ebp = size
sub ebp,3               ; ebp = size-3 = number of outer passes
jz corner               ; size == 3: only the corner product remains
; Phase 2. On each pass esi advances one limb and edi advances two limbs.
; ebp counts the remaining passes; the inner loop runs ebp+1 times.
outer:
movd mm7,[4+esi]        ; mm7 = multiplier src[n] for this pass
movd [4+ecx],mm0        ; store the carry limb left by the previous pass
lea eax,[8+esi]         ; eax -> src[n+1], first multiplicand
add esi,4
lea ecx,[8+edi]         ; ecx -> dst limb just below the first one updated
add edi,8
lea edx,[1+ebp]         ; edx = inner iteration count
pxor mm0,mm0            ; carry = 0 for this pass
; Inner loop: dst limb at [4+ecx] += src[j] * mm7 + carry.
inner:
movd mm1,[eax]          ; mm1 = src[j]
lea eax,[4+eax]
movd mm2,[4+ecx]        ; mm2 = current value of the destination limb
pmuludq mm1,mm7         ; mm1 = src[j] * src[n]
paddq mm1,mm2           ; + existing destination limb
paddq mm0,mm1           ; + carry
sub edx,1               ; sets the flags tested by jnz below
movd [4+ecx],mm0        ; write back low 32 bits
psrlq mm0,32            ; mm0 = carry for the next limb
lea ecx,[4+ecx]         ; advance with lea so the flags from sub survive
jnz inner
sub ebp,1
jnz outer
; Corner: the last cross product, src[size-2]*src[size-1], plus the carry
; still held in mm0. The setup for phase 3 (src[0]^2, low-limb mask, the
; original dst/src pointers) is interleaved with it.
corner:
movd mm1,[4+esi]        ; mm1 = src[size-2]
movd mm2,[8+esi]        ; mm2 = src[size-1]
pmuludq mm1,mm2         ; mm1 = src[size-2] * src[size-1]
mov eax,[20+esp]        ; eax = src argument (reloaded from the stack)
movd mm2,[eax]          ; mm2 = src[0]
pmuludq mm2,mm2         ; mm2 = src[0]^2
pcmpeqd mm7,mm7         ; mm7 = all ones ...
psrlq mm7,32            ; ... -> 0x00000000FFFFFFFF (low-limb mask)
mov edx,[16+esp]        ; edx = dst argument (reloaded from the stack)
movd mm3,[4+edx]        ; mm3 = dst[1], lowest cross-product limb
paddq mm0,mm1           ; mm0 = pending carry + corner product
movd [12+edi],mm0       ; dst[2*size-3] = low 32 bits
psrlq mm0,32
movd [16+edi],mm0       ; dst[2*size-2] = high 32 bits
; Phase 3 starts here: dst[0] and dst[1] are finalised before the loop.
movd [edx],mm2          ; dst[0] = low32(src[0]^2)
psrlq mm2,32            ; mm2 = high32(src[0]^2)
psllq mm3,1             ; mm3 = 2 * dst[1]
paddq mm2,mm3           ; mm2 = high32(src[0]^2) + 2*dst[1]
movd [4+edx],mm2        ; dst[1] = low 32 bits
psrlq mm2,32            ; mm2 = carry into dst[2]
mov ecx,[24+esp]        ; ecx = size
sub ecx,2               ; ecx = size-2 iterations (i = 1..size-2)
; diag loop: for each i, with s = src[i]^2 and carry in mm2,
;   dst[2i]   = 2*dst[2i]   + low32(s)  + carry
;   dst[2i+1] = 2*dst[2i+1] + high32(s) + carry
; eax advances one source limb and edx two destination limbs per pass.
diag:
movd mm0,[4+eax]        ; mm0 = src[i]
add eax,4
pmuludq mm0,mm0         ; mm0 = src[i]^2
movq mm1,mm7
pand mm1,mm0            ; mm1 = low32(src[i]^2)
psrlq mm0,32            ; mm0 = high32(src[i]^2)
movd mm3,[8+edx]        ; mm3 = cross-product limb dst[2i]
psllq mm3,1             ; doubled
paddq mm1,mm3
paddq mm2,mm1           ; carry + low32(square) + 2*dst[2i]
movd [8+edx],mm2        ; final dst[2i]
psrlq mm2,32            ; carry into dst[2i+1]
movd mm3,[12+edx]       ; mm3 = cross-product limb dst[2i+1]
psllq mm3,1             ; doubled
paddq mm0,mm3
paddq mm2,mm0           ; carry + high32(square) + 2*dst[2i+1]
movd [12+edx],mm2       ; final dst[2i+1]
add edx,8
psrlq mm2,32            ; carry into the next limb pair
sub ecx,1
jnz diag
; Last limb pair: src[size-1]^2. dst[2*size-2] holds a cross-product limb
; to double; dst[2*size-1] receives only the carry plus the high half of
; the square. mm7 (the mask) is consumed here since it is no longer needed.
movd mm0,[4+eax]        ; mm0 = src[size-1]
pmuludq mm0,mm0         ; mm0 = src[size-1]^2
pand mm7,mm0            ; mm7 = low32(square)
psrlq mm0,32            ; mm0 = high32(square)
movd mm3,[8+edx]        ; mm3 = dst[2*size-2]
psllq mm3,1             ; doubled
paddq mm7,mm3
paddq mm2,mm7           ; carry + low32(square) + 2*dst[2*size-2]
movd [8+edx],mm2        ; final dst[2*size-2]
psrlq mm2,32
paddq mm2,mm0           ; carry + high32(square)
movd [12+edx],mm2       ; final dst[2*size-1]
; Epilogue: restore the saved registers and release the frame.
mov esi,[8+esp]
mov edi,[4+esp]
mov ebp,[esp]
add esp,12
emms                    ; leave MMX state so x87 code can run afterwards
ret
; NOTE(review): "end" is not an instruction; presumably the assembler used
; by the build accepts it as a bare label or end-of-file marker -- confirm.
end