dnl  mpir/mpn/x86_64/core2/hamdist.asm

dnl  mpn_hamdist

dnl  Copyright 2010 The Code Cavern

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C	ret mpn_hamdist(mp_ptr,mp_ptr,mp_size_t)
C	rax               rdi,   rsi,      rdx

ASM_START()
C mpn_hamdist: Hamming distance between two limb vectors.
C
C For each of the rdx 64-bit limbs, the limb read through rdi is XORed with
C the limb read through rsi and the set bits of the result are counted; the
C total is returned in rax.  Limbs are consumed from the highest index
C downwards, three per pass, and the 0, 1 or 2 limbs left at the bottom are
C handled one at a time starting at skip.
C
C Register use:
C   rdi, rsi      operand pointers (never written)
C   rdx           limb index, stepped down by 3 per group
C   rax           running bit count (the return value)
C   r8, r9, r10   masks 0x55.., 0x33.., 0x0f.. for the 2-, 4- and 8-bit stages
C   r11           multiplier 0x0101.. that adds up the eight byte counts
C   rcx, r12, r14 the three XORed limbs in flight
C   rbp           scratch
C r12, r14 and rbp are pushed on entry and popped again at case0.
C
C Bit-count recipe applied to every XORed limb x:
C   v = x - ((x >> 1) & 0x55..)               each 2-bit field holds 0..2
C   v = (v & 0x33..) + ((v >> 2) & 0x33..)    each 4-bit field holds 0..4
C In the three-limb paths the three v values are added here; the 4-bit
C fields then hold at most 12, so nothing carries into a neighbour.
C   v = (v & 0x0f..) + ((v >> 4) & 0x0f..)    each byte holds at most 24
C   (v * 0x0101..) >> 56                      sum of the 8 bytes, at most 192
PROLOGUE(mpn_hamdist)
push %r12
push %r14
push %rbp
mov $0x5555555555555555,%r8
mov $0x3333333333333333,%r9
mov $0x0f0f0f0f0f0f0f0f,%r10
mov $0x0101010101010101,%r11
xor %eax,%eax
C rdx = n - 3.  A borrow means fewer than three limbs in total, so go
C straight to the tail code with rdx in -3..-1.
sub $3,%rdx
jc skip
C Preload the top three limb pairs (indices rdx+2, rdx+1, rdx), XORed.
	mov 16(%rdi,%rdx,8),%rcx
	xor 16(%rsi,%rdx,8),%rcx
	mov 8(%rdi,%rdx,8),%r12
	xor 8(%rsi,%rdx,8),%r12
	mov (%rdi,%rdx,8),%r14
	xor (%rsi,%rdx,8),%r14
C Step to the next group.  A borrow means there is no further full group,
C so the preloaded one is finished off at skiplp without entering the loop.
sub $3,%rdx
jc skiplp
C Main loop.  On entry rcx, r12, r14 hold one already-XORed group and rdx
C (>= 0) is the lowest index of the next group.  The loads for that next
C group (the doubly indented lines) are interleaved with the final
C reduction of the current one.
ALIGN(16)
lp:	mov %rcx,%rbp
	shr $1,%rcx
	and %r8,%rcx
	sub %rcx,%rbp
	mov %rbp,%rcx
	shr $2,%rbp
	and %r9,%rcx
	and %r9,%rbp
	add %rbp,%rcx
C first limb: 4-bit counts now in rcx

	mov %r12,%rbp
	shr $1,%r12
	and %r8,%r12
	sub %r12,%rbp
	mov %rbp,%r12
	shr $2,%rbp
	and %r9,%r12
	and %r9,%rbp
	add %r12,%rbp
C second limb: 4-bit counts now in rbp, r12 is free again

	mov %r14,%r12
	shr $1,%r14
	and %r8,%r14
	sub %r14,%r12
	mov %r12,%r14
	shr $2,%r12
	and %r9,%r14
	and %r9,%r12
	add %r14,%r12
C third limb: 4-bit counts now in r12

C Merge the three limbs (4-bit fields at most 12), then finish with the
C 8-bit stage and the multiply.  rcx and r12 are dead after the two adds,
C so they are reloaded with the next group while rbp/r14 are reduced.
	add %rcx,%rbp
	add %r12,%rbp	
		mov 16(%rdi,%rdx,8),%rcx
	mov %rbp,%r14
	shr $4,%rbp
	and %r10,%r14
		xor 16(%rsi,%rdx,8),%rcx
		mov 8(%rdi,%rdx,8),%r12
		xor 8(%rsi,%rdx,8),%r12
	and %r10,%rbp
	add %rbp,%r14
	imul %r11,%r14
	shr $56,%r14
	add %r14,%rax
C r14 is only reloaded after its count has been added to rax.
		mov (%rdi,%rdx,8),%r14
		xor (%rsi,%rdx,8),%r14
	sub $3,%rdx
	jnc lp
C Drain: reduce the last preloaded group exactly as in the loop body, but
C without loading anything further.
skiplp:
	mov %rcx,%rbp
	shr $1,%rcx
	and %r8,%rcx
	sub %rcx,%rbp
	mov %rbp,%rcx
	shr $2,%rbp
	and %r9,%rcx
	and %r9,%rbp
	add %rbp,%rcx
	
	mov %r12,%rbp
	shr $1,%r12
	and %r8,%r12
	sub %r12,%rbp
	mov %rbp,%r12
	shr $2,%rbp
	and %r9,%r12
	and %r9,%rbp
	add %r12,%rbp
	
	mov %r14,%r12
	shr $1,%r14
	and %r8,%r14
	sub %r14,%r12
	mov %r12,%r14
	shr $2,%r12
	and %r9,%r14
	and %r9,%r12
	add %r14,%r12
	
	add %rcx,%rbp
	add %r12,%rbp	
	mov %rbp,%r14
	shr $4,%rbp
	and %r10,%r14
	and %r10,%rbp
	add %rbp,%r14
	imul %r11,%r14
	shr $56,%r14
	add %r14,%rax
C Tail.  Here rdx is -3, -2 or -1, i.e. rdx + 3 limbs (0, 1 or 2) remain
C at indices 0..rdx+2.  The compare is signed because rdx is negative.
skip:
	cmp $-2,%rdx
	jl case0
	jz case1
C Two limbs left (rdx = -1): count limb 1, then fall into case1 for limb 0.
case2:
	mov 16(%rdi,%rdx,8),%rcx
	xor 16(%rsi,%rdx,8),%rcx
	mov %rcx,%rbp
	shr $1,%rcx
	and %r8,%rcx
	sub %rcx,%rbp
	mov %rbp,%rcx
	shr $2,%rbp
	and %r9,%rcx
	and %r9,%rbp
	add %rbp,%rcx
	
	mov %rcx,%r14
	shr $4,%rcx
	and %r10,%r14
	and %r10,%rcx
	add %rcx,%r14
	imul %r11,%r14
	shr $56,%r14
	add %r14,%rax
C rdx becomes -2 so that the 16(...,rdx,8) operand below addresses limb 0.
	dec %rdx
C One limb left (rdx = -2): count limb 0.
case1:
	mov 16(%rdi,%rdx,8),%rcx
	xor 16(%rsi,%rdx,8),%rcx
	mov %rcx,%rbp
	shr $1,%rcx
	and %r8,%rcx
	sub %rcx,%rbp
	mov %rbp,%rcx
	shr $2,%rbp
	and %r9,%rcx
	and %r9,%rbp
	add %rbp,%rcx
	
	mov %rcx,%r14
	shr $4,%rcx
	and %r10,%r14
	and %r10,%rcx
	add %rcx,%r14
	imul %r11,%r14
	shr $56,%r14
	add %r14,%rax
C Nothing left: restore the saved registers in reverse push order and
C return with the total in rax.
case0:	pop %rbp
	pop %r14
	pop %r12
	ret
EPILOGUE()
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`dnl mpn_hamdist`

faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`dnl Copyright 2010 The Code Cavern`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00
			`dnl This file is part of the MPIR Library.`

			`dnl The MPIR Library is free software; you can redistribute it and/or modify`
			`dnl it under the terms of the GNU Lesser General Public License as published`
			`dnl by the Free Software Foundation; either version 2.1 of the License, or (at`
			`dnl your option) any later version.`

			`dnl The MPIR Library is distributed in the hope that it will be useful, but`
			`dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`dnl License for more details.`

			`dnl You should have received a copy of the GNU Lesser General Public License`
			`dnl along with the MPIR Library; see the file COPYING.LIB. If not, write`
			`dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`dnl Boston, MA 02110-1301, USA.`

			include(`../config.m4')

			`C ret mpn_hamdist(mp_ptr,mp_ptr,mp_size_t)`
			`C rax rdi, rsi, rdx`

			`ASM_START()`
			`PROLOGUE(mpn_hamdist)`
			`push %r12`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`push %r14`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`push %rbp`
			`mov $0x5555555555555555,%r8`
			`mov $0x3333333333333333,%r9`
			`mov $0x0f0f0f0f0f0f0f0f,%r10`
			`mov $0x0101010101010101,%r11`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`xor %eax,%eax`
			`sub $3,%rdx`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`jc skip`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`mov 16(%rdi,%rdx,8),%rcx`
			`xor 16(%rsi,%rdx,8),%rcx`
			`mov 8(%rdi,%rdx,8),%r12`
			`xor 8(%rsi,%rdx,8),%r12`
			`mov (%rdi,%rdx,8),%r14`
			`xor (%rsi,%rdx,8),%r14`
			`sub $3,%rdx`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`jc skiplp`
			`ALIGN(16)`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`lp: mov %rcx,%rbp`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`shr $1,%rcx`
			`and %r8,%rcx`
			`sub %rcx,%rbp`
			`mov %rbp,%rcx`
			`shr $2,%rbp`
			`and %r9,%rcx`
			`and %r9,%rbp`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`add %rbp,%rcx`

			`mov %r12,%rbp`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`shr $1,%r12`
			`and %r8,%r12`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`sub %r12,%rbp`
			`mov %rbp,%r12`
			`shr $2,%rbp`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`and %r9,%r12`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`and %r9,%rbp`
			`add %r12,%rbp`

			`mov %r14,%r12`
			`shr $1,%r14`
			`and %r8,%r14`
			`sub %r14,%r12`
			`mov %r12,%r14`
			`shr $2,%r12`
			`and %r9,%r14`
			`and %r9,%r12`
			`add %r14,%r12`

			`add %rcx,%rbp`
			`add %r12,%rbp`
			`mov 16(%rdi,%rdx,8),%rcx`
			`mov %rbp,%r14`
			`shr $4,%rbp`
			`and %r10,%r14`
			`xor 16(%rsi,%rdx,8),%rcx`
			`mov 8(%rdi,%rdx,8),%r12`
			`xor 8(%rsi,%rdx,8),%r12`
			`and %r10,%rbp`
			`add %rbp,%r14`
			`imul %r11,%r14`
			`shr $56,%r14`
			`add %r14,%rax`
			`mov (%rdi,%rdx,8),%r14`
			`xor (%rsi,%rdx,8),%r14`
			`sub $3,%rdx`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`jnc lp`
			`skiplp:`
			`mov %rcx,%rbp`
			`shr $1,%rcx`
			`and %r8,%rcx`
			`sub %rcx,%rbp`
			`mov %rbp,%rcx`
			`shr $2,%rbp`
			`and %r9,%rcx`
			`and %r9,%rbp`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`add %rbp,%rcx`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`mov %r12,%rbp`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`shr $1,%r12`
			`and %r8,%r12`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`sub %r12,%rbp`
			`mov %rbp,%r12`
			`shr $2,%rbp`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`and %r9,%r12`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`and %r9,%rbp`
			`add %r12,%rbp`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`mov %r14,%r12`
			`shr $1,%r14`
			`and %r8,%r14`
			`sub %r14,%r12`
			`mov %r12,%r14`
			`shr $2,%r12`
			`and %r9,%r14`
			`and %r9,%r12`
			`add %r14,%r12`

			`add %rcx,%rbp`
			`add %r12,%rbp`
			`mov %rbp,%r14`
			`shr $4,%rbp`
			`and %r10,%r14`
			`and %r10,%rbp`
			`add %rbp,%r14`
			`imul %r11,%r14`
			`shr $56,%r14`
			`add %r14,%rax`
			`skip:`
			`cmp $-2,%rdx`
			`jl case0`
			`jz case1`
			`case2:`
			`mov 16(%rdi,%rdx,8),%rcx`
			`xor 16(%rsi,%rdx,8),%rcx`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`mov %rcx,%rbp`
			`shr $1,%rcx`
			`and %r8,%rcx`
			`sub %rcx,%rbp`
			`mov %rbp,%rcx`
			`shr $2,%rbp`
			`and %r9,%rcx`
			`and %r9,%rbp`
faster core2/penryn mpn_hamdist by using the K8 version 2010-12-05 02:49:17 -05:00			`add %rbp,%rcx`

			`mov %rcx,%r14`
			`shr $4,%rcx`
			`and %r10,%r14`
			`and %r10,%rcx`
			`add %rcx,%r14`
			`imul %r11,%r14`
			`shr $56,%r14`
			`add %r14,%rax`
			`dec %rdx`
			`case1:`
			`mov 16(%rdi,%rdx,8),%rcx`
			`xor 16(%rsi,%rdx,8),%rcx`
			`mov %rcx,%rbp`
			`shr $1,%rcx`
			`and %r8,%rcx`
			`sub %rcx,%rbp`
			`mov %rbp,%rcx`
			`shr $2,%rbp`
			`and %r9,%rcx`
			`and %r9,%rbp`
			`add %rbp,%rcx`

			`mov %rcx,%r14`
			`shr $4,%rcx`
			`and %r10,%r14`
			`and %r10,%rcx`
			`add %rcx,%r14`
			`imul %r11,%r14`
			`shr $56,%r14`
			`add %r14,%rax`
			`case0: pop %rbp`
			`pop %r14`
New core2/penryn asm functions popcount hamdist 2009-11-19 05:53:45 -05:00			`pop %r12`
			`ret`
			`EPILOGUE()`