faster core2/penryn mpn_hamdist by using the K8 version
parent 77e6892a3c
commit a87caeb1f8
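Note: both versions below (the Unix GAS/m4 file first, then the Windows yasm file) compute the Hamming distance with the same SWAR popcount scheme the K8 kernel uses: XOR the corresponding limbs, fold the result down with the 0x5555..., 0x3333... and 0x0f0f... masks, then multiply by 0x0101... and shift right by 56 to sum the per-byte counts. The following is a minimal C sketch of that per-limb step, not MPIR's actual C code; the names are illustrative and uint64_t stands in for the 64-bit limb type.

#include <stdint.h>
#include <stddef.h>

/* SWAR popcount of one 64-bit limb, mirroring the constants loaded
   into r8..r11 in the assembly below. */
static uint64_t swar_popcount(uint64_t x)
{
    x -= (x >> 1) & 0x5555555555555555ULL;                                /* 2-bit sums */
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); /* 4-bit sums */
    x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;                           /* 8-bit sums */
    return (x * 0x0101010101010101ULL) >> 56;                             /* add the 8 bytes */
}

/* What mpn_hamdist returns: the popcount of ap XOR bp over n limbs. */
static uint64_t hamdist_ref(const uint64_t *ap, const uint64_t *bp, size_t n)
{
    uint64_t c = 0;
    for (size_t i = 0; i < n; i++)
        c += swar_popcount(ap[i] ^ bp[i]);
    return c;
}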
@@ -1,6 +1,6 @@
dnl mpn_hamdist

dnl Copyright 2009 Jason Moxham
dnl Copyright 2010 The Code Cavern

dnl This file is part of the MPIR Library.

@@ -27,25 +27,25 @@ C rax rdi, rsi, rdx
ASM_START()
PROLOGUE(mpn_hamdist)
push %r12
push %r13
push %r14
push %rbp
push %rbx
mov $0x5555555555555555,%r8
mov $0x3333333333333333,%r9
mov $0x0f0f0f0f0f0f0f0f,%r10
mov $0x0101010101010101,%r11
mov $0,%rax
sub $2,%rdx
xor %eax,%eax
sub $3,%rdx
jc skip
mov 8(%rdi,%rdx,8),%rcx
xor 8(%rsi,%rdx,8),%rcx
mov (%rdi,%rdx,8),%r12
xor (%rsi,%rdx,8),%r12
sub $2,%rdx
mov 16(%rdi,%rdx,8),%rcx
xor 16(%rsi,%rdx,8),%rcx
mov 8(%rdi,%rdx,8),%r12
xor 8(%rsi,%rdx,8),%r12
mov (%rdi,%rdx,8),%r14
xor (%rsi,%rdx,8),%r14
sub $3,%rdx
jc skiplp
ALIGN(16)
lp:
mov %rcx,%rbp
lp: mov %rcx,%rbp
shr $1,%rcx
and %r8,%rcx
sub %rcx,%rbp
@@ -53,32 +53,45 @@ lp:
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rcx,%rbp

mov %r12,%rbx
add %rbp,%rcx

mov %r12,%rbp
shr $1,%r12
and %r8,%r12
sub %r12,%rbx
mov 8(%rdi,%rdx,8),%rcx
mov %rbx,%r12
shr $2,%rbx
sub %r12,%rbp
mov %rbp,%r12
shr $2,%rbp
and %r9,%r12
xor 8(%rsi,%rdx,8),%rcx
and %r9,%rbx
add %r12,%rbx

add %rbp,%rbx
mov %rbx,%r13
mov (%rdi,%rdx,8),%r12
xor (%rsi,%rdx,8),%r12
shr $4,%rbx
and %r10,%r13
and %r10,%rbx
add %rbx,%r13
imul %r11,%r13
shr $56,%r13
add %r13,%rax
sub $2,%rdx
and %r9,%rbp
add %r12,%rbp

mov %r14,%r12
shr $1,%r14
and %r8,%r14
sub %r14,%r12
mov %r12,%r14
shr $2,%r12
and %r9,%r14
and %r9,%r12
add %r14,%r12

add %rcx,%rbp
add %r12,%rbp
mov 16(%rdi,%rdx,8),%rcx
mov %rbp,%r14
shr $4,%rbp
and %r10,%r14
xor 16(%rsi,%rdx,8),%rcx
mov 8(%rdi,%rdx,8),%r12
xor 8(%rsi,%rdx,8),%r12
and %r10,%rbp
add %rbp,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
mov (%rdi,%rdx,8),%r14
xor (%rsi,%rdx,8),%r14
sub $3,%rdx
jnc lp
skiplp:
mov %rcx,%rbp
@@ -89,31 +102,45 @@ skiplp:
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rcx,%rbp
add %rbp,%rcx

mov %r12,%rbx
mov %r12,%rbp
shr $1,%r12
and %r8,%r12
sub %r12,%rbx
mov %rbx,%r12
shr $2,%rbx
sub %r12,%rbp
mov %rbp,%r12
shr $2,%rbp
and %r9,%r12
and %r9,%rbx
add %r12,%rbx
and %r9,%rbp
add %r12,%rbp

add %rbp,%rbx
mov %rbx,%r13
shr $4,%rbx
and %r10,%r13
and %r10,%rbx
add %rbx,%r13
imul %r11,%r13
shr $56,%r13
add %r13,%rax
skip: cmp $-2,%rdx
jz case0
case1: mov 8(%rdi,%rdx,8),%rcx
xor 8(%rsi,%rdx,8),%rcx
mov %r14,%r12
shr $1,%r14
and %r8,%r14
sub %r14,%r12
mov %r12,%r14
shr $2,%r12
and %r9,%r14
and %r9,%r12
add %r14,%r12

add %rcx,%rbp
add %r12,%rbp
mov %rbp,%r14
shr $4,%rbp
and %r10,%r14
and %r10,%rbp
add %rbp,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
skip:
cmp $-2,%rdx
jl case0
jz case1
case2:
mov 16(%rdi,%rdx,8),%rcx
xor 16(%rsi,%rdx,8),%rcx
mov %rcx,%rbp
shr $1,%rcx
and %r8,%rcx
@@ -122,17 +149,40 @@ case1: mov 8(%rdi,%rdx,8),%rcx
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rcx,%rbp
mov %rbp,%r13
shr $4,%rbp
add %rbp,%r13
and %r10,%r13
imul %r11,%r13
shr $56,%r13
add %r13,%rax
case0: pop %rbx
pop %rbp
pop %r13
add %rbp,%rcx

mov %rcx,%r14
shr $4,%rcx
and %r10,%r14
and %r10,%rcx
add %rcx,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
dec %rdx
case1:
mov 16(%rdi,%rdx,8),%rcx
xor 16(%rsi,%rdx,8),%rcx
mov %rcx,%rbp
shr $1,%rcx
and %r8,%rcx
sub %rcx,%rbp
mov %rbp,%rcx
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rbp,%rcx

mov %rcx,%r14
shr $4,%rcx
and %r10,%r14
and %r10,%rcx
add %rcx,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
case0: pop %rbp
pop %r14
pop %r12
ret
EPILOGUE()
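The second file is the Windows x64 (yasm) build of the same kernel. The Microsoft x64 calling convention passes the two operand pointers and the limb count in rcx, rdx and r8, so the prologue copies them into rdi, rsi and rdx and the rest of the code keeps the register assignments of the System V version above. The K8 body being dropped in is a three-limb unrolled loop with a tail for the 0, 1 or 2 limbs left over (the lp/skip/case labels in the first file, the numbered local labels here). A rough C sketch of that loop shape, reusing swar_popcount from the sketch above (illustrative only, not the project's code):

/* Three limbs per iteration, then the leftover 0, 1 or 2 limbs. */
static uint64_t hamdist_unrolled(const uint64_t *ap, const uint64_t *bp, size_t n)
{
    uint64_t c = 0;
    size_t i = 0;
    for (; i + 3 <= n; i += 3)
        c += swar_popcount(ap[i]     ^ bp[i])
           + swar_popcount(ap[i + 1] ^ bp[i + 1])
           + swar_popcount(ap[i + 2] ^ bp[i + 2]);
    for (; i < n; i++)
        c += swar_popcount(ap[i] ^ bp[i]);
    return c;
}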
@@ -26,32 +26,33 @@

%include "yasm_mac.inc"

CPU Core2
CPU Athlon64
BITS 64

%define reg_save_list rbx, rsi, rdi, rbp, r12, r13
%define reg_save_list rsi, rdi, rbp, r12, r14

FRAME_PROC mpn_hamdist, 0, reg_save_list
mov rdi, rcx
mov rsi, rdx
mov rdx, r8

mov rdi, rcx
mov rsi, rdx
mov rdx, r8

mov r8, 0x5555555555555555
mov r9, 0x3333333333333333
mov r10, 0x0f0f0f0f0f0f0f0f
mov r11, 0x0101010101010101
mov rax, 0
sub rdx, 2
jc .3
mov rcx, [rdi+rdx*8+8]
xor rcx, [rsi+rdx*8+8]
mov r12, [rdi+rdx*8]
xor r12, [rsi+rdx*8]
sub rdx, 2
xor eax, eax
sub rdx, 3
jc .2

mov rcx, [rdi+rdx*8+16]
xor rcx, [rsi+rdx*8+16]
mov r12, [rdi+rdx*8+8]
xor r12, [rsi+rdx*8+8]
mov r14, [rdi+rdx*8]
xor r14, [rsi+rdx*8]
sub rdx, 3
jc .1
xalign 16
.1: mov rbp, rcx
.0: mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
@@ -59,66 +60,44 @@
shr rbp, 2
and rcx, r9
and rbp, r9
add rbp, rcx

mov rbx, r12
add rcx, rbp
mov rbp, r12
shr r12, 1
and r12, r8
sub rbx, r12
mov rcx, [rdi+rdx*8+8]
mov r12, rbx
shr rbx, 2
and r12, r9
xor rcx, [rsi+rdx*8+8]
and rbx, r9
add rbx, r12

add rbx, rbp
mov r13, rbx
mov r12, [rdi+rdx*8]
xor r12, [rsi+rdx*8]
shr rbx, 4
and r13, r10
and rbx, r10
add r13, rbx
imul r13, r11
shr r13, 56
add rax, r13
sub rdx, 2
jnc .1
.2: mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
mov rcx, rbp
sub rbp, r12
mov r12, rbp
shr rbp, 2
and rcx, r9
and rbp, r9
add rbp, rcx

mov rbx, r12
shr r12, 1
and r12, r8
sub rbx, r12
mov r12, rbx
shr rbx, 2
and r12, r9
and rbx, r9
add rbx, r12

add rbx, rbp
mov r13, rbx
shr rbx, 4
and r13, r10
and rbx, r10
add r13, rbx
imul r13, r11
shr r13, 56
add rax, r13
.3: cmp rdx, -2
jz .5
.4: mov rcx, [rdi+rdx*8+8]
xor rcx, [rsi+rdx*8+8]
and rbp, r9
add rbp, r12
mov r12, r14
shr r14, 1
and r14, r8
sub r12, r14
mov r14, r12
shr r12, 2
and r14, r9
and r12, r9
add r12, r14
add rbp, rcx
add rbp, r12
mov rcx, [rdi+rdx*8+16]
mov r14, rbp
shr rbp, 4
and r14, r10
xor rcx, [rsi+rdx*8+16]
mov r12, [rdi+rdx*8+8]
xor r12, [rsi+rdx*8+8]
and rbp, r10
add r14, rbp
imul r14, r11
shr r14, 56
add rax, r14
mov r14, [rdi+rdx*8]
xor r14, [rsi+rdx*8]
sub rdx, 3
jnc .0
.1:
mov rbp, rcx
shr rcx, 1
and rcx, r8
@@ -127,14 +106,80 @@
shr rbp, 2
and rcx, r9
and rbp, r9
add rcx, rbp
mov rbp, r12
shr r12, 1
and r12, r8
sub rbp, r12
mov r12, rbp
shr rbp, 2
and r12, r9
and rbp, r9
add rbp, r12
mov r12, r14
shr r14, 1
and r14, r8
sub r12, r14
mov r14, r12
shr r12, 2
and r14, r9
and r12, r9
add r12, r14
add rbp, rcx
mov r13, rbp
add rbp, r12
mov r14, rbp
shr rbp, 4
add r13, rbp
and r13, r10
imul r13, r11
shr r13, 56
add rax, r13
and r14, r10
and rbp, r10
add r14, rbp
imul r14, r11
shr r14, 56
add rax, r14
.2:
cmp rdx, -2
jl .5
jz .4
.3:
mov rcx, [rdi+rdx*8+16]
xor rcx, [rsi+rdx*8+16]
mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
mov rcx, rbp
shr rbp, 2
and rcx, r9
and rbp, r9
add rcx, rbp
mov r14, rcx
shr rcx, 4
and r14, r10
and rcx, r10
add r14, rcx
imul r14, r11
shr r14, 56
add rax, r14
dec rdx
.4:
mov rcx, [rdi+rdx*8+16]
xor rcx, [rsi+rdx*8+16]
mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
mov rcx, rbp
shr rbp, 2
and rcx, r9
and rbp, r9
add rcx, rbp
mov r14, rcx
shr rcx, 4
and r14, r10
and rcx, r10
add r14, rcx
imul r14, r11
shr r14, 56
add rax, r14
.5: END_PROC reg_save_list

end
end