Faster Core2/Penryn mpn_hamdist by using the K8 version

This commit is contained in:
jasonmoxham 2010-12-05 07:49:17 +00:00
parent 77e6892a3c
commit a87caeb1f8
2 changed files with 240 additions and 145 deletions

View File

@ -1,6 +1,6 @@
dnl mpn_hamdist
dnl Copyright 2009 Jason Moxham
dnl Copyright 2010 The Code Cavern
dnl This file is part of the MPIR Library.
@ -27,25 +27,25 @@ C rax rdi, rsi, rdx
ASM_START()
PROLOGUE(mpn_hamdist)
push %r12
push %r13
push %r14
push %rbp
push %rbx
mov $0x5555555555555555,%r8
mov $0x3333333333333333,%r9
mov $0x0f0f0f0f0f0f0f0f,%r10
mov $0x0101010101010101,%r11
mov $0,%rax
sub $2,%rdx
xor %eax,%eax
sub $3,%rdx
jc skip
mov 8(%rdi,%rdx,8),%rcx
xor 8(%rsi,%rdx,8),%rcx
mov (%rdi,%rdx,8),%r12
xor (%rsi,%rdx,8),%r12
sub $2,%rdx
mov 16(%rdi,%rdx,8),%rcx
xor 16(%rsi,%rdx,8),%rcx
mov 8(%rdi,%rdx,8),%r12
xor 8(%rsi,%rdx,8),%r12
mov (%rdi,%rdx,8),%r14
xor (%rsi,%rdx,8),%r14
sub $3,%rdx
jc skiplp
ALIGN(16)
lp:
mov %rcx,%rbp
lp: mov %rcx,%rbp
shr $1,%rcx
and %r8,%rcx
sub %rcx,%rbp
@ -53,32 +53,45 @@ lp:
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rcx,%rbp
mov %r12,%rbx
add %rbp,%rcx
mov %r12,%rbp
shr $1,%r12
and %r8,%r12
sub %r12,%rbx
mov 8(%rdi,%rdx,8),%rcx
mov %rbx,%r12
shr $2,%rbx
sub %r12,%rbp
mov %rbp,%r12
shr $2,%rbp
and %r9,%r12
xor 8(%rsi,%rdx,8),%rcx
and %r9,%rbx
add %r12,%rbx
add %rbp,%rbx
mov %rbx,%r13
mov (%rdi,%rdx,8),%r12
xor (%rsi,%rdx,8),%r12
shr $4,%rbx
and %r10,%r13
and %r10,%rbx
add %rbx,%r13
imul %r11,%r13
shr $56,%r13
add %r13,%rax
sub $2,%rdx
and %r9,%rbp
add %r12,%rbp
mov %r14,%r12
shr $1,%r14
and %r8,%r14
sub %r14,%r12
mov %r12,%r14
shr $2,%r12
and %r9,%r14
and %r9,%r12
add %r14,%r12
add %rcx,%rbp
add %r12,%rbp
mov 16(%rdi,%rdx,8),%rcx
mov %rbp,%r14
shr $4,%rbp
and %r10,%r14
xor 16(%rsi,%rdx,8),%rcx
mov 8(%rdi,%rdx,8),%r12
xor 8(%rsi,%rdx,8),%r12
and %r10,%rbp
add %rbp,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
mov (%rdi,%rdx,8),%r14
xor (%rsi,%rdx,8),%r14
sub $3,%rdx
jnc lp
skiplp:
mov %rcx,%rbp
@ -89,31 +102,45 @@ skiplp:
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rcx,%rbp
add %rbp,%rcx
mov %r12,%rbx
mov %r12,%rbp
shr $1,%r12
and %r8,%r12
sub %r12,%rbx
mov %rbx,%r12
shr $2,%rbx
sub %r12,%rbp
mov %rbp,%r12
shr $2,%rbp
and %r9,%r12
and %r9,%rbx
add %r12,%rbx
and %r9,%rbp
add %r12,%rbp
add %rbp,%rbx
mov %rbx,%r13
shr $4,%rbx
and %r10,%r13
and %r10,%rbx
add %rbx,%r13
imul %r11,%r13
shr $56,%r13
add %r13,%rax
skip: cmp $-2,%rdx
jz case0
case1: mov 8(%rdi,%rdx,8),%rcx
xor 8(%rsi,%rdx,8),%rcx
mov %r14,%r12
shr $1,%r14
and %r8,%r14
sub %r14,%r12
mov %r12,%r14
shr $2,%r12
and %r9,%r14
and %r9,%r12
add %r14,%r12
add %rcx,%rbp
add %r12,%rbp
mov %rbp,%r14
shr $4,%rbp
and %r10,%r14
and %r10,%rbp
add %rbp,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
skip:
cmp $-2,%rdx
jl case0
jz case1
case2:
mov 16(%rdi,%rdx,8),%rcx
xor 16(%rsi,%rdx,8),%rcx
mov %rcx,%rbp
shr $1,%rcx
and %r8,%rcx
@ -122,17 +149,40 @@ case1: mov 8(%rdi,%rdx,8),%rcx
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rcx,%rbp
mov %rbp,%r13
shr $4,%rbp
add %rbp,%r13
and %r10,%r13
imul %r11,%r13
shr $56,%r13
add %r13,%rax
case0: pop %rbx
pop %rbp
pop %r13
add %rbp,%rcx
mov %rcx,%r14
shr $4,%rcx
and %r10,%r14
and %r10,%rcx
add %rcx,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
dec %rdx
case1:
mov 16(%rdi,%rdx,8),%rcx
xor 16(%rsi,%rdx,8),%rcx
mov %rcx,%rbp
shr $1,%rcx
and %r8,%rcx
sub %rcx,%rbp
mov %rbp,%rcx
shr $2,%rbp
and %r9,%rcx
and %r9,%rbp
add %rbp,%rcx
mov %rcx,%r14
shr $4,%rcx
and %r10,%r14
and %r10,%rcx
add %rcx,%r14
imul %r11,%r14
shr $56,%r14
add %r14,%rax
case0: pop %rbp
pop %r14
pop %r12
ret
EPILOGUE()

View File

@ -26,32 +26,33 @@
%include "yasm_mac.inc"
CPU Core2
CPU Athlon64
BITS 64
%define reg_save_list rbx, rsi, rdi, rbp, r12, r13
%define reg_save_list rsi, rdi, rbp, r12, r14
FRAME_PROC mpn_hamdist, 0, reg_save_list
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov r8, 0x5555555555555555
mov r9, 0x3333333333333333
mov r10, 0x0f0f0f0f0f0f0f0f
mov r11, 0x0101010101010101
mov rax, 0
sub rdx, 2
jc .3
mov rcx, [rdi+rdx*8+8]
xor rcx, [rsi+rdx*8+8]
mov r12, [rdi+rdx*8]
xor r12, [rsi+rdx*8]
sub rdx, 2
xor eax, eax
sub rdx, 3
jc .2
mov rcx, [rdi+rdx*8+16]
xor rcx, [rsi+rdx*8+16]
mov r12, [rdi+rdx*8+8]
xor r12, [rsi+rdx*8+8]
mov r14, [rdi+rdx*8]
xor r14, [rsi+rdx*8]
sub rdx, 3
jc .1
xalign 16
.1: mov rbp, rcx
.0: mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
@ -59,66 +60,44 @@
shr rbp, 2
and rcx, r9
and rbp, r9
add rbp, rcx
mov rbx, r12
add rcx, rbp
mov rbp, r12
shr r12, 1
and r12, r8
sub rbx, r12
mov rcx, [rdi+rdx*8+8]
mov r12, rbx
shr rbx, 2
and r12, r9
xor rcx, [rsi+rdx*8+8]
and rbx, r9
add rbx, r12
add rbx, rbp
mov r13, rbx
mov r12, [rdi+rdx*8]
xor r12, [rsi+rdx*8]
shr rbx, 4
and r13, r10
and rbx, r10
add r13, rbx
imul r13, r11
shr r13, 56
add rax, r13
sub rdx, 2
jnc .1
.2: mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
mov rcx, rbp
sub rbp, r12
mov r12, rbp
shr rbp, 2
and rcx, r9
and rbp, r9
add rbp, rcx
mov rbx, r12
shr r12, 1
and r12, r8
sub rbx, r12
mov r12, rbx
shr rbx, 2
and r12, r9
and rbx, r9
add rbx, r12
add rbx, rbp
mov r13, rbx
shr rbx, 4
and r13, r10
and rbx, r10
add r13, rbx
imul r13, r11
shr r13, 56
add rax, r13
.3: cmp rdx, -2
jz .5
.4: mov rcx, [rdi+rdx*8+8]
xor rcx, [rsi+rdx*8+8]
and rbp, r9
add rbp, r12
mov r12, r14
shr r14, 1
and r14, r8
sub r12, r14
mov r14, r12
shr r12, 2
and r14, r9
and r12, r9
add r12, r14
add rbp, rcx
add rbp, r12
mov rcx, [rdi+rdx*8+16]
mov r14, rbp
shr rbp, 4
and r14, r10
xor rcx, [rsi+rdx*8+16]
mov r12, [rdi+rdx*8+8]
xor r12, [rsi+rdx*8+8]
and rbp, r10
add r14, rbp
imul r14, r11
shr r14, 56
add rax, r14
mov r14, [rdi+rdx*8]
xor r14, [rsi+rdx*8]
sub rdx, 3
jnc .0
.1:
mov rbp, rcx
shr rcx, 1
and rcx, r8
@ -127,14 +106,80 @@
shr rbp, 2
and rcx, r9
and rbp, r9
add rcx, rbp
mov rbp, r12
shr r12, 1
and r12, r8
sub rbp, r12
mov r12, rbp
shr rbp, 2
and r12, r9
and rbp, r9
add rbp, r12
mov r12, r14
shr r14, 1
and r14, r8
sub r12, r14
mov r14, r12
shr r12, 2
and r14, r9
and r12, r9
add r12, r14
add rbp, rcx
mov r13, rbp
add rbp, r12
mov r14, rbp
shr rbp, 4
add r13, rbp
and r13, r10
imul r13, r11
shr r13, 56
add rax, r13
and r14, r10
and rbp, r10
add r14, rbp
imul r14, r11
shr r14, 56
add rax, r14
.2:
cmp rdx, -2
jl .5
jz .4
.3:
mov rcx, [rdi+rdx*8+16]
xor rcx, [rsi+rdx*8+16]
mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
mov rcx, rbp
shr rbp, 2
and rcx, r9
and rbp, r9
add rcx, rbp
mov r14, rcx
shr rcx, 4
and r14, r10
and rcx, r10
add r14, rcx
imul r14, r11
shr r14, 56
add rax, r14
dec rdx
.4:
mov rcx, [rdi+rdx*8+16]
xor rcx, [rsi+rdx*8+16]
mov rbp, rcx
shr rcx, 1
and rcx, r8
sub rbp, rcx
mov rcx, rbp
shr rbp, 2
and rcx, r9
and rbp, r9
add rcx, rbp
mov r14, rcx
shr rcx, 4
and r14, r10
and rcx, r10
add r14, rcx
imul r14, r11
shr r14, 56
add rax, r14
.5: END_PROC reg_save_list
end
end