From a87caeb1f866f9601d666a2d6850bb214d30901b Mon Sep 17 00:00:00 2001 From: jasonmoxham Date: Sun, 5 Dec 2010 07:49:17 +0000 Subject: [PATCH] faster core2/penryn mpn_hamdist by using the K8 version --- mpn/x86_64/core2/hamdist.asm | 182 +++++++++++++++++++----------- mpn/x86_64w/core2/hamdist.asm | 203 +++++++++++++++++++++------------- 2 files changed, 240 insertions(+), 145 deletions(-) diff --git a/mpn/x86_64/core2/hamdist.asm b/mpn/x86_64/core2/hamdist.asm index c1fbb699..d06f7a2a 100644 --- a/mpn/x86_64/core2/hamdist.asm +++ b/mpn/x86_64/core2/hamdist.asm @@ -1,6 +1,6 @@ dnl mpn_hamdist -dnl Copyright 2009 Jason Moxham +dnl Copyright 2010 The Code Cavern dnl This file is part of the MPIR Library. @@ -27,25 +27,25 @@ C rax rdi, rsi, rdx ASM_START() PROLOGUE(mpn_hamdist) push %r12 -push %r13 +push %r14 push %rbp -push %rbx mov $0x5555555555555555,%r8 mov $0x3333333333333333,%r9 mov $0x0f0f0f0f0f0f0f0f,%r10 mov $0x0101010101010101,%r11 -mov $0,%rax -sub $2,%rdx +xor %eax,%eax +sub $3,%rdx jc skip - mov 8(%rdi,%rdx,8),%rcx - xor 8(%rsi,%rdx,8),%rcx - mov (%rdi,%rdx,8),%r12 - xor (%rsi,%rdx,8),%r12 -sub $2,%rdx + mov 16(%rdi,%rdx,8),%rcx + xor 16(%rsi,%rdx,8),%rcx + mov 8(%rdi,%rdx,8),%r12 + xor 8(%rsi,%rdx,8),%r12 + mov (%rdi,%rdx,8),%r14 + xor (%rsi,%rdx,8),%r14 +sub $3,%rdx jc skiplp ALIGN(16) -lp: - mov %rcx,%rbp +lp: mov %rcx,%rbp shr $1,%rcx and %r8,%rcx sub %rcx,%rbp @@ -53,32 +53,45 @@ lp: shr $2,%rbp and %r9,%rcx and %r9,%rbp - add %rcx,%rbp - - mov %r12,%rbx + add %rbp,%rcx + + mov %r12,%rbp shr $1,%r12 and %r8,%r12 - sub %r12,%rbx - mov 8(%rdi,%rdx,8),%rcx - mov %rbx,%r12 - shr $2,%rbx + sub %r12,%rbp + mov %rbp,%r12 + shr $2,%rbp and %r9,%r12 - xor 8(%rsi,%rdx,8),%rcx - and %r9,%rbx - add %r12,%rbx - - add %rbp,%rbx - mov %rbx,%r13 - mov (%rdi,%rdx,8),%r12 - xor (%rsi,%rdx,8),%r12 - shr $4,%rbx - and %r10,%r13 - and %r10,%rbx - add %rbx,%r13 - imul %r11,%r13 - shr $56,%r13 - add %r13,%rax - sub $2,%rdx + and %r9,%rbp + add %r12,%rbp + + mov %r14,%r12 + shr $1,%r14 + and %r8,%r14 + sub %r14,%r12 + mov %r12,%r14 + shr $2,%r12 + and %r9,%r14 + and %r9,%r12 + add %r14,%r12 + + add %rcx,%rbp + add %r12,%rbp + mov 16(%rdi,%rdx,8),%rcx + mov %rbp,%r14 + shr $4,%rbp + and %r10,%r14 + xor 16(%rsi,%rdx,8),%rcx + mov 8(%rdi,%rdx,8),%r12 + xor 8(%rsi,%rdx,8),%r12 + and %r10,%rbp + add %rbp,%r14 + imul %r11,%r14 + shr $56,%r14 + add %r14,%rax + mov (%rdi,%rdx,8),%r14 + xor (%rsi,%rdx,8),%r14 + sub $3,%rdx jnc lp skiplp: mov %rcx,%rbp @@ -89,31 +102,45 @@ skiplp: shr $2,%rbp and %r9,%rcx and %r9,%rbp - add %rcx,%rbp + add %rbp,%rcx - mov %r12,%rbx + mov %r12,%rbp shr $1,%r12 and %r8,%r12 - sub %r12,%rbx - mov %rbx,%r12 - shr $2,%rbx + sub %r12,%rbp + mov %rbp,%r12 + shr $2,%rbp and %r9,%r12 - and %r9,%rbx - add %r12,%rbx + and %r9,%rbp + add %r12,%rbp - add %rbp,%rbx - mov %rbx,%r13 - shr $4,%rbx - and %r10,%r13 - and %r10,%rbx - add %rbx,%r13 - imul %r11,%r13 - shr $56,%r13 - add %r13,%rax -skip: cmp $-2,%rdx - jz case0 -case1: mov 8(%rdi,%rdx,8),%rcx - xor 8(%rsi,%rdx,8),%rcx + mov %r14,%r12 + shr $1,%r14 + and %r8,%r14 + sub %r14,%r12 + mov %r12,%r14 + shr $2,%r12 + and %r9,%r14 + and %r9,%r12 + add %r14,%r12 + + add %rcx,%rbp + add %r12,%rbp + mov %rbp,%r14 + shr $4,%rbp + and %r10,%r14 + and %r10,%rbp + add %rbp,%r14 + imul %r11,%r14 + shr $56,%r14 + add %r14,%rax +skip: + cmp $-2,%rdx + jl case0 + jz case1 +case2: + mov 16(%rdi,%rdx,8),%rcx + xor 16(%rsi,%rdx,8),%rcx mov %rcx,%rbp shr $1,%rcx and %r8,%rcx @@ -122,17 +149,40 @@ case1: mov 8(%rdi,%rdx,8),%rcx shr $2,%rbp and %r9,%rcx 
and %r9,%rbp - add %rcx,%rbp - mov %rbp,%r13 - shr $4,%rbp - add %rbp,%r13 - and %r10,%r13 - imul %r11,%r13 - shr $56,%r13 - add %r13,%rax -case0: pop %rbx - pop %rbp - pop %r13 + add %rbp,%rcx + + mov %rcx,%r14 + shr $4,%rcx + and %r10,%r14 + and %r10,%rcx + add %rcx,%r14 + imul %r11,%r14 + shr $56,%r14 + add %r14,%rax + dec %rdx +case1: + mov 16(%rdi,%rdx,8),%rcx + xor 16(%rsi,%rdx,8),%rcx + mov %rcx,%rbp + shr $1,%rcx + and %r8,%rcx + sub %rcx,%rbp + mov %rbp,%rcx + shr $2,%rbp + and %r9,%rcx + and %r9,%rbp + add %rbp,%rcx + + mov %rcx,%r14 + shr $4,%rcx + and %r10,%r14 + and %r10,%rcx + add %rcx,%r14 + imul %r11,%r14 + shr $56,%r14 + add %r14,%rax +case0: pop %rbp + pop %r14 pop %r12 ret EPILOGUE() diff --git a/mpn/x86_64w/core2/hamdist.asm b/mpn/x86_64w/core2/hamdist.asm index 0faaf462..abaff951 100644 --- a/mpn/x86_64w/core2/hamdist.asm +++ b/mpn/x86_64w/core2/hamdist.asm @@ -26,32 +26,33 @@ %include "yasm_mac.inc" - CPU Core2 + CPU Athlon64 BITS 64 -%define reg_save_list rbx, rsi, rdi, rbp, r12, r13 +%define reg_save_list rsi, rdi, rbp, r12, r14 FRAME_PROC mpn_hamdist, 0, reg_save_list - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov r8, 0x5555555555555555 mov r9, 0x3333333333333333 mov r10, 0x0f0f0f0f0f0f0f0f mov r11, 0x0101010101010101 - mov rax, 0 - sub rdx, 2 - jc .3 - mov rcx, [rdi+rdx*8+8] - xor rcx, [rsi+rdx*8+8] - mov r12, [rdi+rdx*8] - xor r12, [rsi+rdx*8] - sub rdx, 2 + xor eax, eax + sub rdx, 3 jc .2 - + mov rcx, [rdi+rdx*8+16] + xor rcx, [rsi+rdx*8+16] + mov r12, [rdi+rdx*8+8] + xor r12, [rsi+rdx*8+8] + mov r14, [rdi+rdx*8] + xor r14, [rsi+rdx*8] + sub rdx, 3 + jc .1 xalign 16 -.1: mov rbp, rcx +.0: mov rbp, rcx shr rcx, 1 and rcx, r8 sub rbp, rcx @@ -59,66 +60,44 @@ shr rbp, 2 and rcx, r9 and rbp, r9 - add rbp, rcx - - mov rbx, r12 + add rcx, rbp + mov rbp, r12 shr r12, 1 and r12, r8 - sub rbx, r12 - mov rcx, [rdi+rdx*8+8] - mov r12, rbx - shr rbx, 2 - and r12, r9 - xor rcx, [rsi+rdx*8+8] - and rbx, r9 - add rbx, r12 - - add rbx, rbp - mov r13, rbx - mov r12, [rdi+rdx*8] - xor r12, [rsi+rdx*8] - shr rbx, 4 - and r13, r10 - and rbx, r10 - add r13, rbx - imul r13, r11 - shr r13, 56 - add rax, r13 - sub rdx, 2 - jnc .1 -.2: mov rbp, rcx - shr rcx, 1 - and rcx, r8 - sub rbp, rcx - mov rcx, rbp + sub rbp, r12 + mov r12, rbp shr rbp, 2 - and rcx, r9 - and rbp, r9 - add rbp, rcx - - mov rbx, r12 - shr r12, 1 - and r12, r8 - sub rbx, r12 - mov r12, rbx - shr rbx, 2 and r12, r9 - and rbx, r9 - add rbx, r12 - - add rbx, rbp - mov r13, rbx - shr rbx, 4 - and r13, r10 - and rbx, r10 - add r13, rbx - imul r13, r11 - shr r13, 56 - add rax, r13 -.3: cmp rdx, -2 - jz .5 -.4: mov rcx, [rdi+rdx*8+8] - xor rcx, [rsi+rdx*8+8] + and rbp, r9 + add rbp, r12 + mov r12, r14 + shr r14, 1 + and r14, r8 + sub r12, r14 + mov r14, r12 + shr r12, 2 + and r14, r9 + and r12, r9 + add r12, r14 + add rbp, rcx + add rbp, r12 + mov rcx, [rdi+rdx*8+16] + mov r14, rbp + shr rbp, 4 + and r14, r10 + xor rcx, [rsi+rdx*8+16] + mov r12, [rdi+rdx*8+8] + xor r12, [rsi+rdx*8+8] + and rbp, r10 + add r14, rbp + imul r14, r11 + shr r14, 56 + add rax, r14 + mov r14, [rdi+rdx*8] + xor r14, [rsi+rdx*8] + sub rdx, 3 + jnc .0 +.1: mov rbp, rcx shr rcx, 1 and rcx, r8 @@ -127,14 +106,80 @@ shr rbp, 2 and rcx, r9 and rbp, r9 + add rcx, rbp + mov rbp, r12 + shr r12, 1 + and r12, r8 + sub rbp, r12 + mov r12, rbp + shr rbp, 2 + and r12, r9 + and rbp, r9 + add rbp, r12 + mov r12, r14 + shr r14, 1 + and r14, r8 + sub r12, r14 + mov r14, r12 + shr r12, 2 + and r14, r9 + and r12, r9 + 
+	add r12, r14
 	add rbp, rcx
-	mov r13, rbp
+	add rbp, r12
+	mov r14, rbp
 	shr rbp, 4
-	add r13, rbp
-	and r13, r10
-	imul r13, r11
-	shr r13, 56
-	add rax, r13
+	and r14, r10
+	and rbp, r10
+	add r14, rbp
+	imul r14, r11
+	shr r14, 56
+	add rax, r14
+.2:
+	cmp rdx, -2
+	jl .5
+	jz .4
+.3:
+	mov rcx, [rdi+rdx*8+16]
+	xor rcx, [rsi+rdx*8+16]
+	mov rbp, rcx
+	shr rcx, 1
+	and rcx, r8
+	sub rbp, rcx
+	mov rcx, rbp
+	shr rbp, 2
+	and rcx, r9
+	and rbp, r9
+	add rcx, rbp
+	mov r14, rcx
+	shr rcx, 4
+	and r14, r10
+	and rcx, r10
+	add r14, rcx
+	imul r14, r11
+	shr r14, 56
+	add rax, r14
+	dec rdx
+.4:
+	mov rcx, [rdi+rdx*8+16]
+	xor rcx, [rsi+rdx*8+16]
+	mov rbp, rcx
+	shr rcx, 1
+	and rcx, r8
+	sub rbp, rcx
+	mov rcx, rbp
+	shr rbp, 2
+	and rcx, r9
+	and rbp, r9
+	add rcx, rbp
+	mov r14, rcx
+	shr rcx, 4
+	and r14, r10
+	and rcx, r10
+	add r14, rcx
+	imul r14, r11
+	shr r14, 56
+	add rax, r14
 .5:	END_PROC reg_save_list
-	end
+	end
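
Both versions compute the Hamming distance with the same SWAR population count the K8 code uses: each pair of limbs is XORed, the bit counts are folded into 2-, 4- and 8-bit fields with the 0x5555.../0x3333.../0x0f0f... masks, and the per-byte counts are summed by multiplying with 0x0101... and shifting right by 56. The unrolled loop processes three limbs per iteration and shares the final fold, multiply and shift between them, which is safe because each 4-bit field then holds at most 12 and the byte sum stays below 256. A minimal C sketch of the per-limb technique, with an illustrative name and plain uint64_t types rather than MPIR's actual mpn_hamdist prototype:

#include <stdint.h>
#include <stddef.h>

/* Reference-only sketch of the bit-counting scheme used by the assembly
   above: Hamming distance of two n-limb arrays via a SWAR popcount.    */
static uint64_t hamdist_ref(const uint64_t *xp, const uint64_t *yp, size_t n)
{
    uint64_t total = 0;
    for (size_t i = 0; i < n; i++) {
        uint64_t x = xp[i] ^ yp[i];                    /* differing bits      */
        x -= (x >> 1) & 0x5555555555555555ULL;         /* 2-bit field counts  */
        x = (x & 0x3333333333333333ULL)
          + ((x >> 2) & 0x3333333333333333ULL);        /* 4-bit field counts  */
        x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;    /* 8-bit field counts  */
        total += (x * 0x0101010101010101ULL) >> 56;    /* sum bytes into top  */
    }
    return total;
}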