;  AMD64 mpn_rsh1sub_n

;  Copyright 2009 Jason Moxham

;  This file is part of the MPIR Library.

;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.

;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

;-----------------------------------------------------------------------
;  mpn_rsh1sub_n
;
;  Syntax: Yasm/NASM (Intel operand order), 64-bit mode.
;
;  Entry registers, as consumed by the code below:
;      rcx  = destination limb pointer
;      rdx  = minuend limb pointer
;      r8   = subtrahend limb pointer
;      r9d  = limb count n (only the low 32 bits are read; they are
;             sign-extended to 64 bits)
;
;  After the pointers have been moved into rdi/rsi/rdx and the count
;  into rcx, the operation is
;      (rdi,rcx) = ((rsi,rcx) - (rdx,rcx)) / 2
;  The subtraction is treated as two's complement: the borrow out of the
;  most significant limb becomes the top bit of the shifted result.
;
;  Returns in rax the bottom bit of the (unshifted) difference.
;
;  The first limb pair is read unconditionally, so n is presumably
;  required to be at least 1 -- confirm against the callers.
;
;  Registers written by the body: rax, rbx, rcx, rdx, rsi, rdi,
;  r8-r12 and the flags.  rbx, rsi, rdi and r12 are named in
;  reg_save_list, which is handed to FRAME_PROC / END_PROC.
;-----------------------------------------------------------------------

%include "..\yasm_mac.inc"

%define reg_save_list rbx, rsi, rdi, r12

    CPU  Athlon64
    BITS 64

    FRAME_PROC mpn_rsh1sub_n, 0, reg_save_list

; Bias every pointer by (n*8 - 32) so that one index register, r8,
; starting at 4 - n, addresses limb 0 at [ptr + r8*8] and can count
; upwards towards zero.
    movsxd  rax, r9d
    lea     rdi, [rcx+rax*8-32]     ; biased destination pointer
    lea     rsi, [rdx+rax*8-32]     ; biased minuend pointer
    lea     rdx, [r8+rax*8-32]      ; biased subtrahend pointer
    mov     rcx, rax
    mov     r8, 4
    sub     r8, rcx                 ; r8 = 4 - n

; Lowest limb of the difference.  It stays in r12, unshifted, until the
; limb above it is known and can supply its new top bit.
    mov     r12, [rsi+r8*8]
    sub     r12, [rdx+r8*8]
    sbb     rbx, rbx                ; rbx = 0 or -1: borrow kept as a mask
    mov     eax, r12d
    and     eax, 1                  ; rax = bit 0 of the difference
    test    r8, r8
    jns     L_tail                  ; n <= 4: no full four-limb pass

; Main loop.  On entry to each pass:
;     r12 = pending, not yet shifted difference limb at index r8
;     rbx = borrow mask from the previous subtraction
; Each pass subtracts limbs r8+1 .. r8+4, stores the shifted limbs
; r8 .. r8+3 and leaves limb r8+4 pending in r12.
L_loop:
    mov     r11, [rsi+r8*8+32]
    mov     r10, [rsi+r8*8+24]
    add     rbx, 1                  ; CF = saved borrow (-1 + 1 carries)
    mov     rcx, [rsi+r8*8+8]
    mov     r9, [rsi+r8*8+16]
    sbb     rcx, [rdx+r8*8+8]
    sbb     r9, [rdx+r8*8+16]
    sbb     r10, [rdx+r8*8+24]
    sbb     r11, [rdx+r8*8+32]
    sbb     rbx, rbx                ; save the new borrow as a mask
    bt      r11, 0                  ; CF = low bit of the highest limb
    rcr     r10, 1                  ; ... which becomes the top bit here
    rcr     r9, 1
    rcr     rcx, 1
    rcr     r12, 1
    mov     [rdi+r8*8], r12
    mov     [rdi+r8*8+8], rcx
    mov     [rdi+r8*8+16], r9
    mov     [rdi+r8*8+24], r10
    mov     r12, r11                ; highest limb is the next pending limb
    add     r8, 4
    jnc     L_loop                  ; carry out: r8 went from negative to 0..3

; Tail.  r8 is now 0, 1, 2 or 3 and 3 - r8 limbs remain above the
; pending limb in r12.
L_tail:
    cmp     r8, 2
    ja      L_rem0                  ; r8 = 3
    je      L_rem1                  ; r8 = 2
    test    r8, r8
    jnz     L_rem2                  ; r8 = 1

; r8 = 0: three limbs remain above the pending one.
L_rem3:
    mov     r10, [rsi+r8*8+24]
    add     rbx, 1                  ; CF = saved borrow
    mov     rcx, [rsi+r8*8+8]
    mov     r9, [rsi+r8*8+16]
    sbb     rcx, [rdx+r8*8+8]
    sbb     r9, [rdx+r8*8+16]
    sbb     r10, [rdx+r8*8+24]
    rcr     r10, 1                  ; final borrow becomes the top bit
    rcr     r9, 1
    rcr     rcx, 1
    rcr     r12, 1
    mov     [rdi+r8*8], r12
    mov     [rdi+r8*8+8], rcx
    mov     [rdi+r8*8+16], r9
    mov     [rdi+r8*8+24], r10
    jmp     L_done

; r8 = 1: two limbs remain above the pending one.
    xalign  16
L_rem2:
    add     rbx, 1                  ; CF = saved borrow
    mov     rcx, [rsi+r8*8+8]
    mov     r9, [rsi+r8*8+16]
    sbb     rcx, [rdx+r8*8+8]
    sbb     r9, [rdx+r8*8+16]
    rcr     r9, 1                   ; final borrow becomes the top bit
    rcr     rcx, 1
    rcr     r12, 1
    mov     [rdi+r8*8], r12
    mov     [rdi+r8*8+8], rcx
    mov     [rdi+r8*8+16], r9
    jmp     L_done

; r8 = 2: one limb remains above the pending one.
    xalign  16
L_rem1:
    add     rbx, 1                  ; CF = saved borrow
    mov     rcx, [rsi+r8*8+8]
    sbb     rcx, [rdx+r8*8+8]
    rcr     rcx, 1                  ; final borrow becomes the top bit
    rcr     r12, 1
    mov     [rdi+r8*8], r12
    mov     [rdi+r8*8+8], rcx
    jmp     L_done

; r8 = 3: only the pending limb is left; the saved borrow is its top bit.
    xalign  16
L_rem0:
    add     rbx, 1                  ; CF = saved borrow
    rcr     r12, 1
    mov     [rdi+r8*8], r12

L_done:
    END_PROC reg_save_list

    end