mpir/mpn/x86_64w/amd64/sublsh_n.asm


;  Copyright 2009 Jason Moxham
;
;  Windows Conversion Copyright 2008 Brian Gladman
;
;  This file is part of the MPIR Library.
;
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.

;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.
;
;  mp_limb_t  mpn_sublsh_n(mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_uint)
;  mp_limb_t mpn_sublsh_nc(mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_uint, mp_limb_t)
;  rax                        rdi     rsi     rdx        rcx       r8         r9
;  rax                        rcx     rdx      r8        r9d [rsp+40]   [rsp+48]

%include "..\yasm_mac.inc"

    CPU  Athlon64
    BITS 64

%define reg_save_list rbx, rsi, rdi, rbp, r12, r13, r14, r15

	; mpn_sublsh_n: entry point without a carry-in limb.
	; Win64 arguments on entry: rcx, rdx, r8 = operand pointers,
	; r9d = limb count, [rsp+40] = shift count.
	; The shared code at `entry` indexes the operands with r10 (limb count)
	; and treats r9 as the carry-in limb, so the count must be copied out of
	; r9 into r10 BEFORE r9 is cleared; otherwise r10 is left uninitialised.
	LEAF_PROC mpn_sublsh_n
	movsxd  r10, r9d                ; r10 = limb count, sign-extended (same as mpn_sublsh_nc)
	xor     r9, r9                  ; carry-in = 0
    jmp     entry                   ; continue in the body shared with mpn_sublsh_nc

	; mpn_sublsh_nc: entry point that takes a carry-in limb as 6th argument,
	; followed by the body shared with the `jmp entry` above.
	;
	; For each limb i (low to high) the body stores
	;     dst[i] = src1[i] - ((src2[i] << shift) | bits carried from below)
	; propagating the borrow, and returns in rax the final borrow (0 or 1)
	; plus the bits shifted out of the top limb of src2.
	;
	; Win64 arguments on entry:
	;     rcx = dst, rdx = src1, r8 = src2 (the operand that is shifted),
	;     r9d = n (limb count), [rsp+40] = shift, [rsp+48] = carry-in limb
	; NOTE(review): the shl/shr pairing below only combines correctly for
	; 1 <= shift <= 63 - confirm that callers guarantee this.
	LEAF_PROC mpn_sublsh_nc
	movsxd  r10, r9d                ; r10 = n, sign-extended to 64 bits
	mov     r9, [rsp+48]            ; r9 = carry-in limb (6th argument)
	; Shared body: reached with r10 = n and r9 = carry-in limb.
	; NOTE(review): FRAME_PROC / END_PROC are macros from yasm_mac.inc, which
	; is not visible here; presumably they save/restore reg_save_list and
	; define stack_use as the number of bytes pushed - confirm in the include.
entry:
	FRAME_PROC ?mpn_sublsh, 0, reg_save_list
	lea     rdi, [rcx+r10*8]        ; rdi = dst  + n limbs (one past the end)
	lea     rsi, [rdx+r10*8]        ; rsi = src1 + n limbs
	lea     rdx, [r8+r10*8]         ; rdx = src2 + n limbs
	movsxd  rcx, dword [rsp+stack_use+40] ; rcx = shift count (5th argument)

	neg     rcx                     ; cl = -shift; 64-bit shifts use cl mod 64, i.e. 64 - shift
	shr     r9, cl                  ; r9 = carry-in >> (64 - shift): bits to OR into the lowest limb
	neg     r10                     ; r10 = -n; negative index that counts up towards zero
	xor     rax, rax                ; rax = saved borrow: 0 = no borrow, -1 = borrow
	test    r10, 3
	jz      L_next                  ; n is a multiple of 4: no single-limb lead-in needed
	; Lead-in loop: one limb per pass until the remaining count is a
	; multiple of 4.  Invariant at the top: rcx = -shift, r9 = bits carried
	; up from the previous limb, rax = 0 / -1 borrow.
L_lp:
	mov     r8, [rdx+r10*8]         ; r8 = src2 limb
	mov     r11, r8                 ; r11 = copy used for the bits shifted out
	neg     rcx                     ; cl = shift
	shl     r8, cl                  ; r8 = limb << shift
	neg     rcx                     ; cl = 64 - shift again
	shr     r11, cl                 ; r11 = limb >> (64 - shift): bits for the next limb
	or      r8, r9                  ; merge bits carried up from the previous limb
	mov     r9, r11                 ; remember bits for the next limb
	add     rax, 1                  ; restore CF from the saved borrow (-1 + 1 sets CF)
	mov     r11, [rsi+r10*8]        ; r11 = src1 limb (mov leaves CF intact)
	sbb     r11, r8                 ; r11 = src1 - shifted src2 - borrow
	sbb     rax, rax                ; save the new borrow as 0 / -1
	mov     [rdi+r10*8], r11        ; store the result limb
	inc     r10
	test    r10, 3
	jnz     L_lp                    ; repeat until the index is a multiple of 4
L_next:
	cmp     r10, 0
	jz      L_end                   ; no limbs left for the unrolled loop

	; NOTE(review): xalign is a macro from yasm_mac.inc; presumably it pads
	; so that L_loop starts on a 16-byte boundary - confirm in the include.
	xalign  16
	; Main loop: four limbs per pass, same invariants as the lead-in loop.
L_loop:
	mov     r8, [rdx+r10*8]         ; load four src2 limbs ...
	mov     rbp, [rdx+r10*8+8]
	mov     rbx, [rdx+r10*8+16]
	mov     r12, [rdx+r10*8+24]
	mov     r11, r8                 ; ... and keep a copy of each for the shifted-out bits
	mov     r13, rbp
	mov     r14, rbx
	mov     r15, r12
	neg     rcx                     ; cl = shift
	shl     r8, cl                  ; low parts: limb << shift
	shl     rbp, cl
	shl     rbx, cl
	shl     r12, cl
	neg     rcx                     ; cl = 64 - shift
	shr     r11, cl                 ; high parts: limb >> (64 - shift)
	shr     r13, cl
	shr     r14, cl
	shr     r15, cl
	or      r8, r9                  ; limb 0 takes the bits carried in from the previous pass
	or      rbp, r11                ; limb 1 takes the bits shifted out of limb 0
	or      rbx, r13                ; limb 2 takes the bits shifted out of limb 1
	or      r12, r14                ; limb 3 takes the bits shifted out of limb 2
	mov     r9, r15                 ; bits shifted out of limb 3 go to the next pass
	add     rax, 1                  ; restore CF from the saved borrow
	mov     r11, [rsi+r10*8]        ; load four src1 limbs (mov leaves CF intact)
	mov     r13, [rsi+r10*8+8]
	mov     r14, [rsi+r10*8+16]
	mov     r15, [rsi+r10*8+24]
	sbb     r11, r8                 ; borrow-chained subtraction of the shifted limbs
	sbb     r13, rbp
	sbb     r14, rbx
	sbb     r15, r12
	sbb     rax, rax                ; save the new borrow as 0 / -1
	mov     [rdi+r10*8], r11        ; store four result limbs
	mov     [rdi+r10*8+8], r13
	mov     [rdi+r10*8+16], r14
	mov     [rdi+r10*8+24], r15
	add     r10, 4
	jnz     L_loop                  ; index reaches zero after the last group of four
	; Return value: borrow (0 or 1) plus the bits left in r9, i.e. those
	; shifted out of the top limb (or the shifted carry-in when n = 0).
L_end:
	neg     rax                     ; 0 / -1  ->  0 / 1
	add     rax, r9
    END_PROC reg_save_list

    end