;-----------------------------------------------------------------------------
;  mp_limb_t mpn_mul_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
;
;  Multiplies the limb vector {up, un} by {vp, vn} and stores the product
;  limbs through rp.  (rp/up/un/vp/vn are the names used in the comments of
;  this file for the five arguments, in prototype order.)
;
;  Precondition: 1st size >= 2nd size (un >= vn).
;
;  Argument            rp      up      un      vp      vn
;  ------------------------------------------------------------
;  on entry            rcx     rdx     r8      r9      [rsp+40]
;  in the main path    rdi     rsi     rdx     rcx     r8
;
;  The incoming registers match the Microsoft x64 convention; the main
;  path shuffles them into the second row right after FRAME_PROC.
;
;  Needs mulx (BMI2) and adcx/adox (ADX).
;
;  Flow:
;    un <= 2  -> straight-line code handling 1x1, 2x1 and 2x2 products
;                (an un below 2 is treated exactly like un == 1)
;    un >  2  -> first pass : rp[] = up[] * vp[0]      (mulx + adc chain)
;                each further vp limb: rp[] += up[] * vp[j]
;                                                     (mulx + adcx/adox)
;                Both loops are unrolled 8x and entered through a table of
;                32-bit offsets indexed by un mod 8.
;
;  Register roles in the main path:
;    rdi  destination cursor        rsi  up cursor
;    r14  vp cursor                 r8   vp limbs still to process
;    rdx  current vp limb (implicit mulx source)
;    rbp  un >> 3, reloaded into rcx as loop counter for each pass
;    rbx  -((un + 1) & -8), negative limb count used to rewind rsi/rdi
;    rax  un & 7, later the entry address for the accumulate passes
;    r10/r11 and r12/r9  alternating low/high product pairs
;
;  NOTE(review): the prototype above lists an mp_limb_t return, yet none of
;  the visible instructions load rax with a result before returning --
;  confirm that callers ignore the return value.
;-----------------------------------------------------------------------------

%include 'yasm_mac.inc'

%define reg_save_list rsi, rdi, rbx, rbp, r12, r14

        BITS 64

        align   16
        LEAF_PROC mpn_mul_basecase
        cmp     r8, 2                   ; result is consumed by ja and by je below
        ja      .general                ; un > 2 (unsigned): unrolled main path
        mov     r8, rdx                 ; r8 = up; rdx is about to become the multiplier
        mov     rdx, [r9]               ; rdx = vp[0]
        mulx    r11, rax, [r8]          ; r11:rax = up[0] * vp[0]
        mov     [rcx], rax
        je      .small_2xn              ; un == 2 (mov/mulx leave the cmp flags intact)

.small_1x1:                             ; un < 2: product is just r11:rax
        mov     [rcx+8], r11
        ret

.small_2xn:
        cmp     qword [rsp+40], 2       ; vn == 2 ?
        mulx    r10, rax, [r8+8]        ; r10:rax = up[1] * vp[0]
        je      .small_2x2

.small_2x1:                             ; vn != 2: finish the 2x1 product
        add     r11, rax
        adc     r10, 0
        mov     [rcx+8], r11
        mov     [rcx+16], r10
        ret

.small_2x2:
        add     r11, rax
        adc     r10, 0
        mov     rdx, [r9+8]             ; rdx = vp[1]
        mov     rax, r8                 ; rax = up, so r8/r9 can receive a product
        mulx    r9, r8, [rax]           ; r9:r8   = up[0] * vp[1]
        mulx    rdx, rax, [rax+8]       ; rdx:rax = up[1] * vp[1]
        add     rax, r9
        adc     rdx, 0
        add     r11, r8
        adc     r10, rax
        adc     rdx, 0
        mov     [rcx+8], r11
        mov     [rcx+16], r10
        mov     [rcx+24], rdx
        ret

;-----------------------------------------------------------------------------
;  Main path, un > 2.
;-----------------------------------------------------------------------------
        align   16
.general:
        FRAME_PROC mpn_mul_bc, 0, reg_save_list
        mov     rdi, rcx                ; rdi = rp
        mov     rsi, rdx                ; rsi = up
        mov     rdx, r8                 ; rdx = un
        mov     rcx, r9                 ; rcx = vp
        mov     r8, [rsp+stack_use+40]  ; r8 = vn; stack_use is not defined in this
                                        ; file -- presumably set by FRAME_PROC

        mov     r14, rcx                ; r14 = vp cursor
        lea     rbx, [rdx+1]
        mov     rbp, rdx
        mov     eax, edx
        and     rbx, -8
        shr     rbp, 3                  ; rbp = un >> 3
        neg     rbx                     ; rbx = -((un + 1) & -8)
        and     eax, 7                  ; rax = un & 7; also leaves CF = 0 for the adc chain
        mov     rcx, rbp                ; loop counter for the first pass
        mov     rdx, [r14]              ; rdx = vp[0]
        lea     r14, [r14+8]
        lea     r10, [rel .mul_tab]
        movsxd  r11, dword [r10+rax*4]  ; table holds offsets relative to its own base
        lea     r10, [r11+r10]
        jmp     r10

;  First-pass entry stubs, one per value of un & 7.  Each computes the first
;  product and biases rsi/rdi so the fixed displacements inside the shared
;  loop body line up.  inc does not modify CF, so the carry chain is safe.

.mul_in0:
        mulx    r11, r10, [rsi]
        lea     rsi, [rsi+56]
        lea     rdi, [rdi-8]
        jmp     .mul_b0

.mul_in3:
        mulx    r9, r12, [rsi]
        lea     rsi, [rsi+16]
        lea     rdi, [rdi+16]
        inc     rcx
        jmp     .mul_b3

.mul_in4:
        mulx    r11, r10, [rsi]
        lea     rsi, [rsi+24]
        lea     rdi, [rdi+24]
        inc     rcx
        jmp     .mul_b4

.mul_in5:
        mulx    r9, r12, [rsi]
        lea     rsi, [rsi+32]
        lea     rdi, [rdi+32]
        inc     rcx
        jmp     .mul_b5

.mul_in6:
        mulx    r11, r10, [rsi]
        lea     rsi, [rsi+40]
        lea     rdi, [rdi+40]
        inc     rcx
        jmp     .mul_b6

.mul_in7:
        mulx    r9, r12, [rsi]
        lea     rsi, [rsi+48]
        lea     rdi, [rdi+48]
        inc     rcx
        jmp     .mul_b7

.mul_in1:
        mulx    r9, r12, [rsi]
        jmp     .mul_b1

.mul_in2:
        mulx    r11, r10, [rsi]
        lea     rsi, [rsi+8]
        lea     rdi, [rdi+8]
        mulx    r9, r12, [rsi]
        ; falls through into the loop

;  First-pass loop: eight mulx per iteration, high halves folded into the
;  next low half with a single adc chain.  lea/mov/mulx/dec keep CF alive.

        align   16
.mul_top:
        mov     [rdi-8], r10
        adc     r12, r11
.mul_b1:
        mulx    r11, r10, [rsi+8]
        adc     r10, r9
        lea     rsi, [rsi+64]
        mov     [rdi], r12
.mul_b0:
        mov     [rdi+8], r10
        mulx    r9, r12, [rsi-48]
        lea     rdi, [rdi+64]
        adc     r12, r11
.mul_b7:
        mulx    r11, r10, [rsi-40]
        mov     [rdi-48], r12
        adc     r10, r9
.mul_b6:
        mov     [rdi-40], r10
        mulx    r9, r12, [rsi-32]
        adc     r12, r11
.mul_b5:
        mulx    r11, r10, [rsi-24]
        mov     [rdi-32], r12
        adc     r10, r9
.mul_b4:
        mulx    r9, r12, [rsi-16]
        mov     [rdi-24], r10
        adc     r12, r11
.mul_b3:
        mulx    r11, r10, [rsi-8]
        adc     r10, r9
        mov     [rdi-16], r12
        dec     rcx                     ; sets ZF for jnz, leaves CF alone
        mulx    r9, r12, [rsi]
        jnz     .mul_top

.mul_end:
        mov     [rdi-8], r10
        adc     r12, r11
        mov     [rdi], r12
        adc     r9, rcx                 ; rcx is 0 here: adds only the pending carry
        mov     [rdi+8], r9

        dec     r8                      ; one vp limb consumed
        jz      .done

        lea     r10, [rel .add_tab]
        movsxd  rax, dword [r10+rax*4]  ; rax still holds un & 7
        lea     rax, [rax+r10]          ; rax = entry stub reused by every later pass

;  Accumulate passes: one per remaining vp limb.

.outer:
        lea     rsi, [rsi+rbx*8]        ; rewind the up cursor
        mov     rcx, rbp                ; reload loop counter
        mov     rdx, [r14]              ; next vp limb
        lea     r14, [r14+8]
        jmp     rax

.add_in0:
        mulx    r11, r10, [rsi+8]
        lea     rdi, [rdi+rbx*8+8]
        lea     rcx, [rcx-1]            ; lea, not dec: flags must stay untouched
        jmp     .add_b0

.add_in3:
        mulx    r9, r12, [rsi-16]
        lea     rdi, [rdi+rbx*8-56]
        jmp     .add_b3

.add_in4:
        mulx    r11, r10, [rsi-24]
        lea     rdi, [rdi+rbx*8-56]
        jmp     .add_b4

.add_in5:
        mulx    r9, r12, [rsi-32]
        lea     rdi, [rdi+rbx*8-56]
        jmp     .add_b5

.add_in6:
        mulx    r11, r10, [rsi-40]
        lea     rdi, [rdi+rbx*8-56]
        jmp     .add_b6

.add_in7:
        mulx    r9, r12, [rsi+16]
        lea     rdi, [rdi+rbx*8+8]
        jmp     .add_b7

.add_in1:
        mulx    r9, r12, [rsi]
        lea     rdi, [rdi+rbx*8+8]
        jmp     .add_b1

.add_end:                               ; reached through jrcxz, so rcx == 0
        adox    r12, [rdi]
        adox    r9, rcx                 ; fold the OF chain into the top limb
        mov     [rdi], r12
        adc     r9, rcx                 ; fold the CF chain into the top limb
        mov     [rdi+8], r9
        dec     r8
        jnz     .outer

.done:
        END_PROC reg_save_list

.add_in2:
        mulx    r11, r10, [rsi-8]
        lea     rdi, [rdi+rbx*8+8]
        mulx    r9, r12, [rsi]
        ; falls through into the loop

;  Accumulate loop: adcx carries product high halves along CF while adox
;  adds the existing destination limbs along OF, so the two chains run
;  independently.  The counter is updated with lea and tested with jrcxz
;  because neither touches the flags.

        align   16
.add_top:
        adox    r10, [rdi-8]
        adcx    r12, r11
        mov     [rdi-8], r10
        jrcxz   .add_end
.add_b1:
        mulx    r11, r10, [rsi+8]
        adox    r12, [rdi]
        lea     rcx, [rcx-1]
        mov     [rdi], r12
        adcx    r10, r9
.add_b0:
        mulx    r9, r12, [rsi+16]
        adcx    r12, r11
        adox    r10, [rdi+8]
        mov     [rdi+8], r10
.add_b7:
        mulx    r11, r10, [rsi+24]
        lea     rsi, [rsi+64]
        adcx    r10, r9
        adox    r12, [rdi+16]
        mov     [rdi+16], r12
.add_b6:
        mulx    r9, r12, [rsi-32]
        adox    r10, [rdi+24]
        adcx    r12, r11
        mov     [rdi+24], r10
.add_b5:
        mulx    r11, r10, [rsi-24]
        adcx    r10, r9
        adox    r12, [rdi+32]
        mov     [rdi+32], r12
.add_b4:
        mulx    r9, r12, [rsi-16]
        adox    r10, [rdi+40]
        adcx    r12, r11
        mov     [rdi+40], r10
.add_b3:
        adox    r12, [rdi+48]
        mulx    r11, r10, [rsi-8]
        mov     [rdi+48], r12
        lea     rdi, [rdi+64]
        adcx    r10, r9
        mulx    r9, r12, [rsi]
        jmp     .add_top

;  Dispatch tables: signed 32-bit offsets from the table base, indexed by
;  un & 7 (entry k serves un mod 8 == k).

        align   8
.mul_tab:
        dd      .mul_in0 - .mul_tab
        dd      .mul_in1 - .mul_tab
        dd      .mul_in2 - .mul_tab
        dd      .mul_in3 - .mul_tab
        dd      .mul_in4 - .mul_tab
        dd      .mul_in5 - .mul_tab
        dd      .mul_in6 - .mul_tab
        dd      .mul_in7 - .mul_tab
.add_tab:
        dd      .add_in0 - .add_tab
        dd      .add_in1 - .add_tab
        dd      .add_in2 - .add_tab
        dd      .add_in3 - .add_tab
        dd      .add_in4 - .add_tab
        dd      .add_in5 - .add_tab
        dd      .add_in6 - .add_tab
        dd      .add_in7 - .add_tab