122 lines
2.9 KiB
NASM
122 lines
2.9 KiB
NASM
dnl IA-64 mpn_popcount.
|
|
|
|
dnl Copyright 2000, 2001, 2006 Free Software Foundation, Inc.
|
|
|
|
dnl This file is part of the GNU MP Library.
|
|
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
dnl it under the terms of the GNU Lesser General Public License as published
|
|
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
|
|
dnl your option) any later version.
|
|
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
dnl License for more details.
|
|
|
|
dnl You should have received a copy of the GNU Lesser General Public License
|
|
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
|
|
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
dnl Boston, MA 02110-1301, USA.
|
|
|
|
dnl Runs at 1 cycle/limb on the Itanium. That is the peak performance for the
|
|
dnl popcnt instruction, so this is optimal code. It should be straightforward
|
|
dnl to write mpn_hamdist with the same awesome performance.
|
|
|
|
include(`../config.m4')
|
|
|
|
dnl  Usage: ABI32(text)
dnl
dnl  Emit text only when HAVE_ABI_32 is defined, and nothing otherwise.  The
dnl  routine below uses it for the pointer and size extensions that only the
dnl  32-bit ABI needs.
dnl
define(ABI32,
m4_assert_onearg()
`ifdef(`HAVE_ABI_32', `$1', `')')
|
|
|
|
C INPUT PARAMETERS
C sp = r32      source pointer, post-incremented by 8 on every limb load
C n  = r33      number of limbs
C
C RESULT
C r8            sum of popcnt over the n limbs read through sp (0 if n = 0)
C
C REGISTER USE
C r2            copy of ar.lc taken on entry, written back before each return
C r16-r19       limbs loaded from memory, four per unrolled group
C r20-r23       popcnt results for r16-r19; r22 and r23 first hold n mod 4
C               and n / 4 until those values have been moved into ar.lc
C
C OUTLINE
C br.cloop branches and decrements ar.lc while ar.lc is non-zero and falls
C through once it is zero.  Each loop is entered through a br.cloop, so a
C loop whose count register is k runs exactly k times, and not at all for
C k = 0.
C
C 1. .Loop0 handles n mod 4 limbs one at a time.
C 2. The remaining n / 4 groups of four limbs are software pipelined: while
C    the counts of one group are added into r8, popcnt runs on the next
C    group and the group after that is being loaded.
C      .L1      load group 1
C      .L2      popcnt group 1, load group 2
C      .Loop    add counts, popcnt the loaded group, load the next group
C      .Ldone0  drain: one group of counts plus one loaded group in flight
C      .Ldone1  drain when there is only a single group in total

ASM_START()
PROLOGUE(mpn_popcount)
        .prologue
ABI32(`         addp4   r32 = 0, r32')          C M   src extend
C .save annotates the instruction that follows it, so it has to sit directly
C in front of the ar.lc copy and not in front of the optional addp4 above.
        .save   ar.lc, r2
                mov     r2 = ar.lc              C I0
ABI32(`         zxt4    r33 = r33')             C I1  size extend
                ;;
        .body

                and     r22 = 3, r33            C r22 = n mod 4
                shr.u   r23 = r33, 2    ;;      C r23 = n / 4
                mov     ar.lc = r22
                mov     r8 = 0          ;;      C clear the running total
                br.cloop.dpnt   .Loop0  ;;      C taken only if n mod 4 != 0
                br              .L0

C One limb per iteration: load, count, accumulate.
.Loop0:         ld8     r16 = [r32], 8  ;;
                popcnt  r20 = r16       ;;
                add     r8 = r8, r20
                br.cloop.dptk   .Loop0  ;;

C Switch the loop counter over to the number of four-limb groups.
.L0:            mov     ar.lc = r23     ;;
                br.cloop.dptk   .L1     ;;
C No complete group left: r8 already holds the result.
                mov     ar.lc = r2
                br.ret.sptk.many b0     ;;

C Pipeline fill, stage 1: load the first group.
.L1:            ld8     r16 = [r32], 8  ;;
                ld8     r17 = [r32], 8  ;;
                ld8     r18 = [r32], 8  ;;
                ld8     r19 = [r32], 8  ;;
                br.cloop.dptk   .L2     ;;
                br              .Ldone1 ;;      C exactly one group in total

C Pipeline fill, stage 2: count the first group while loading the second.
C Within an instruction group popcnt reads the old limb value before the
C ld8 in the same group replaces it.
.L2:
                popcnt  r20 = r16
                ld8     r16 = [r32], 8  ;;
                popcnt  r21 = r17
                ld8     r17 = [r32], 8  ;;
                popcnt  r22 = r18
                ld8     r18 = [r32], 8  ;;
                popcnt  r23 = r19
                ld8     r19 = [r32], 8  ;;
                br.cloop.dptk   .Loop   ;;
                br              .Ldone0         C exactly two groups in total

C Steady state, one limb per instruction group: add the count produced one
C iteration ago, count the limb loaded one iteration ago, load a new limb.
.Loop:          add     r8 = r8, r20
                popcnt  r20 = r16
                ld8     r16 = [r32], 8  ;;
                add     r8 = r8, r21
                popcnt  r21 = r17
                ld8     r17 = [r32], 8  ;;
                add     r8 = r8, r22
                popcnt  r22 = r18
                ld8     r18 = [r32], 8  ;;
                add     r8 = r8, r23
                popcnt  r23 = r19
                ld8     r19 = [r32], 8
                br.cloop.dptk   .Loop   ;;

C Drain with two groups in flight: r20-r23 hold counts not yet added and
C r16-r19 hold limbs not yet counted.  No further loads are issued.
.Ldone0:
                add     r8 = r8, r20
                popcnt  r20 = r16       ;;
                add     r8 = r8, r21
                popcnt  r21 = r17       ;;
                add     r8 = r8, r22
                popcnt  r22 = r18       ;;
                add     r8 = r8, r23
                popcnt  r23 = r19       ;;
C Fold the last four counts pairwise before adding them to the total.
                add     r21 = r21, r20
                add     r23 = r23, r22  ;;
                add     r8 = r8, r21    ;;
                add     r8 = r8, r23
                mov     ar.lc = r2
                br.ret.sptk.many b0

C Drain with a single group in flight: r16-r19 hold limbs not yet counted.
.Ldone1:
                popcnt  r20 = r16
                popcnt  r21 = r17
                popcnt  r22 = r18
                popcnt  r23 = r19       ;;
                add     r21 = r21, r20
                add     r23 = r23, r22  ;;
                add     r8 = r8, r21    ;;
                add     r8 = r8, r23
                mov     ar.lc = r2
                br.ret.sptk.many b0
EPILOGUE(mpn_popcount)
ASM_END()
|