From 5955fc2424146ad46ae38a484fc8cc8d1d5f1ab9 Mon Sep 17 00:00:00 2001 From: jasonmoxham Date: Wed, 13 Jul 2011 09:48:02 +0000 Subject: [PATCH] mpn_half mpn_double asm for K8 --- mpn/x86_64/k8/double.asm | 60 +++++++++++++++++++++++++++++++++ mpn/x86_64/k8/gmp-mparam.h | 69 +++++++++++++++++++------------------- mpn/x86_64/k8/half.asm | 61 +++++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 34 deletions(-) create mode 100644 mpn/x86_64/k8/double.asm create mode 100644 mpn/x86_64/k8/half.asm diff --git a/mpn/x86_64/k8/double.asm b/mpn/x86_64/k8/double.asm new file mode 100644 index 00000000..18479c9c --- /dev/null +++ b/mpn/x86_64/k8/double.asm @@ -0,0 +1,60 @@ +dnl mpn_double + +dnl Copyright 2011 The Code Cavern + +dnl This file is part of the MPIR Library. + +dnl The MPIR Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The MPIR Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the MPIR Library; see the file COPYING.LIB. If not, write +dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +dnl Boston, MA 02110-1301, USA. 
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_double)
+mov %rsi,%rax              # rdi = limb pointer, rsi = limb count n
+shr $2,%rsi                # rsi = n/4 = passes of the 4-limb unrolled loop
+and $3,%eax                # rax = n mod 4 leading limbs, this also clears CF
+jz t1
+shlq $1,(%rdi)             # first limb is a plain shift, CF = bit shifted out
+lea 8(%rdi),%rdi           # lea and dec leave CF untouched
+dec %rax
+jz t1
+rclq $1,(%rdi)             # later limbs rotate the previous carry in
+lea 8(%rdi),%rdi
+dec %rax
+jz t1
+rclq $1,(%rdi)
+lea 8(%rdi),%rdi
+t1:
+sbb %rax,%rax              # rax = -CF, parks the carry and discards the stale count
+test %rsi,%rsi             # overwrites CF, hence the parking above
+jz skiplp
+add %rax,%rax              # reload CF from the parked carry
+.align 16
+lp:
+    rclq $1,(%rdi)
+    nop
+    rclq $1,8(%rdi)
+    rclq $1,16(%rdi)
+    rclq $1,24(%rdi)
+    nop
+    dec %rsi
+    lea 32(%rdi),%rdi
+    jnz lp
+sbb %rax,%rax              # rax = -CF after the last limb
+skiplp:
+neg %rax                   # return the bit shifted out of the top limb, 0 or 1
+ret
+EPILOGUE()
diff --git a/mpn/x86_64/k8/gmp-mparam.h b/mpn/x86_64/k8/gmp-mparam.h index 4ec79d01..85be0b9d 100644 --- a/mpn/x86_64/k8/gmp-mparam.h +++ b/mpn/x86_64/k8/gmp-mparam.h @@ -1,20 +1,20 @@ -/* Generated by tuneup.c, 2011-02-22, gcc 4.4 */ +/* Generated by tuneup.c, 2011-07-11, gcc 4.5 */ -#define MUL_KARATSUBA_THRESHOLD 24 -#define MUL_TOOM3_THRESHOLD 84 -#define MUL_TOOM4_THRESHOLD 248 -#define MUL_TOOM8H_THRESHOLD 466 +#define MUL_KARATSUBA_THRESHOLD 22 +#define MUL_TOOM3_THRESHOLD 134 +#define MUL_TOOM4_THRESHOLD 387 +#define MUL_TOOM8H_THRESHOLD 446 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_KARATSUBA_THRESHOLD 32 -#define SQR_TOOM3_THRESHOLD 117 -#define SQR_TOOM4_THRESHOLD 498 -#define SQR_TOOM8_THRESHOLD 498 +#define SQR_KARATSUBA_THRESHOLD 43 +#define SQR_TOOM3_THRESHOLD 125 +#define SQR_TOOM4_THRESHOLD 512 +#define SQR_TOOM8_THRESHOLD 674 -#define POWM_THRESHOLD 451 +#define POWM_THRESHOLD 464 -#define GCD_THRESHOLD 438 -#define GCDEXT_THRESHOLD 996 +#define GCD_THRESHOLD 446 +#define GCDEXT_THRESHOLD 969 #define JACOBI_BASE_METHOD 1 #define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ @@ -28,47 +28,48 @@ #define MOD_1_1_THRESHOLD 4 #define MOD_1_2_THRESHOLD 8 #define MOD_1_3_THRESHOLD 24 -#define DIVREM_HENSEL_QR_1_THRESHOLD 7 +#define DIVREM_HENSEL_QR_1_THRESHOLD 8 #define RSH_DIVREM_HENSEL_QR_1_THRESHOLD 7 -#define DIVREM_EUCLID_HENSEL_THRESHOLD 68 +#define DIVREM_EUCLID_HENSEL_THRESHOLD 30 #define ROOTREM_THRESHOLD 11 -#define
GET_STR_DC_THRESHOLD 14 -#define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 542 -#define SET_STR_PRECOMPUTE_THRESHOLD 542 +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 27410 +#define SET_STR_PRECOMPUTE_THRESHOLD 52671 -#define MUL_FFT_TABLE { 400, 928, 3264, 3840, 7168, 36864, 114688, 458752, 2359296, 7340032, 0 } -#define MUL_FFT_MODF_THRESHOLD 464 -#define MUL_FFT_FULL_THRESHOLD 3648 +#define MUL_FFT_TABLE { 432, 1056, 3136, 3840, 7168, 45056, 114688, 589824, 0 } +#define MUL_FFT_MODF_THRESHOLD 528 +#define MUL_FFT_FULL_THRESHOLD 3264 -#define SQR_FFT_TABLE { 400, 928, 2368, 2816, 11264, 36864, 114688, 458752, 2359296, 7340032, 0 } -#define SQR_FFT_MODF_THRESHOLD 400 +#define SQR_FFT_TABLE { 464, 928, 2368, 2816, 13312, 45056, 114688, 458752, 0 } +#define SQR_FFT_MODF_THRESHOLD 464 #define SQR_FFT_FULL_THRESHOLD 2496 -#define MULLOW_BASECASE_THRESHOLD 10 +#define MULLOW_BASECASE_THRESHOLD 9 #define MULLOW_DC_THRESHOLD 13 #define MULLOW_MUL_THRESHOLD 9970 -#define MULHIGH_BASECASE_THRESHOLD 22 -#define MULHIGH_DC_THRESHOLD 22 +#define MULHIGH_BASECASE_THRESHOLD 21 +#define MULHIGH_DC_THRESHOLD 21 #define MULHIGH_MUL_THRESHOLD 9970 #define MULMOD_2EXPM1_THRESHOLD 20 #define FAC_UI_THRESHOLD 32756 -#define DC_DIV_QR_THRESHOLD 54 -#define DC_DIVAPPR_Q_N_THRESHOLD 136 +#define DC_DIV_QR_THRESHOLD 46 +#define DC_DIVAPPR_Q_N_THRESHOLD 156 #define INV_DIV_QR_THRESHOLD 9894 -#define INV_DIVAPPR_Q_N_THRESHOLD 136 -#define DC_DIV_Q_THRESHOLD 205 +#define INV_DIVAPPR_Q_N_THRESHOLD 156 +#define DC_DIV_Q_THRESHOLD 195 #define INV_DIV_Q_THRESHOLD 9894 -#define DC_DIVAPPR_Q_THRESHOLD 174 -#define INV_DIVAPPR_Q_THRESHOLD 19921 +#define DC_DIVAPPR_Q_THRESHOLD 171 +#define INV_DIVAPPR_Q_THRESHOLD 19441 #define DC_BDIV_QR_THRESHOLD 42 -#define DC_BDIV_Q_THRESHOLD 20 -/* Tuneup completed successfully, took 794 seconds */ +#define DC_BDIV_Q_THRESHOLD 24 +/* Tuneup completed successfully, took 131 seconds */ + 
#define MUL_FFT_TABLE2 {{1, 3}, {205, 4}, {377, 5}, {386, 4}, {404, 5}, {813, 6}, {850, 5}, {869, 6}, {971, 5}, {993, 6}, {2392, 7}, {2445, 6}, {2668, 7}, {2727, 6}, {2787, 7}, {2976, 6}, {3042, 7}, {3109, 6}, {3178, 7}, {3248, 8}, {3393, 7}, {3468, 8}, {3544, 7}, {3784, 8}, {3867, 6}, {3952, 7}, {4039, 8}, {4407, 7}, {4504, 8}, {4914, 7}, {5022, 8}, {6957, 9}, {7756, 8}, {8460, 9}, {8836, 8}, {9030, 9}, {9850, 8}, {10513, 9}, {11976, 8}, {12239, 9}, {13939, 8}, {14245, 9}, {15876, 8}, {16224, 9}, {22461, 10}, {23970, 9}, {26142, 10}, {27898, 9}, {28509, 10}, {31772, 9}, {33906, 10}, {36184, 9}, {36977, 10}, {40326, 9}, {41210, 10}, {44943, 11}, {45928, 10}, {46934, 11}, {47962, 10}, {57042, 11}, {62207, 12}, {63570, 10}, {73983, 11}, {80681, 10}, {86099, 11}, {98051, 10}, {102394, 11}, {114110, 12}, {127165, 11}, {164920, 12}, {196129, 11}, {228243, 12}, {233241, 13}, {254354, 12}, {259924, 11}, {277377, 12}, {456509, 13}, {519871, 12}, {659749, 13}, {784582, 14}, {MP_SIZE_T_MAX,0}} diff --git a/mpn/x86_64/k8/half.asm b/mpn/x86_64/k8/half.asm new file mode 100644 index 00000000..53d7bb9d --- /dev/null +++ b/mpn/x86_64/k8/half.asm @@ -0,0 +1,61 @@ +dnl mpn_half + +dnl Copyright 2011 The Code Cavern + +dnl This file is part of the MPIR Library. + +dnl The MPIR Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The MPIR Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the MPIR Library; see the file COPYING.LIB. 
If not, write
+dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+dnl Boston, MA 02110-1301, USA.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_half)
+mov %rsi,%rax              # rdi = limb pointer, rsi = limb count n
+lea -8(%rdi,%rsi,8),%rdi   # rdi = address of the top limb, work downwards
+shr $2,%rsi                # rsi = n/4 = passes of the 4-limb unrolled loop
+and $3,%eax                # rax = n mod 4 leading limbs, this also clears CF
+jz t1
+shrq $1,(%rdi)             # top limb is a plain shift, CF = bit shifted out
+lea -8(%rdi),%rdi          # lea and dec leave CF untouched
+dec %rax
+jz t1
+rcrq $1,(%rdi)             # lower limbs rotate the previous carry in at the top
+lea -8(%rdi),%rdi
+dec %rax
+jz t1
+rcrq $1,(%rdi)
+lea -8(%rdi),%rdi
+t1:
+sbb %rax,%rax              # rax = -CF, parks the carry and discards the stale count
+test %rsi,%rsi             # overwrites CF, hence the parking above
+jz skiplp
+add %rax,%rax              # reload CF from the parked carry
+.align 16
+lp:
+    rcrq $1,(%rdi)
+    nop
+    rcrq $1,-8(%rdi)
+    rcrq $1,-16(%rdi)
+    rcrq $1,-24(%rdi)
+    nop
+    dec %rsi
+    lea -32(%rdi),%rdi
+    jnz lp
+sbb %rax,%rax              # rax = -CF after the lowest limb
+skiplp:
+neg %rax                   # return the bit shifted out of the lowest limb, 0 or 1
+ret
+EPILOGUE()