336 lines
10 KiB
Python
336 lines
10 KiB
Python
|
|
||
|
# A Python program to run speed and evaluate the performance of MPIR
|
||
|
# routines.
|
||
|
#
|
||
|
# Copyright (c) 2009, Brian Gladman, Worcester, UK.
|
||
|
#
|
||
|
# This file is part of the MPIR Library. The MPIR Library is free
|
||
|
# software; you can redistribute it and/or modify it under the terms
|
||
|
# of the GNU Lesser General Public License version 2.1 as published
|
||
|
# by the Free Software Foundation.
|
||
|
#
|
||
|
# The MPIR Library is distributed in the hope that it will be useful,
|
||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
# Lesser General Public License for more details. You should have
|
||
|
# received a copy of the GNU Lesser General Public License along
|
||
|
# with the MPIR Library; see the file COPYING.LIB. If not, write to
|
||
|
# the Free Software Foundation, Inc., 51Franklin Street, Fifth Floor,
|
||
|
# Boston, MA 02110-1301, USA.
|
||
|
|
||
|
from __future__ import print_function
|
||
|
import sys
|
||
|
import os
|
||
|
import shutil
|
||
|
import string
|
||
|
import copy
|
||
|
import code
|
||
|
import math
|
||
|
import platform
|
||
|
from subprocess import Popen, PIPE, STDOUT
|
||
|
|
||
|
if sys.platform.startswith('win'):
|
||
|
dir = '.\\x64\\release\\'
|
||
|
else :
|
||
|
dir = './'
|
||
|
|
||
|
ll = [
|
||
|
'-c -s 10(10)1000 noop',
|
||
|
'-c -s 10(10)1000 noop_wxs',
|
||
|
'-c -s 10(10)1000 noop_wxys',
|
||
|
'-c -s 10(10)1000 mpn_add_n',
|
||
|
'-c -s 10(10)1000 mpn_sub_n',
|
||
|
'-c -s 10(10)1000 mpn_addadd_n',
|
||
|
'-c -s 10(10)1000 mpn_subadd_n',
|
||
|
'-c -s 10(10)1000 mpn_addsub_n',
|
||
|
'-c -s 10(10)1000 mpn_karaadd',
|
||
|
'-c -s 10(10)1000 mpn_karasub',
|
||
|
'-c -s 10(10)1000 mpn_addmul_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_submul_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_submul_2',
|
||
|
'-c -s 10(10)1000 mpn_mul_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_mul_1_inplace.3333',
|
||
|
'-c -s 10(10)1000 mpn_mul_2',
|
||
|
'-c -s 10(10)1000 mpn_divrem_euclidean_qr_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_euclidean_qr_2',
|
||
|
'-c -s 10(10)1000 mpn_divrem_euclidean_r_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_hensel_qr_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_hensel_qr_1_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_hensel_qr_1_2.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_hensel_r_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_2.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_hensel_rsh_qr_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_1f.3333',
|
||
|
'-c -s 10(10)1000 mpn_mod_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_mod_1_1',
|
||
|
'-c -s 10(10)1000 mpn_mod_1_2',
|
||
|
'-c -s 10(10)1000 mpn_mod_1_3',
|
||
|
'-c -s 10(10)1000 mpn_mod_1_k.3',
|
||
|
'-c -s 10(10)1000 mpn_preinv_divrem_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_preinv_divrem_1f.3333',
|
||
|
'-c -s 10(10)1000 mpn_preinv_mod_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_add_err1_n',
|
||
|
'-c -s 10(10)1000 mpn_sub_err1_n',
|
||
|
'-c -s 10(10)1000 mpn_inv_divappr_q',
|
||
|
'-c -s 10(10)1000 mpn_inv_div_qr',
|
||
|
'-c -s 10(10)1000 mpn_dc_divappr_q',
|
||
|
'-c -s 10(10)1000 mpn_dc_div_qr_n',
|
||
|
'-c -s 10(10)1000 mpn_divrem_1_inv.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_1f_div.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_1f_inv.3333',
|
||
|
'-c -s 10(10)1000 mpn_mod_1_div.3333',
|
||
|
'-c -s 10(10)1000 mpn_mod_1_inv.3333',
|
||
|
'-c -s 10(10)1000 mpn_divrem_2',
|
||
|
'-c -s 10(10)1000 mpn_divrem_2_div',
|
||
|
'-c -s 10(10)1000 mpn_divrem_2_inv',
|
||
|
'-c -s 10(10)1000 mpn_divexact_1.3333',
|
||
|
'-c -s 10(10)1000 mpn_divexact_by3',
|
||
|
'-c -s 10(10)1000 mpn_divexact_byff',
|
||
|
'-c -s 10(10)1000 mpn_divexact_byfobm1.3333',
|
||
|
'-c -s 10(10)1000 mpn_modexact_1_odd.333',
|
||
|
'-c -s 10(10)1000 mpn_modexact_1c_odd.333',
|
||
|
'-c -s 10(10)1000 mpn_mod_34lsub1',
|
||
|
'-c -s 10(10)1000 mpn_dc_tdiv_qr',
|
||
|
'-c -s 10(10)1000 mpn_lshift.33',
|
||
|
'-c -s 10(10)1000 mpn_rshift.33',
|
||
|
'-c -s 10(10)1000 mpn_lshift1',
|
||
|
'-c -s 10(10)1000 mpn_rshift1',
|
||
|
'-c -s 10(10)1000 mpn_double',
|
||
|
'-c -s 10(10)1000 mpn_half',
|
||
|
'-c -s 10(10)1000 mpn_lshift2',
|
||
|
'-c -s 10(10)1000 mpn_rshift2',
|
||
|
'-c -s 10(10)1000 mpn_and_n',
|
||
|
'-c -s 10(10)1000 mpn_andn_n',
|
||
|
'-c -s 10(10)1000 mpn_nand_n',
|
||
|
'-c -s 10(10)1000 mpn_ior_n',
|
||
|
'-c -s 10(10)1000 mpn_iorn_n',
|
||
|
'-c -s 10(10)1000 mpn_nior_n',
|
||
|
'-c -s 10(10)1000 mpn_xor_n',
|
||
|
'-c -s 10(10)1000 mpn_xnor_n',
|
||
|
'-c -s 10(10)1000 mpn_com_n',
|
||
|
'-c -s 10(10)1000 mpn_not',
|
||
|
'-c -s 10(10)1000 mpn_popcount',
|
||
|
'-c -s 10(10)1000 mpn_hamdist',
|
||
|
|
||
|
'-c -s 10(10)1000 MPN_ZERO',
|
||
|
'-c -s 10(10)1000 MPN_COPY',
|
||
|
'-c -s 10(10)1000 MPN_COPY_INCR',
|
||
|
'-c -s 10(10)1000 MPN_COPY_DECR',
|
||
|
|
||
|
'-c -s 10(10)1000 count_leading_zeros',
|
||
|
'-c -s 10(10)1000 gmp_allocate_free',
|
||
|
'-c -s 10(10)1000 malloc_realloc_free',
|
||
|
'-c -s 10(10)1000 gmp_allocate_reallocate_free',
|
||
|
'-c -s 10(10)1000 malloc_free',
|
||
|
'-c -s 10(10)1000 mpn_umul_ppmm',
|
||
|
'-c -s 10(10)1000 mpz_add',
|
||
|
'-c -s 10(10)1000 mpz_init_realloc_clear',
|
||
|
'-c -s 10(10)1000 mpz_init_clear',
|
||
|
'-c -s 10(10)1000 udiv_qrnnd',
|
||
|
'-c -s 10(10)1000 udiv_qrnnd_c',
|
||
|
'-c -s 10(10)1000 udiv_qrnnd_preinv1',
|
||
|
'-c -s 10(10)1000 udiv_qrnnd_preinv2',
|
||
|
'-c -s 10(10)1000 umul_ppmm',
|
||
|
'-c -s 10(10)1000 mpn_popcount',
|
||
|
'-c -s 10(10)1000 mpn_hamdist',
|
||
|
|
||
|
]
|
||
|
|
||
|
lq = [
|
||
|
'-c -s 10(10)1000 mpn_dc_divrem_n',
|
||
|
'-c -s 10(10)1000 mpn_dc_divrem_sb',
|
||
|
'-c -s 10(10)1000 mpn_dc_tdiv_qr',
|
||
|
'-c -s 10(10)1000 mpn_kara_mul_n',
|
||
|
'-c -s 10(10)1000 mpn_kara_sqr_n',
|
||
|
'-c -s 10(10)1000 mpn_mul_basecase',
|
||
|
'-c -s 1000(500)10000 -t 10 mpn_mul_fft_full',
|
||
|
'-c -s 10(10)1000 mpn_mul_n',
|
||
|
'-c -s 10(10)1000 mpn_sqr_basecase',
|
||
|
'-c -s 10(10)1000 mpn_sqr_n',
|
||
|
'-c -s 50(10)1000 mpn_toom3_mul_n',
|
||
|
'-c -s 50(10)1000 mpn_toom3_sqr_n',
|
||
|
'-c -s 1(5)100 mpz_powm',
|
||
|
]
|
||
|
|
||
|
# run an executable and return its error return value and any output
|
||
|
def run_exe(exe, args, inp) :
|
||
|
al = {'stdin' : PIPE, 'stdout' : PIPE, 'stderr' : STDOUT }
|
||
|
if sys.platform.startswith('win') :
|
||
|
al['creationflags'] = 0x08000000
|
||
|
p = Popen([exe] + args.split(' '), **al)
|
||
|
res = p.communicate(inp.encode())[0].decode()
|
||
|
ret = p.poll()
|
||
|
return (ret, res)
|
||
|
|
||
|
# output a matrix implemented as a dictionary
|
||
|
def mout(m, n) :
|
||
|
for r in range(n) :
|
||
|
print('\n{0:3d}'.format(r), end='')
|
||
|
for c in range(n) :
|
||
|
print('{0:18.4f}'.format(m[(r,c)]) , end='')
|
||
|
print
|
||
|
|
||
|
# output a vector
|
||
|
def vout(v) :
|
||
|
print(' ' , end='')
|
||
|
for c in v :
|
||
|
print('{0:18.4f}'.format(c) , end='')
|
||
|
print()
|
||
|
|
||
|
# In-place LU matrix decomposition. The diagonal
|
||
|
# elements of the upper triangular matrix U are
|
||
|
# all 1 and are not stored. Pivoting is used and
|
||
|
# the matrix is implemented as a dictionary. It
|
||
|
# is only intended for use with small matrices.
|
||
|
|
||
|
def LU_decompose(A, n) :
|
||
|
p = [0] * n
|
||
|
for k in range(n) :
|
||
|
# find pivot
|
||
|
p[k] = k
|
||
|
max = math.fabs(A[(k,k)])
|
||
|
for j in range(k + 1, n) :
|
||
|
if max < math.fabs(A[(j,k)]) :
|
||
|
max = math.fabs(A[(j,k)])
|
||
|
p[k] = j
|
||
|
# exchange rows if necessary
|
||
|
if p[k] != k :
|
||
|
for j in range(n) :
|
||
|
A[(k,j)], A[(p[k],j)] = A[(p[k],j)], A[(k,j)]
|
||
|
|
||
|
# exit if matrix is singular
|
||
|
if A[(k,k)] == 0.0 :
|
||
|
return None
|
||
|
|
||
|
# set upper triangular elements
|
||
|
for j in range(k + 1,n) :
|
||
|
A[(k,j)] /= A[(k,k)]
|
||
|
|
||
|
# update remaining part of original matrix
|
||
|
for i in range(k + 1, n) :
|
||
|
for j in range(k + 1, n) :
|
||
|
A[(i,j)] -= A[(i,k)] * A[(k,j)]
|
||
|
|
||
|
# return pivot array
|
||
|
return p
|
||
|
|
||
|
# Use the LU decomposition above to solve the matrix
|
||
|
# equation A x = b for x given A and b
|
||
|
|
||
|
def LU_solve(A, p, b) :
|
||
|
n = len(p)
|
||
|
x = [0] * n
|
||
|
|
||
|
# calculate U x = L^-1 b
|
||
|
for k in range(n) :
|
||
|
if p[k] != k :
|
||
|
b[k], b[p[k]] = b[p[k]], b[k]
|
||
|
x[k] = b[k]
|
||
|
for i in range(k) :
|
||
|
x[k] -= x[i] * A[(k,i)]
|
||
|
x[k] /= A[(k,k)]
|
||
|
|
||
|
# back substitute for x = U^-1 (L^-1 b)
|
||
|
for k in reversed(range(n)) :
|
||
|
if p[k] != k :
|
||
|
b[k], b[p[k]], b[p[k]], b[k]
|
||
|
for i in range(k + 1, n) :
|
||
|
x[k] -= x[i] * A[(k,i)]
|
||
|
return x
|
||
|
|
||
|
def lsq_solve(x, y, n) :
|
||
|
m = {} # matrix as dictionary
|
||
|
v = [] # vector as list
|
||
|
# set up matrix and vectors for least squares
|
||
|
for i in range(n) :
|
||
|
v.append(sum(xx ** i * yy for xx, yy in zip(x, y)))
|
||
|
for j in range(i, n) :
|
||
|
m[(i,j)] = m[(j,i)] = sum(xx ** (i + j) for xx in x)
|
||
|
# decompose the matrix into lower and upper triangular
|
||
|
# matrices
|
||
|
p = LU_decompose(m, n)
|
||
|
if p != None :
|
||
|
return LU_solve(m, p, v)
|
||
|
else :
|
||
|
return None
|
||
|
|
||
|
def do_lsq(x, y, lsq_size) :
|
||
|
# get least squares coefficients
|
||
|
f = lsq_solve(x, y, lsq_size)
|
||
|
|
||
|
# now find the standard deviation from the curve
|
||
|
s = 0
|
||
|
for i in range(len(x)) :
|
||
|
t = sum(f[j] * x[i] ** j for j in range(lsq_size))
|
||
|
s += (y[i] - t) ** 2
|
||
|
sd = 2 * math.sqrt(s / len(x))
|
||
|
|
||
|
# now remove 'outliers' - data points outside twice
|
||
|
# the standard deviation
|
||
|
sc = 0
|
||
|
for i in reversed(range(len(x))) :
|
||
|
t = sum(f[j] * x[i] ** j for j in range(lsq_size))
|
||
|
if math.fabs(y[i] - t) > sd :
|
||
|
del x[i]
|
||
|
del y[i]
|
||
|
sc += 1
|
||
|
|
||
|
# if we had to remove more than 10% of measurements
|
||
|
# declare that the result is not stable
|
||
|
if 10 * sc > len(x) :
|
||
|
return None
|
||
|
else :
|
||
|
return f
|
||
|
|
||
|
print('Machine:', platform.processor())
|
||
|
print('Running:', platform.platform())
|
||
|
print('SPEED CURVE (l: no of limbs) cycles: c[0] + c[1] * l + c[2] * l^2')
|
||
|
print('ROUTINE ', end = '')
|
||
|
print(' c[0] c[1] c[2]')
|
||
|
lines = ''
|
||
|
cnt = 0
|
||
|
lsq_size = 4
|
||
|
for args in ll + lq :
|
||
|
cnt += 1
|
||
|
# run speed for each routine in the list above
|
||
|
ret = run_exe(os.path.join(dir, 'speed'), args, '')
|
||
|
# parse the output to produce limbs[] and times[]
|
||
|
x = []
|
||
|
y = []
|
||
|
lines = ret[1].split('\n')
|
||
|
for l in lines :
|
||
|
if len(l) :
|
||
|
s = l.split()
|
||
|
try :
|
||
|
t = [float(i) for i in s]
|
||
|
except :
|
||
|
continue
|
||
|
x += [t[0]]
|
||
|
y += [t[1]]
|
||
|
|
||
|
# output the name of the routine
|
||
|
nn = args.split(' ')[-1]
|
||
|
print('{0:<30s}'.format(nn) , end='')
|
||
|
if not len(x) :
|
||
|
# print(ret[1].strip(), '(failed to parse output)')
|
||
|
print('(failed to parse output)')
|
||
|
continue
|
||
|
|
||
|
q = 0 if args in ll else 1
|
||
|
rep = q
|
||
|
while rep < 3 :
|
||
|
rep += 1
|
||
|
f = do_lsq(x, y, lsq_size)
|
||
|
if f != None :
|
||
|
break
|
||
|
else :
|
||
|
print('not stable')
|
||
|
continue
|
||
|
if args in lq :
|
||
|
print('{0[0]:11.1f} {0[1]:11.1f} {0[2]:11.1f}'.format(f))
|
||
|
else :
|
||
|
print('{0[0]:11.1f} {0[1]:11.1f}'.format(f))
|