mpir/msvc/vs22/run-speed.py
2021-10-14 12:25:06 +01:00

336 lines
10 KiB
Python

# A Python program to run speed and evaluate the performance of MPIR
# routines.
#
# Copyright (c) 2009, Brian Gladman, Worcester, UK.
#
# This file is part of the MPIR Library. The MPIR Library is free
# software; you can redistribute it and/or modify it under the terms
# of the GNU Lesser General Public License version 2.1 as published
# by the Free Software Foundation.
#
# The MPIR Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. You should have
# received a copy of the GNU Lesser General Public License along
# with the MPIR Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 51Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.
from __future__ import print_function
import sys
import os
import shutil
import string
import copy
import code
import math
import platform
from subprocess import Popen, PIPE, STDOUT
if sys.platform.startswith('win'):
dir = '.\\x64\\release\\'
else :
dir = './'
ll = [
'-c -s 10(10)1000 noop',
'-c -s 10(10)1000 noop_wxs',
'-c -s 10(10)1000 noop_wxys',
'-c -s 10(10)1000 mpn_add_n',
'-c -s 10(10)1000 mpn_sub_n',
'-c -s 10(10)1000 mpn_addadd_n',
'-c -s 10(10)1000 mpn_subadd_n',
'-c -s 10(10)1000 mpn_addsub_n',
'-c -s 10(10)1000 mpn_karaadd',
'-c -s 10(10)1000 mpn_karasub',
'-c -s 10(10)1000 mpn_addmul_1.3333',
'-c -s 10(10)1000 mpn_submul_1.3333',
'-c -s 10(10)1000 mpn_submul_2',
'-c -s 10(10)1000 mpn_mul_1.3333',
'-c -s 10(10)1000 mpn_mul_1_inplace.3333',
'-c -s 10(10)1000 mpn_mul_2',
'-c -s 10(10)1000 mpn_divrem_euclidean_qr_1.3333',
'-c -s 10(10)1000 mpn_divrem_euclidean_qr_2',
'-c -s 10(10)1000 mpn_divrem_euclidean_r_1.3333',
'-c -s 10(10)1000 mpn_divrem_hensel_qr_1.3333',
'-c -s 10(10)1000 mpn_divrem_hensel_qr_1_1.3333',
'-c -s 10(10)1000 mpn_divrem_hensel_qr_1_2.3333',
'-c -s 10(10)1000 mpn_divrem_hensel_r_1.3333',
'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1.3333',
'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_1.3333',
'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_2.3333',
'-c -s 10(10)1000 mpn_divrem_hensel_rsh_qr_1.3333',
'-c -s 10(10)1000 mpn_divrem_1.3333',
'-c -s 10(10)1000 mpn_divrem_1f.3333',
'-c -s 10(10)1000 mpn_mod_1.3333',
'-c -s 10(10)1000 mpn_mod_1_1',
'-c -s 10(10)1000 mpn_mod_1_2',
'-c -s 10(10)1000 mpn_mod_1_3',
'-c -s 10(10)1000 mpn_mod_1_k.3',
'-c -s 10(10)1000 mpn_preinv_divrem_1.3333',
'-c -s 10(10)1000 mpn_preinv_divrem_1f.3333',
'-c -s 10(10)1000 mpn_preinv_mod_1.3333',
'-c -s 10(10)1000 mpn_add_err1_n',
'-c -s 10(10)1000 mpn_sub_err1_n',
'-c -s 10(10)1000 mpn_inv_divappr_q',
'-c -s 10(10)1000 mpn_inv_div_qr',
'-c -s 10(10)1000 mpn_dc_divappr_q',
'-c -s 10(10)1000 mpn_dc_div_qr_n',
'-c -s 10(10)1000 mpn_divrem_1_inv.3333',
'-c -s 10(10)1000 mpn_divrem_1f_div.3333',
'-c -s 10(10)1000 mpn_divrem_1f_inv.3333',
'-c -s 10(10)1000 mpn_mod_1_div.3333',
'-c -s 10(10)1000 mpn_mod_1_inv.3333',
'-c -s 10(10)1000 mpn_divrem_2',
'-c -s 10(10)1000 mpn_divrem_2_div',
'-c -s 10(10)1000 mpn_divrem_2_inv',
'-c -s 10(10)1000 mpn_divexact_1.3333',
'-c -s 10(10)1000 mpn_divexact_by3',
'-c -s 10(10)1000 mpn_divexact_byff',
'-c -s 10(10)1000 mpn_divexact_byfobm1.3333',
'-c -s 10(10)1000 mpn_modexact_1_odd.333',
'-c -s 10(10)1000 mpn_modexact_1c_odd.333',
'-c -s 10(10)1000 mpn_mod_34lsub1',
'-c -s 10(10)1000 mpn_dc_tdiv_qr',
'-c -s 10(10)1000 mpn_lshift.33',
'-c -s 10(10)1000 mpn_rshift.33',
'-c -s 10(10)1000 mpn_lshift1',
'-c -s 10(10)1000 mpn_rshift1',
'-c -s 10(10)1000 mpn_double',
'-c -s 10(10)1000 mpn_half',
'-c -s 10(10)1000 mpn_lshift2',
'-c -s 10(10)1000 mpn_rshift2',
'-c -s 10(10)1000 mpn_and_n',
'-c -s 10(10)1000 mpn_andn_n',
'-c -s 10(10)1000 mpn_nand_n',
'-c -s 10(10)1000 mpn_ior_n',
'-c -s 10(10)1000 mpn_iorn_n',
'-c -s 10(10)1000 mpn_nior_n',
'-c -s 10(10)1000 mpn_xor_n',
'-c -s 10(10)1000 mpn_xnor_n',
'-c -s 10(10)1000 mpn_com_n',
'-c -s 10(10)1000 mpn_not',
'-c -s 10(10)1000 mpn_popcount',
'-c -s 10(10)1000 mpn_hamdist',
'-c -s 10(10)1000 MPN_ZERO',
'-c -s 10(10)1000 MPN_COPY',
'-c -s 10(10)1000 MPN_COPY_INCR',
'-c -s 10(10)1000 MPN_COPY_DECR',
'-c -s 10(10)1000 count_leading_zeros',
'-c -s 10(10)1000 gmp_allocate_free',
'-c -s 10(10)1000 malloc_realloc_free',
'-c -s 10(10)1000 gmp_allocate_reallocate_free',
'-c -s 10(10)1000 malloc_free',
'-c -s 10(10)1000 mpn_umul_ppmm',
'-c -s 10(10)1000 mpz_add',
'-c -s 10(10)1000 mpz_init_realloc_clear',
'-c -s 10(10)1000 mpz_init_clear',
'-c -s 10(10)1000 udiv_qrnnd',
'-c -s 10(10)1000 udiv_qrnnd_c',
'-c -s 10(10)1000 udiv_qrnnd_preinv1',
'-c -s 10(10)1000 udiv_qrnnd_preinv2',
'-c -s 10(10)1000 umul_ppmm',
'-c -s 10(10)1000 mpn_popcount',
'-c -s 10(10)1000 mpn_hamdist',
]
lq = [
'-c -s 10(10)1000 mpn_dc_divrem_n',
'-c -s 10(10)1000 mpn_dc_divrem_sb',
'-c -s 10(10)1000 mpn_dc_tdiv_qr',
'-c -s 10(10)1000 mpn_kara_mul_n',
'-c -s 10(10)1000 mpn_kara_sqr_n',
'-c -s 10(10)1000 mpn_mul_basecase',
'-c -s 1000(500)10000 -t 10 mpn_mul_fft_full',
'-c -s 10(10)1000 mpn_mul_n',
'-c -s 10(10)1000 mpn_sqr_basecase',
'-c -s 10(10)1000 mpn_sqr_n',
'-c -s 50(10)1000 mpn_toom3_mul_n',
'-c -s 50(10)1000 mpn_toom3_sqr_n',
'-c -s 1(5)100 mpz_powm',
]
# run an executable and return its error return value and any output
def run_exe(exe, args, inp) :
al = {'stdin' : PIPE, 'stdout' : PIPE, 'stderr' : STDOUT }
if sys.platform.startswith('win') :
al['creationflags'] = 0x08000000
p = Popen([exe] + args.split(' '), **al)
res = p.communicate(inp.encode())[0].decode()
ret = p.poll()
return (ret, res)
# output a matrix implemented as a dictionary
def mout(m, n) :
for r in range(n) :
print('\n{0:3d}'.format(r), end='')
for c in range(n) :
print('{0:18.4f}'.format(m[(r,c)]) , end='')
print
# output a vector
def vout(v) :
print(' ' , end='')
for c in v :
print('{0:18.4f}'.format(c) , end='')
print()
# In-place LU matrix decomposition. The diagonal
# elements of the upper triangular matrix U are
# all 1 and are not stored. Pivoting is used and
# the matrix is implemented as a dictionary. It
# is only intended for use with small matrices.
def LU_decompose(A, n) :
p = [0] * n
for k in range(n) :
# find pivot
p[k] = k
max = math.fabs(A[(k,k)])
for j in range(k + 1, n) :
if max < math.fabs(A[(j,k)]) :
max = math.fabs(A[(j,k)])
p[k] = j
# exchange rows if necessary
if p[k] != k :
for j in range(n) :
A[(k,j)], A[(p[k],j)] = A[(p[k],j)], A[(k,j)]
# exit if matrix is singular
if A[(k,k)] == 0.0 :
return None
# set upper triangular elements
for j in range(k + 1,n) :
A[(k,j)] /= A[(k,k)]
# update remaining part of original matrix
for i in range(k + 1, n) :
for j in range(k + 1, n) :
A[(i,j)] -= A[(i,k)] * A[(k,j)]
# return pivot array
return p
# Use the LU decomposition above to solve the matrix
# equation A x = b for x given A and b
def LU_solve(A, p, b) :
n = len(p)
x = [0] * n
# calculate U x = L^-1 b
for k in range(n) :
if p[k] != k :
b[k], b[p[k]] = b[p[k]], b[k]
x[k] = b[k]
for i in range(k) :
x[k] -= x[i] * A[(k,i)]
x[k] /= A[(k,k)]
# back substitute for x = U^-1 (L^-1 b)
for k in reversed(range(n)) :
if p[k] != k :
b[k], b[p[k]], b[p[k]], b[k]
for i in range(k + 1, n) :
x[k] -= x[i] * A[(k,i)]
return x
def lsq_solve(x, y, n) :
m = {} # matrix as dictionary
v = [] # vector as list
# set up matrix and vectors for least squares
for i in range(n) :
v.append(sum(xx ** i * yy for xx, yy in zip(x, y)))
for j in range(i, n) :
m[(i,j)] = m[(j,i)] = sum(xx ** (i + j) for xx in x)
# decompose the matrix into lower and upper triangular
# matrices
p = LU_decompose(m, n)
if p != None :
return LU_solve(m, p, v)
else :
return None
def do_lsq(x, y, lsq_size) :
# get least squares coefficients
f = lsq_solve(x, y, lsq_size)
# now find the standard deviation from the curve
s = 0
for i in range(len(x)) :
t = sum(f[j] * x[i] ** j for j in range(lsq_size))
s += (y[i] - t) ** 2
sd = 2 * math.sqrt(s / len(x))
# now remove 'outliers' - data points outside twice
# the standard deviation
sc = 0
for i in reversed(range(len(x))) :
t = sum(f[j] * x[i] ** j for j in range(lsq_size))
if math.fabs(y[i] - t) > sd :
del x[i]
del y[i]
sc += 1
# if we had to remove more than 10% of measurements
# declare that the result is not stable
if 10 * sc > len(x) :
return None
else :
return f
print('Machine:', platform.processor())
print('Running:', platform.platform())
print('SPEED CURVE (l: no of limbs) cycles: c[0] + c[1] * l + c[2] * l^2')
print('ROUTINE ', end = '')
print(' c[0] c[1] c[2]')
lines = ''
cnt = 0
lsq_size = 4
for args in ll + lq :
cnt += 1
# run speed for each routine in the list above
ret = run_exe(os.path.join(dir, 'speed'), args, '')
# parse the output to produce limbs[] and times[]
x = []
y = []
lines = ret[1].split('\n')
for l in lines :
if len(l) :
s = l.split()
try :
t = [float(i) for i in s]
except :
continue
x += [t[0]]
y += [t[1]]
# output the name of the routine
nn = args.split(' ')[-1]
print('{0:<30s}'.format(nn) , end='')
if not len(x) :
# print(ret[1].strip(), '(failed to parse output)')
print('(failed to parse output)')
continue
q = 0 if args in ll else 1
rep = q
while rep < 3 :
rep += 1
f = do_lsq(x, y, lsq_size)
if f != None :
break
else :
print('not stable')
continue
if args in lq :
print('{0[0]:11.1f} {0[1]:11.1f} {0[2]:11.1f}'.format(f))
else :
print('{0[0]:11.1f} {0[1]:11.1f}'.format(f))