# mpir/msvc/vs19/run-speed.py


# A Python program to run speed and evaluate the performance of MPIR
# routines.
#
# Copyright (c) 2009, Brian Gladman, Worcester, UK.
#
# This file is part of the MPIR Library.  The MPIR Library is free
# software; you can redistribute it and/or modify it under the terms
# of the GNU Lesser General Public License version 2.1 as published
# by the Free Software Foundation.
#
# The MPIR Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.  You should have
# received a copy of the GNU Lesser General Public License along
# with the MPIR Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.

from __future__ import print_function
import sys
import os
import shutil
import string
import copy
import code
import math
import platform
from subprocess import Popen, PIPE, STDOUT

# Directory prefix used to locate the executable that is run: the x64
# release output directory on Windows, the current directory elsewhere.
dir = '.\\x64\\release\\' if sys.platform.startswith('win') else './'

# Argument strings for the first batch of routines.  Every entry uses the
# same options ('-c -s 10(10)1000') followed by the routine name; a
# '.<number>' suffix on a name is passed through unchanged as part of the
# argument.  Note that mpn_popcount and mpn_hamdist are listed twice, so
# they are run twice.
_LL_OPTIONS = '-c -s 10(10)1000 '

ll = [_LL_OPTIONS + _routine for _routine in (
  # no-op baselines
  'noop',
  'noop_wxs',
  'noop_wxys',
  # addition, subtraction and multiplication by small operands
  'mpn_add_n',
  'mpn_sub_n',
  'mpn_addadd_n',
  'mpn_subadd_n',
  'mpn_addsub_n',
  'mpn_karaadd',
  'mpn_karasub',
  'mpn_addmul_1.3333',
  'mpn_submul_1.3333',
  'mpn_submul_2',
  'mpn_mul_1.3333',
  'mpn_mul_1_inplace.3333',
  'mpn_mul_2',
  # division and remainder
  'mpn_divrem_euclidean_qr_1.3333',
  'mpn_divrem_euclidean_qr_2',
  'mpn_divrem_euclidean_r_1.3333',
  'mpn_divrem_hensel_qr_1.3333',
  'mpn_divrem_hensel_qr_1_1.3333',
  'mpn_divrem_hensel_qr_1_2.3333',
  'mpn_divrem_hensel_r_1.3333',
  'mpn_rsh_divrem_hensel_qr_1.3333',
  'mpn_rsh_divrem_hensel_qr_1_1.3333',
  'mpn_rsh_divrem_hensel_qr_1_2.3333',
  'mpn_divrem_hensel_rsh_qr_1.3333',
  'mpn_divrem_1.3333',
  'mpn_divrem_1f.3333',
  'mpn_mod_1.3333',
  'mpn_mod_1_1',
  'mpn_mod_1_2',
  'mpn_mod_1_3',
  'mpn_mod_1_k.3',
  'mpn_preinv_divrem_1.3333',
  'mpn_preinv_divrem_1f.3333',
  'mpn_preinv_mod_1.3333',
  'mpn_add_err1_n',
  'mpn_sub_err1_n',
  'mpn_inv_divappr_q',
  'mpn_inv_div_qr',
  'mpn_dc_divappr_q',
  'mpn_dc_div_qr_n',
  'mpn_divrem_1_inv.3333',
  'mpn_divrem_1f_div.3333',
  'mpn_divrem_1f_inv.3333',
  'mpn_mod_1_div.3333',
  'mpn_mod_1_inv.3333',
  'mpn_divrem_2',
  'mpn_divrem_2_div',
  'mpn_divrem_2_inv',
  'mpn_divexact_1.3333',
  'mpn_divexact_by3',
  'mpn_divexact_byff',
  'mpn_divexact_byfobm1.3333',
  'mpn_modexact_1_odd.333',
  'mpn_modexact_1c_odd.333',
  'mpn_mod_34lsub1',
  'mpn_dc_tdiv_qr',
  # shifts
  'mpn_lshift.33',
  'mpn_rshift.33',
  'mpn_lshift1',
  'mpn_rshift1',
  'mpn_double',
  'mpn_half',
  'mpn_lshift2',
  'mpn_rshift2',
  # bitwise logic and bit counting
  'mpn_and_n',
  'mpn_andn_n',
  'mpn_nand_n',
  'mpn_ior_n',
  'mpn_iorn_n',
  'mpn_nior_n',
  'mpn_xor_n',
  'mpn_xnor_n',
  'mpn_com_n',
  'mpn_not',
  'mpn_popcount',
  'mpn_hamdist',
  # zero / copy macros
  'MPN_ZERO',
  'MPN_COPY',
  'MPN_COPY_INCR',
  'MPN_COPY_DECR',
  # miscellaneous primitives and allocation
  'count_leading_zeros',
  'gmp_allocate_free',
  'malloc_realloc_free',
  'gmp_allocate_reallocate_free',
  'malloc_free',
  'mpn_umul_ppmm',
  'mpz_add',
  'mpz_init_realloc_clear',
  'mpz_init_clear',
  'udiv_qrnnd',
  'udiv_qrnnd_c',
  'udiv_qrnnd_preinv1',
  'udiv_qrnnd_preinv2',
  'umul_ppmm',
  'mpn_popcount',
  'mpn_hamdist',
)]

# Argument strings for the second batch of routines.  Each entry is
# '-c -s <sizes> <routine>'; the size specification (and, for
# mpn_mul_fft_full, an extra '-t 10' option) varies per routine.
lq = [' '.join(('-c -s', _sizes, _routine)) for _sizes, _routine in (
  ('10(10)1000',           'mpn_dc_divrem_n'),
  ('10(10)1000',           'mpn_dc_divrem_sb'),
  ('10(10)1000',           'mpn_dc_tdiv_qr'),
  ('10(10)1000',           'mpn_kara_mul_n'),
  ('10(10)1000',           'mpn_kara_sqr_n'),
  ('10(10)1000',           'mpn_mul_basecase'),
  ('1000(500)10000 -t 10', 'mpn_mul_fft_full'),
  ('10(10)1000',           'mpn_mul_n'),
  ('10(10)1000',           'mpn_sqr_basecase'),
  ('10(10)1000',           'mpn_sqr_n'),
  ('50(10)1000',           'mpn_toom3_mul_n'),
  ('50(10)1000',           'mpn_toom3_sqr_n'),
  ('1(5)100',              'mpz_powm'),
)]

# Run an executable and collect its result.
#
#   exe   path of the program to start
#   args  command line arguments as one string; it is split on single
#         spaces, so individual arguments cannot contain spaces
#   inp   text that is encoded and fed to the program's standard input
#
# Returns the tuple (exit status, decoded output); standard error is
# merged into the captured standard output.
def run_exe(exe, args, inp):
  popen_opts = dict(stdin=PIPE, stdout=PIPE, stderr=STDOUT)
  if sys.platform.startswith('win'):
    # 0x08000000 is the Win32 CREATE_NO_WINDOW process creation flag
    popen_opts['creationflags'] = 0x08000000
  proc = Popen([exe] + args.split(' '), **popen_opts)
  captured = proc.communicate(inp.encode())[0]
  text = captured.decode()
  return (proc.poll(), text)

# Output a matrix implemented as a dictionary.
#
#   m  dictionary keyed by (row, column) tuples
#   n  number of rows and columns to print
#
# Each row starts on a new line with its row index, followed by the
# row's values in fixed width columns.
def mout(m, n) :
  for r in range(n) :
    print('\n{0:3d}'.format(r), end='')
    for c in range(n) :
      print('{0:18.4f}'.format(m[(r,c)]) , end='')
  # print is a function here (print_function is imported), so it must be
  # called to emit the newline that ends the last row; a bare 'print'
  # is an expression with no effect
  print()

# Output a vector on a single line: a three space margin followed by
# each element of v in an 18 character wide field with 4 decimals.
def vout(v):
  cells = ['{0:18.4f}'.format(value) for value in v]
  print('   ' + ''.join(cells))

# LU decomposition with row pivoting, performed in place on a matrix
# held as a dictionary keyed by (row, column).  On return the lower
# triangle (including the diagonal) holds L and the strict upper
# triangle holds U; U has an implicit unit diagonal that is not stored.
# Intended only for small matrices.
#
#   A  n x n matrix as a dictionary; overwritten with the factors
#   n  matrix dimension
#
# Returns the list of pivot rows (entry k is the row exchanged with row
# k at step k), or None if a zero pivot shows the matrix is singular.

def LU_decompose(A, n):
  pivots = []
  for col in range(n):
    # pick the row at or below the diagonal with the largest magnitude
    # in this column; on ties the first such row is kept
    best = max(range(col, n), key=lambda row: math.fabs(A[(row, col)]))
    pivots.append(best)

    # bring the pivot row into place (whole rows are exchanged)
    if best != col:
      for j in range(n):
        A[(col, j)], A[(best, j)] = A[(best, j)], A[(col, j)]

    # a zero pivot after pivoting means the matrix is singular
    diag = A[(col, col)]
    if diag == 0.0:
      return None

    # scale the rest of the pivot row to give the row of U
    for j in range(col + 1, n):
      A[(col, j)] /= diag

    # eliminate from the remaining lower right sub-matrix
    for i in range(col + 1, n):
      factor = A[(i, col)]
      for j in range(col + 1, n):
        A[(i, j)] -= factor * A[(col, j)]

  return pivots

# Use the LU decomposition above to solve the matrix
# equation A x = b for x given A and b.
#
#   A  factored matrix as a dictionary keyed by (row, column): L in the
#      lower triangle including the diagonal, U (unit diagonal, not
#      stored) in the strict upper triangle
#   p  pivot list; p[k] is the row exchanged with row k at step k
#   b  right hand side vector; it is not modified
#
# Returns the solution x as a new list.

def LU_solve(A, p, b) :
  n = len(p)
  x = [0] * n

  # The row exchanges are applied to a private copy of the right hand
  # side so that the caller's vector keeps its original order.
  rhs = list(b)

  # calculate U x = L^-1 b (forward substitution, applying the row
  # exchanges as we go; p[k] >= k so only unused entries are moved)
  for k in range(n) :
    if p[k] != k :
      rhs[k], rhs[p[k]] = rhs[p[k]], rhs[k]
    x[k] = rhs[k]
    for i in range(k) :
      x[k] -= x[i] * A[(k,i)]
    x[k] /= A[(k,k)]

  # back substitute for x = U^-1 (L^-1 b); no division is needed
  # because the diagonal of U is 1
  for k in reversed(range(n)) :
    for i in range(k + 1, n) :
      x[k] -= x[i] * A[(k,i)]
  return x

# Least squares polynomial fit: find the n coefficients c[0..n-1] of
# c[0] + c[1] * x + ... + c[n-1] * x^(n-1) that best fit the points
# (x[i], y[i]) by building and solving the normal equations.
#
# Returns the list of coefficients, or None if LU_decompose reports the
# matrix as singular.
def lsq_solve(x, y, n):
  # right hand side: sums of x^row * y over all points
  rhs = [sum(xi ** row * yi for xi, yi in zip(x, y)) for row in range(n)]

  # symmetric matrix of power sums, held as a dictionary
  normal = {}
  for row in range(n):
    for col in range(row, n):
      power_sum = sum(xi ** (row + col) for xi in x)
      normal[(row, col)] = power_sum
      normal[(col, row)] = power_sum

  # factor the matrix, then solve if it was not singular
  pivots = LU_decompose(normal, n)
  if pivots is None:
    return None
  return LU_solve(normal, pivots, rhs)

# Fit a polynomial with lsq_size coefficients to the points (x, y) and
# discard outliers.
#
#   x, y      lists of measurements; points further than twice the RMS
#             deviation from the fitted curve are deleted from both
#             lists in place
#   lsq_size  number of polynomial coefficients to fit
#
# Returns the coefficient list, or None when no fit could be computed
# (singular system or no data) or when the fit is judged unstable.
def do_lsq(x, y, lsq_size) :
  # get least squares coefficients
  f = lsq_solve(x, y, lsq_size)

  # lsq_solve returns None for a singular system; report that as
  # 'no result' instead of failing on f[j] below, and avoid dividing
  # by zero when there are no points
  if f is None or not x :
    return None

  # value of the fitted polynomial at xv
  def fitted(xv) :
    return sum(f[j] * xv ** j for j in range(lsq_size))

  # now find the standard deviation from the curve
  s = 0
  for i in range(len(x)) :
    s += (y[i] - fitted(x[i])) ** 2
  sd = 2 * math.sqrt(s / len(x))

  # now remove 'outliers' - data points outside twice
  # the standard deviation (iterating backwards so that
  # deletions do not disturb the indices still to visit)
  sc = 0
  for i in reversed(range(len(x))) :
    if math.fabs(y[i] - fitted(x[i])) > sd :
      del x[i]
      del y[i]
      sc += 1

  # if we had to remove more than 10% of measurements
  # declare that the result is not stable (the count is
  # compared with the number of points that remain)
  if 10 * sc > len(x) :
    return None
  return f

# Driver: run 'speed' for every routine listed in ll and lq, fit a
# polynomial to the (size, time) pairs it prints and output the leading
# coefficients for each routine.
print('Machine:', platform.processor())
print('Running:', platform.platform())
print('SPEED CURVE (l: no of limbs) cycles: c[0] + c[1] * l + c[2] * l^2')
print('ROUTINE                      ', end = '')
print('        c[0]        c[1]        c[2]')
lines = ''
cnt = 0
# number of coefficients fitted (terms up to l^3); only the first two
# or three are printed below
lsq_size = 4
for args in ll + lq :
  cnt += 1
  # run speed for each routine in the list above
  ret = run_exe(os.path.join(dir, 'speed'), args, '')
  # parse the output to produce limbs[] and times[]
  x = []
  y = []
  lines = ret[1].split('\n')
  for l in lines :
    s = l.split()
    # a data line needs at least two fields; shorter (or blank) lines
    # are skipped rather than failing on t[0] / t[1] below
    if len(s) < 2 :
      continue
    try :
      t = [float(i) for i in s]
    except ValueError :
      # not a line of numbers (e.g. a heading)
      continue
    x.append(t[0])
    y.append(t[1])

  # output the name of the routine
  nn = args.split(' ')[-1]
  print('{0:<30s}'.format(nn) , end='')
  if not x :
    print('(failed to parse output)')
    continue

  # Repeat the fit until it is stable: each do_lsq call removes outliers
  # from x and y, so a later pass works on cleaned data.  Entries found
  # in ll get up to three passes, all others up to two.
  q = 0 if args in ll else 1
  rep = q
  while rep < 3 :
    rep += 1
    f = do_lsq(x, y, lsq_size)
    if f is not None :
      break
  else :
    # no pass produced a stable fit
    print('not stable')
    continue
  # entries found in lq report three coefficients, the rest two
  if args in lq :
    print('{0[0]:11.1f} {0[1]:11.1f} {0[2]:11.1f}'.format(f))
  else :
    print('{0[0]:11.1f} {0[1]:11.1f}'.format(f))
Add build files for Visual Studio 2019 2019-03-07 08:42:42 -05:00
			`# A Python program to run speed and evaluate the performance of MPIR`
			`# routines.`
			`#`
			`# Copyright (c) 2009, Brian Gladman, Worcester, UK.`
			`#`
			`# This file is part of the MPIR Library. The MPIR Library is free`
			`# software; you can redistribute it and/or modify it under the terms`
			`# of the GNU Lesser General Public License version 2.1 as published`
			`# by the Free Software Foundation.`
			`#`
			`# The MPIR Library is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`# Lesser General Public License for more details. You should have`
			`# received a copy of the GNU Lesser General Public License along`
			`# with the MPIR Library; see the file COPYING.LIB. If not, write to`
			`# the Free Software Foundation, Inc., 51Franklin Street, Fifth Floor,`
			`# Boston, MA 02110-1301, USA.`

			`from __future__ import print_function`
			`import sys`
			`import os`
			`import shutil`
			`import string`
			`import copy`
			`import code`
			`import math`
			`import platform`
			`from subprocess import Popen, PIPE, STDOUT`

			`if sys.platform.startswith('win'):`
			`dir = '.\\x64\\release\\'`
			`else :`
			`dir = './'`

			`ll = [`
			`'-c -s 10(10)1000 noop',`
			`'-c -s 10(10)1000 noop_wxs',`
			`'-c -s 10(10)1000 noop_wxys',`
			`'-c -s 10(10)1000 mpn_add_n',`
			`'-c -s 10(10)1000 mpn_sub_n',`
			`'-c -s 10(10)1000 mpn_addadd_n',`
			`'-c -s 10(10)1000 mpn_subadd_n',`
			`'-c -s 10(10)1000 mpn_addsub_n',`
			`'-c -s 10(10)1000 mpn_karaadd',`
			`'-c -s 10(10)1000 mpn_karasub',`
			`'-c -s 10(10)1000 mpn_addmul_1.3333',`
			`'-c -s 10(10)1000 mpn_submul_1.3333',`
			`'-c -s 10(10)1000 mpn_submul_2',`
			`'-c -s 10(10)1000 mpn_mul_1.3333',`
			`'-c -s 10(10)1000 mpn_mul_1_inplace.3333',`
			`'-c -s 10(10)1000 mpn_mul_2',`
			`'-c -s 10(10)1000 mpn_divrem_euclidean_qr_1.3333',`
			`'-c -s 10(10)1000 mpn_divrem_euclidean_qr_2',`
			`'-c -s 10(10)1000 mpn_divrem_euclidean_r_1.3333',`
			`'-c -s 10(10)1000 mpn_divrem_hensel_qr_1.3333',`
			`'-c -s 10(10)1000 mpn_divrem_hensel_qr_1_1.3333',`
			`'-c -s 10(10)1000 mpn_divrem_hensel_qr_1_2.3333',`
			`'-c -s 10(10)1000 mpn_divrem_hensel_r_1.3333',`
			`'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1.3333',`
			`'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_1.3333',`
			`'-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_2.3333',`
			`'-c -s 10(10)1000 mpn_divrem_hensel_rsh_qr_1.3333',`
			`'-c -s 10(10)1000 mpn_divrem_1.3333',`
			`'-c -s 10(10)1000 mpn_divrem_1f.3333',`
			`'-c -s 10(10)1000 mpn_mod_1.3333',`
			`'-c -s 10(10)1000 mpn_mod_1_1',`
			`'-c -s 10(10)1000 mpn_mod_1_2',`
			`'-c -s 10(10)1000 mpn_mod_1_3',`
			`'-c -s 10(10)1000 mpn_mod_1_k.3',`
			`'-c -s 10(10)1000 mpn_preinv_divrem_1.3333',`
			`'-c -s 10(10)1000 mpn_preinv_divrem_1f.3333',`
			`'-c -s 10(10)1000 mpn_preinv_mod_1.3333',`
			`'-c -s 10(10)1000 mpn_add_err1_n',`
			`'-c -s 10(10)1000 mpn_sub_err1_n',`
			`'-c -s 10(10)1000 mpn_inv_divappr_q',`
			`'-c -s 10(10)1000 mpn_inv_div_qr',`
			`'-c -s 10(10)1000 mpn_dc_divappr_q',`
			`'-c -s 10(10)1000 mpn_dc_div_qr_n',`
			`'-c -s 10(10)1000 mpn_divrem_1_inv.3333',`
			`'-c -s 10(10)1000 mpn_divrem_1f_div.3333',`
			`'-c -s 10(10)1000 mpn_divrem_1f_inv.3333',`
			`'-c -s 10(10)1000 mpn_mod_1_div.3333',`
			`'-c -s 10(10)1000 mpn_mod_1_inv.3333',`
			`'-c -s 10(10)1000 mpn_divrem_2',`
			`'-c -s 10(10)1000 mpn_divrem_2_div',`
			`'-c -s 10(10)1000 mpn_divrem_2_inv',`
			`'-c -s 10(10)1000 mpn_divexact_1.3333',`
			`'-c -s 10(10)1000 mpn_divexact_by3',`
			`'-c -s 10(10)1000 mpn_divexact_byff',`
			`'-c -s 10(10)1000 mpn_divexact_byfobm1.3333',`
			`'-c -s 10(10)1000 mpn_modexact_1_odd.333',`
			`'-c -s 10(10)1000 mpn_modexact_1c_odd.333',`
			`'-c -s 10(10)1000 mpn_mod_34lsub1',`
			`'-c -s 10(10)1000 mpn_dc_tdiv_qr',`
			`'-c -s 10(10)1000 mpn_lshift.33',`
			`'-c -s 10(10)1000 mpn_rshift.33',`
			`'-c -s 10(10)1000 mpn_lshift1',`
			`'-c -s 10(10)1000 mpn_rshift1',`
			`'-c -s 10(10)1000 mpn_double',`
			`'-c -s 10(10)1000 mpn_half',`
			`'-c -s 10(10)1000 mpn_lshift2',`
			`'-c -s 10(10)1000 mpn_rshift2',`
			`'-c -s 10(10)1000 mpn_and_n',`
			`'-c -s 10(10)1000 mpn_andn_n',`
			`'-c -s 10(10)1000 mpn_nand_n',`
			`'-c -s 10(10)1000 mpn_ior_n',`
			`'-c -s 10(10)1000 mpn_iorn_n',`
			`'-c -s 10(10)1000 mpn_nior_n',`
			`'-c -s 10(10)1000 mpn_xor_n',`
			`'-c -s 10(10)1000 mpn_xnor_n',`
			`'-c -s 10(10)1000 mpn_com_n',`
			`'-c -s 10(10)1000 mpn_not',`
			`'-c -s 10(10)1000 mpn_popcount',`
			`'-c -s 10(10)1000 mpn_hamdist',`

			`'-c -s 10(10)1000 MPN_ZERO',`
			`'-c -s 10(10)1000 MPN_COPY',`
			`'-c -s 10(10)1000 MPN_COPY_INCR',`
			`'-c -s 10(10)1000 MPN_COPY_DECR',`

			`'-c -s 10(10)1000 count_leading_zeros',`
			`'-c -s 10(10)1000 gmp_allocate_free',`
			`'-c -s 10(10)1000 malloc_realloc_free',`
			`'-c -s 10(10)1000 gmp_allocate_reallocate_free',`
			`'-c -s 10(10)1000 malloc_free',`
			`'-c -s 10(10)1000 mpn_umul_ppmm',`
			`'-c -s 10(10)1000 mpz_add',`
			`'-c -s 10(10)1000 mpz_init_realloc_clear',`
			`'-c -s 10(10)1000 mpz_init_clear',`
			`'-c -s 10(10)1000 udiv_qrnnd',`
			`'-c -s 10(10)1000 udiv_qrnnd_c',`
			`'-c -s 10(10)1000 udiv_qrnnd_preinv1',`
			`'-c -s 10(10)1000 udiv_qrnnd_preinv2',`
			`'-c -s 10(10)1000 umul_ppmm',`
			`'-c -s 10(10)1000 mpn_popcount',`
			`'-c -s 10(10)1000 mpn_hamdist',`

			`]`

			`lq = [`
			`'-c -s 10(10)1000 mpn_dc_divrem_n',`
			`'-c -s 10(10)1000 mpn_dc_divrem_sb',`
			`'-c -s 10(10)1000 mpn_dc_tdiv_qr',`
			`'-c -s 10(10)1000 mpn_kara_mul_n',`
			`'-c -s 10(10)1000 mpn_kara_sqr_n',`
			`'-c -s 10(10)1000 mpn_mul_basecase',`
			`'-c -s 1000(500)10000 -t 10 mpn_mul_fft_full',`
			`'-c -s 10(10)1000 mpn_mul_n',`
			`'-c -s 10(10)1000 mpn_sqr_basecase',`
			`'-c -s 10(10)1000 mpn_sqr_n',`
			`'-c -s 50(10)1000 mpn_toom3_mul_n',`
			`'-c -s 50(10)1000 mpn_toom3_sqr_n',`
			`'-c -s 1(5)100 mpz_powm',`
			`]`

			`# run an executable and return its error return value and any output`
			`def run_exe(exe, args, inp) :`
			`al = {'stdin' : PIPE, 'stdout' : PIPE, 'stderr' : STDOUT }`
			`if sys.platform.startswith('win') :`
			`al['creationflags'] = 0x08000000`
			`p = Popen([exe] + args.split(' '), **al)`
			`res = p.communicate(inp.encode())[0].decode()`
			`ret = p.poll()`
			`return (ret, res)`

			`# output a matrix implemented as a dictionary`
			`def mout(m, n) :`
			`for r in range(n) :`
			`print('\n{0:3d}'.format(r), end='')`
			`for c in range(n) :`
			`print('{0:18.4f}'.format(m[(r,c)]) , end='')`
			`print`

			`# output a vector`
			`def vout(v) :`
			`print(' ' , end='')`
			`for c in v :`
			`print('{0:18.4f}'.format(c) , end='')`
			`print()`

			`# In-place LU matrix decomposition. The diagonal`
			`# elements of the upper triangular matrix U are`
			`# all 1 and are not stored. Pivoting is used and`
			`# the matrix is implemented as a dictionary. It`
			`# is only intended for use with small matrices.`

			`def LU_decompose(A, n) :`
			`p = [0] * n`
			`for k in range(n) :`
			`# find pivot`
			`p[k] = k`
			`max = math.fabs(A[(k,k)])`
			`for j in range(k + 1, n) :`
			`if max < math.fabs(A[(j,k)]) :`
			`max = math.fabs(A[(j,k)])`
			`p[k] = j`
			`# exchange rows if necessary`
			`if p[k] != k :`
			`for j in range(n) :`
			`A[(k,j)], A[(p[k],j)] = A[(p[k],j)], A[(k,j)]`

			`# exit if matrix is singular`
			`if A[(k,k)] == 0.0 :`
			`return None`

			`# set upper triangular elements`
			`for j in range(k + 1,n) :`
			`A[(k,j)] /= A[(k,k)]`

			`# update remaining part of original matrix`
			`for i in range(k + 1, n) :`
			`for j in range(k + 1, n) :`
			`A[(i,j)] -= A[(i,k)] * A[(k,j)]`

			`# return pivot array`
			`return p`

			`# Use the LU decomposition above to solve the matrix`
			`# equation A x = b for x given A and b`

			`def LU_solve(A, p, b) :`
			`n = len(p)`
			`x = [0] * n`

			`# calculate U x = L^-1 b`
			`for k in range(n) :`
			`if p[k] != k :`
			`b[k], b[p[k]] = b[p[k]], b[k]`
			`x[k] = b[k]`
			`for i in range(k) :`
			`x[k] -= x[i] * A[(k,i)]`
			`x[k] /= A[(k,k)]`

			`# back substitute for x = U^-1 (L^-1 b)`
			`for k in reversed(range(n)) :`
			`if p[k] != k :`
			`b[k], b[p[k]], b[p[k]], b[k]`
			`for i in range(k + 1, n) :`
			`x[k] -= x[i] * A[(k,i)]`
			`return x`

			`def lsq_solve(x, y, n) :`
			`m = {} # matrix as dictionary`
			`v = [] # vector as list`
			`# set up matrix and vectors for least squares`
			`for i in range(n) :`
			`v.append(sum(xx ** i * yy for xx, yy in zip(x, y)))`
			`for j in range(i, n) :`
			`m[(i,j)] = m[(j,i)] = sum(xx ** (i + j) for xx in x)`
			`# decompose the matrix into lower and upper triangular`
			`# matrices`
			`p = LU_decompose(m, n)`
			`if p != None :`
			`return LU_solve(m, p, v)`
			`else :`
			`return None`

			`def do_lsq(x, y, lsq_size) :`
			`# get least squares coefficients`
			`f = lsq_solve(x, y, lsq_size)`

			`# now find the standard deviation from the curve`
			`s = 0`
			`for i in range(len(x)) :`
			`t = sum(f[j] * x[i] ** j for j in range(lsq_size))`
			`s += (y[i] - t) ** 2`
			`sd = 2 * math.sqrt(s / len(x))`

			`# now remove 'outliers' - data points outside twice`
			`# the standard deviation`
			`sc = 0`
			`for i in reversed(range(len(x))) :`
			`t = sum(f[j] * x[i] ** j for j in range(lsq_size))`
			`if math.fabs(y[i] - t) > sd :`
			`del x[i]`
			`del y[i]`
			`sc += 1`

			`# if we had to remove more than 10% of measurements`
			`# declare that the result is not stable`
			`if 10 * sc > len(x) :`
			`return None`
			`else :`
			`return f`

			`print('Machine:', platform.processor())`
			`print('Running:', platform.platform())`
			`print('SPEED CURVE (l: no of limbs) cycles: c[0] + c[1] * l + c[2] * l^2')`
			`print('ROUTINE ', end = '')`
			`print(' c[0] c[1] c[2]')`
			`lines = ''`
			`cnt = 0`
			`lsq_size = 4`
			`for args in ll + lq :`
			`cnt += 1`
			`# run speed for each routine in the list above`
			`ret = run_exe(os.path.join(dir, 'speed'), args, '')`
			`# parse the output to produce limbs[] and times[]`
			`x = []`
			`y = []`
			`lines = ret[1].split('\n')`
			`for l in lines :`
			`if len(l) :`
			`s = l.split()`
			`try :`
			`t = [float(i) for i in s]`
			`except :`
			`continue`
			`x += [t[0]]`
			`y += [t[1]]`

			`# output the name of the routine`
			`nn = args.split(' ')[-1]`
			`print('{0:<30s}'.format(nn) , end='')`
			`if not len(x) :`
			`# print(ret[1].strip(), '(failed to parse output)')`
			`print('(failed to parse output)')`
			`continue`

			`q = 0 if args in ll else 1`
			`rep = q`
			`while rep < 3 :`
			`rep += 1`
			`f = do_lsq(x, y, lsq_size)`
			`if f != None :`
			`break`
			`else :`
			`print('not stable')`
			`continue`
			`if args in lq :`
			`print('{0[0]:11.1f} {0[1]:11.1f} {0[2]:11.1f}'.format(f))`
			`else :`
			`print('{0[0]:11.1f} {0[1]:11.1f}'.format(f))`