# A Python program to run speed and evaluate the performance of MPIR
# routines.
#
# Copyright (c) 2009, Brian Gladman, Worcester, UK.
#
# This file is part of the MPIR Library.  The MPIR Library is free
# software; you can redistribute it and/or modify it under the terms
# of the GNU Lesser General Public License version 2.1 as published
# by the Free Software Foundation.
#
# The MPIR Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.  You should have
# received a copy of the GNU Lesser General Public License along
# with the MPIR Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.

from __future__ import print_function
import sys
import os
import shutil
import string
import copy
import code
import math
import platform
from subprocess import Popen, PIPE, STDOUT

# directory that is joined with 'speed' to locate the executable to
# run: the .\x64\release\ subdirectory on Windows platforms and the
# current directory on all others
dir = '.\\x64\\release\\' if sys.platform.startswith('win') else './'

# Argument strings passed to the speed program, one per routine; the
# routine name is the last word of each string.  For the routines in
# this list only the coefficients c[0] and c[1] of the fitted curve
# are reported.  Every entry appears exactly once: the repeated
# mpn_popcount and mpn_hamdist entries have been removed, as has
# mpn_dc_tdiv_qr, which is already measured through the lq list.
ll = [
  # overhead measurements
  '-c -s 10(10)1000 noop',
  '-c -s 10(10)1000 noop_wxs',
  '-c -s 10(10)1000 noop_wxys',

  # addition, subtraction and multiplication by one or two limbs
  '-c -s 10(10)1000 mpn_add_n',
  '-c -s 10(10)1000 mpn_sub_n',
  '-c -s 10(10)1000 mpn_addadd_n',
  '-c -s 10(10)1000 mpn_subadd_n',
  '-c -s 10(10)1000 mpn_addsub_n',
  '-c -s 10(10)1000 mpn_karaadd',
  '-c -s 10(10)1000 mpn_karasub',
  '-c -s 10(10)1000 mpn_addmul_1.3333',
  '-c -s 10(10)1000 mpn_submul_1.3333',
  '-c -s 10(10)1000 mpn_submul_2',
  '-c -s 10(10)1000 mpn_mul_1.3333',
  '-c -s 10(10)1000 mpn_mul_1_inplace.3333',
  '-c -s 10(10)1000 mpn_mul_2',

  # division and remainder
  '-c -s 10(10)1000 mpn_divrem_euclidean_qr_1.3333',
  '-c -s 10(10)1000 mpn_divrem_euclidean_qr_2',
  '-c -s 10(10)1000 mpn_divrem_euclidean_r_1.3333',
  '-c -s 10(10)1000 mpn_divrem_hensel_qr_1.3333',
  '-c -s 10(10)1000 mpn_divrem_hensel_qr_1_1.3333',
  '-c -s 10(10)1000 mpn_divrem_hensel_qr_1_2.3333',
  '-c -s 10(10)1000 mpn_divrem_hensel_r_1.3333',
  '-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1.3333',
  '-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_1.3333',
  '-c -s 10(10)1000 mpn_rsh_divrem_hensel_qr_1_2.3333',
  '-c -s 10(10)1000 mpn_divrem_hensel_rsh_qr_1.3333',
  '-c -s 10(10)1000 mpn_divrem_1.3333',
  '-c -s 10(10)1000 mpn_divrem_1f.3333',
  '-c -s 10(10)1000 mpn_mod_1.3333',
  '-c -s 10(10)1000 mpn_mod_1_1',
  '-c -s 10(10)1000 mpn_mod_1_2',
  '-c -s 10(10)1000 mpn_mod_1_3',
  '-c -s 10(10)1000 mpn_mod_1_k.3',
  '-c -s 10(10)1000 mpn_preinv_divrem_1.3333',
  '-c -s 10(10)1000 mpn_preinv_divrem_1f.3333',
  '-c -s 10(10)1000 mpn_preinv_mod_1.3333',
  '-c -s 10(10)1000 mpn_add_err1_n',
  '-c -s 10(10)1000 mpn_sub_err1_n',
  '-c -s 10(10)1000 mpn_inv_divappr_q',
  '-c -s 10(10)1000 mpn_inv_div_qr',
  '-c -s 10(10)1000 mpn_dc_divappr_q',
  '-c -s 10(10)1000 mpn_dc_div_qr_n',
  '-c -s 10(10)1000 mpn_divrem_1_inv.3333',
  '-c -s 10(10)1000 mpn_divrem_1f_div.3333',
  '-c -s 10(10)1000 mpn_divrem_1f_inv.3333',
  '-c -s 10(10)1000 mpn_mod_1_div.3333',
  '-c -s 10(10)1000 mpn_mod_1_inv.3333',
  '-c -s 10(10)1000 mpn_divrem_2',
  '-c -s 10(10)1000 mpn_divrem_2_div',
  '-c -s 10(10)1000 mpn_divrem_2_inv',
  '-c -s 10(10)1000 mpn_divexact_1.3333',
  '-c -s 10(10)1000 mpn_divexact_by3',
  '-c -s 10(10)1000 mpn_divexact_byff',
  '-c -s 10(10)1000 mpn_divexact_byfobm1.3333',
  '-c -s 10(10)1000 mpn_modexact_1_odd.333',
  '-c -s 10(10)1000 mpn_modexact_1c_odd.333',
  '-c -s 10(10)1000 mpn_mod_34lsub1',

  # shifts
  '-c -s 10(10)1000 mpn_lshift.33',
  '-c -s 10(10)1000 mpn_rshift.33',
  '-c -s 10(10)1000 mpn_lshift1',
  '-c -s 10(10)1000 mpn_rshift1',
  '-c -s 10(10)1000 mpn_double',
  '-c -s 10(10)1000 mpn_half',
  '-c -s 10(10)1000 mpn_lshift2',
  '-c -s 10(10)1000 mpn_rshift2',

  # logical operations and bit counting
  '-c -s 10(10)1000 mpn_and_n',
  '-c -s 10(10)1000 mpn_andn_n',
  '-c -s 10(10)1000 mpn_nand_n',
  '-c -s 10(10)1000 mpn_ior_n',
  '-c -s 10(10)1000 mpn_iorn_n',
  '-c -s 10(10)1000 mpn_nior_n',
  '-c -s 10(10)1000 mpn_xor_n',
  '-c -s 10(10)1000 mpn_xnor_n',
  '-c -s 10(10)1000 mpn_com_n',
  '-c -s 10(10)1000 mpn_not',
  '-c -s 10(10)1000 mpn_popcount',
  '-c -s 10(10)1000 mpn_hamdist',

  # zeroing and copying
  '-c -s 10(10)1000 MPN_ZERO',
  '-c -s 10(10)1000 MPN_COPY',
  '-c -s 10(10)1000 MPN_COPY_INCR',
  '-c -s 10(10)1000 MPN_COPY_DECR',

  # miscellaneous
  '-c -s 10(10)1000 count_leading_zeros',
  '-c -s 10(10)1000 gmp_allocate_free',
  '-c -s 10(10)1000 malloc_realloc_free',
  '-c -s 10(10)1000 gmp_allocate_reallocate_free',
  '-c -s 10(10)1000 malloc_free',
  '-c -s 10(10)1000 mpn_umul_ppmm',
  '-c -s 10(10)1000 mpz_add',
  '-c -s 10(10)1000 mpz_init_realloc_clear',
  '-c -s 10(10)1000 mpz_init_clear',
  '-c -s 10(10)1000 udiv_qrnnd',
  '-c -s 10(10)1000 udiv_qrnnd_c',
  '-c -s 10(10)1000 udiv_qrnnd_preinv1',
  '-c -s 10(10)1000 udiv_qrnnd_preinv2',
  '-c -s 10(10)1000 umul_ppmm',
  ]

# Argument strings for the routines whose fitted curve is reported
# with three coefficients (c[0], c[1] and c[2]).  Each pair below gives
# the text that follows '-c -s ' (the size range plus any further
# options) and the name of the routine to be measured.
lq = list('-c -s {0} {1}'.format(*spec) for spec in (
  ('10(10)1000',           'mpn_dc_divrem_n'),
  ('10(10)1000',           'mpn_dc_divrem_sb'),
  ('10(10)1000',           'mpn_dc_tdiv_qr'),
  ('10(10)1000',           'mpn_kara_mul_n'),
  ('10(10)1000',           'mpn_kara_sqr_n'),
  ('10(10)1000',           'mpn_mul_basecase'),
  ('1000(500)10000 -t 10', 'mpn_mul_fft_full'),
  ('10(10)1000',           'mpn_mul_n'),
  ('10(10)1000',           'mpn_sqr_basecase'),
  ('10(10)1000',           'mpn_sqr_n'),
  ('50(10)1000',           'mpn_toom3_mul_n'),
  ('50(10)1000',           'mpn_toom3_sqr_n'),
  ('1(5)100',              'mpz_powm'),
  ))

def run_exe(exe, args, inp) :
  """Run an executable and return its return code and its output.

  exe  -- path of the program to run
  args -- the command line arguments as a single string; it is split
          at every single space, so an individual argument cannot
          itself contain a space
  inp  -- text that is encoded and sent to the program's standard input

  Returns the tuple (ret, res) where ret is the return code of the
  process and res is the decoded text that it wrote to standard output
  and standard error (the two streams are merged).
  """
  options = dict(stdin = PIPE, stdout = PIPE, stderr = STDOUT)
  if sys.platform.startswith('win') :
    # 0x08000000 is the Windows CREATE_NO_WINDOW process creation flag
    options['creationflags'] = 0x08000000
  command = [exe]
  command.extend(args.split(' '))
  proc = Popen(command, **options)
  # communicate() waits for the process to end, so the return code is
  # available once it returns
  output = proc.communicate(inp.encode())[0].decode()
  return (proc.returncode, output)

def mout(m, n) :
  """Print an n by n matrix that is implemented as a dictionary.

  m -- dictionary holding the matrix elements, indexed by the tuple
       (row, column) with both indices running from 0 to n - 1
  n -- the number of rows and columns to print

  Each row is started on a new line with its row number in a field of
  width 3 and is followed by its elements, each in a field of width 18
  with 4 decimal places.
  """
  for r in range(n) :
    print('\n{0:3d}'.format(r), end='')
    for c in range(n) :
      print('{0:18.4f}'.format(m[(r,c)]) , end='')
  # terminate the last row; print must be called here because a bare
  # 'print' is only a name reference and outputs nothing
  print()

def vout(v) :
  """Print the elements of a vector on a single line.

  v -- an iterable of numbers

  The line starts with three spaces and each element is printed in a
  field of width 18 with 4 decimal places.
  """
  cells = ['{0:18.4f}'.format(c) for c in v]
  print('   ' + ''.join(cells))

def LU_decompose(A, n) :
  """Decompose an n by n matrix in place into L and U factors.

  A -- the matrix as a dictionary indexed by (row, column) tuples; it
       is overwritten with the factors
  n -- the number of rows and columns

  On return the lower triangle of A (including the diagonal) holds L
  and the part above the diagonal holds U; the diagonal elements of U
  are all 1 and are not stored.  Rows are exchanged so that the pivot
  is the element of largest magnitude in its column.  The routine is
  only intended for use with small matrices.

  Returns the list p in which p[k] is the row that was exchanged with
  row k at step k, or None if a zero pivot shows that the matrix is
  singular (A is then left partly modified).
  """
  p = [0] * n
  for k in range(n) :
    # find the pivot: the row at or below k whose element in column k
    # has the largest magnitude (the first such row when several tie)
    pivot = max(range(k, n), key = lambda r : math.fabs(A[(r,k)]))
    p[k] = pivot

    # exchange rows if necessary
    if pivot != k :
      for c in range(n) :
        A[(k,c)], A[(pivot,c)] = A[(pivot,c)], A[(k,c)]

    # exit if matrix is singular
    diag = A[(k,k)]
    if diag == 0.0 :
      return None

    # scale the rest of row k to give the elements of U
    for c in range(k + 1, n) :
      A[(k,c)] /= diag

    # update remaining part of original matrix
    for r in range(k + 1, n) :
      factor = A[(r,k)]
      for c in range(k + 1, n) :
        A[(r,c)] -= factor * A[(k,c)]

  # return pivot array
  return p

def LU_solve(A, p, b) :
  """Solve the matrix equation A x = b using an LU decomposition.

  A -- the decomposed matrix as a dictionary indexed by (row, column)
       tuples: L in the lower triangle including the diagonal and U
       above the diagonal, with the unit diagonal of U not stored
  p -- the pivot list in which p[k] is the row exchanged with row k
  b -- the right hand side as a sequence of len(p) numbers; it is not
       modified

  Returns the solution x as a list.
  """
  n = len(p)
  # the row exchanges are applied to a private copy so that the
  # caller's right hand side is left in its original order
  rhs = list(b)
  x = [0] * n

  # calculate U x = L^-1 b, applying the row exchanges recorded in p
  # in the same order in which the decomposition made them
  for k in range(n) :
    if p[k] != k :
      rhs[k], rhs[p[k]] = rhs[p[k]], rhs[k]
    x[k] = rhs[k]
    for i in range(k) :
      x[k] -= x[i] * A[(k,i)]
    x[k] /= A[(k,k)]

  # back substitute for x = U^-1 (L^-1 b); no division is needed here
  # because the diagonal elements of U are all 1
  for k in reversed(range(n)) :
    for i in range(k + 1, n) :
      x[k] -= x[i] * A[(k,i)]
  return x

def lsq_solve(x, y, n) :
  """Fit a polynomial with n coefficients to data by least squares.

  x -- list of the abscissa values
  y -- list of the ordinate values, paired with x in order
  n -- the number of polynomial coefficients to find

  The n by n system with matrix elements m[i][j] = sum of x^(i + j)
  and right hand side v[i] = sum of x^i * y is set up and solved with
  LU_decompose and LU_solve.

  Returns the list of n coefficients, lowest power first, or None if
  LU_decompose reports that the matrix is singular.
  """
  # right hand side vector as a list
  v = [sum(xx ** i * yy for xx, yy in zip(x, y)) for i in range(n)]

  # the matrix is symmetric so each element on or above the diagonal
  # is computed once and stored in both positions
  m = {}
  for i in range(n) :
    for j in range(i, n) :
      m[(i,j)] = m[(j,i)] = sum(xx ** (i + j) for xx in x)

  # decompose the matrix into lower and upper triangular matrices
  p = LU_decompose(m, n)
  if p is None :
    return None
  return LU_solve(m, p, v)

def do_lsq(x, y, lsq_size) :
  """Fit a curve to the data and discard the points far from it.

  x        -- list of abscissa values; outliers are deleted in place
  y        -- list of ordinate values paired with x; the matching
              elements are deleted in place
  lsq_size -- the number of polynomial coefficients to fit

  The fit is made with lsq_solve on the data as passed in.  Points
  whose distance from the fitted curve exceeds twice the root mean
  square deviation are then removed from x and y, so that a repeated
  call fits the reduced data.

  Returns the list of coefficients, or None if there is no data, if
  lsq_solve cannot solve the system, or if the result is not stable
  (ten times the number of removed points exceeds the number of
  points that remain).
  """
  if len(x) == 0 :
    return None

  # get least squares coefficients; there are none when the system is
  # singular, for example with fewer distinct x values than lsq_size
  f = lsq_solve(x, y, lsq_size)
  if f is None :
    return None

  # distance of each point from the curve, computed once and used for
  # both the deviation and the outlier test
  dev = [yy - sum(f[j] * xx ** j for j in range(lsq_size))
                                        for xx, yy in zip(x, y)]

  # now find (twice) the standard deviation from the curve
  s = 0
  for d in dev :
    s += d ** 2
  sd = 2 * math.sqrt(s / len(x))

  # now remove 'outliers' - data points outside twice the standard
  # deviation - working backwards so that the indices of the points
  # still to be tested are not changed by a deletion
  sc = 0
  for i in reversed(range(len(dev))) :
    if math.fabs(dev[i]) > sd :
      del x[i]
      del y[i]
      sc += 1

  # if we had to remove more than 10% of measurements (relative to
  # the number that remain) declare that the result is not stable
  if 10 * sc > len(x) :
    return None
  else :
    return f

# Main script: run speed for every routine in ll and lq, fit a curve
# to the timings that it reports and print the leading coefficients.
print('Machine:', platform.processor())
print('Running:', platform.platform())
print('SPEED CURVE (l: no of limbs) cycles: c[0] + c[1] * l + c[2] * l^2')
print('ROUTINE                      ', end = '')
print('        c[0]        c[1]        c[2]')
lines = ''
cnt = 0
# number of coefficients fitted to each curve; only the first two (or
# three for the routines in lq) are printed
lsq_size = 4
for args in ll + lq :
  cnt += 1
  # run speed for each routine in the list above
  ret = run_exe(os.path.join(dir, 'speed'), args, '')
  # parse the output to produce limbs[] and times[]; a data line has
  # at least two fields, all of them numbers, and any other line
  # (headings, messages, blank lines) is skipped
  x = []
  y = []
  lines = ret[1].split('\n')
  for l in lines :
    s = l.split()
    if len(s) < 2 :
      continue
    try :
      t = [float(i) for i in s]
    except ValueError :
      continue
    x.append(t[0])
    y.append(t[1])

  # output the name of the routine
  nn = args.split(' ')[-1]
  print('{0:<30s}'.format(nn) , end='')
  if not x :
    print('(failed to parse output)')
    continue

  # do_lsq removes outliers from x and y on each call, so it is
  # repeated until it reports a stable fit: at most three times for
  # routines in ll and twice for the others
  q = 0 if args in ll else 1
  for rep in range(q, 3) :
    f = do_lsq(x, y, lsq_size)
    if f is not None :
      break
  else :
    print('not stable')
    continue
  if args in lq :
    print('{0[0]:11.1f} {0[1]:11.1f} {0[2]:11.1f}'.format(f))
  else :
    print('{0[0]:11.1f} {0[1]:11.1f}'.format(f))