/* mul_fft -- split-radix fft routines for MPIR.
Copyright William Hart 2009
This file is part of the MPIR Library.
The MPIR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The MPIR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the MPIR Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
#include <stdio.h>
#include <stdlib.h>
#include "mpir.h"
#include "gmp-impl.h"
#include "flint.h"
#include "ZmodF.h"
#include "ZmodF_poly.h"
#include "longlong.h"
void mpn_to_mpz(mpz_t m, mp_limb_t * i, ulong limbs);
void set_p(mpz_t p, ulong n, ulong w);
void rand_n(mp_limb_t * n, gmp_randstate_t state, ulong limbs);
void ref_mul_2expmod(mpz_t m, mpz_t i2, mpz_t p, ulong n, ulong w, ulong d);
void ref_norm(mpz_t m, mpz_t p);
void ref_sumdiff_rshBmod(mpz_t t, mpz_t u, mpz_t i1,
mpz_t i2, mpz_t p, ulong n, ulong w, ulong x, ulong y);
/*
The function adds the signed value c to the integer r modulo B^l + 1,
where B = 2^GMP_LIMB_BITS and r occupies l + 1 limbs (the top limb being
the carry limb).
FIXME: This can be done faster for the case where c is small.
*/
void mpn_addmod_2expp1_1(mp_limb_t * r, ulong l, mp_limb_signed_t c)
{
if (c >= 0) mpn_add_1(r, r, l + 1, c);
else mpn_sub_1(r, r, l + 1, -c);
}
/*
Returns the reverse of the low depth + 1 bits of c, i.e. bit i of c becomes
bit depth - i of the result.
FIXME: this is the most inefficient implementation I could come up with. ;-)
*/
ulong revbin(ulong c, ulong depth)
{
ulong i = 0;
ulong ret = 0;
for (i = 0; i < depth + 1; i++)
if (c & (1UL<<i)) ret += (1UL<<(depth - i));
return ret;
}
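/*
Worked example: with depth = 2 the low three bits are reversed, so
revbin(3, 2) turns 011 into 110 and returns 6. This is used below to apply
the bit reversal permutation, e.g. FFT_radix2_mfa swaps entries j and
revbin(j, depth) whenever j < revbin(j, depth).
*/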
/*
FIXME: the following functions should use mp_limb_t's not ulongs
*/
/*
Splits an mpn into segments of length coeff_limbs and stores in
zero padded coefficients of length output_limbs, for use in FFT
convolution code. Assumes that the input is total_limbs in length.
The total number of coefficients written is returned.
*/
ulong FFT_split(mp_limb_t ** poly, mp_limb_t * limbs,
ulong total_limbs, ulong coeff_limbs, ulong output_limbs)
{
ulong length = (total_limbs - 1)/coeff_limbs + 1;
ulong i, j, skip;
for (skip = 0, i = 0; skip + coeff_limbs <= total_limbs; skip += coeff_limbs, i++)
{
//if (i + 1 < length)
//for (j = 0; j + 8 < output_limbs; j += 8) PREFETCH(poly[i+1], j);
MPN_ZERO(poly[i], output_limbs + 1);
// convert a coefficient
MPN_COPY(poly[i], limbs + skip, coeff_limbs);
}
if (i < length) MPN_ZERO(poly[i], output_limbs + 1);
if (total_limbs > skip) MPN_COPY(poly[i], limbs + skip, total_limbs - skip);
return length;
}
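/*
Worked example: with total_limbs = 4, coeff_limbs = 2 and output_limbs = 3,
FFT_split writes two coefficients of output_limbs + 1 = 4 limbs each,
{l0, l1, 0, 0} and {l2, l3, 0, 0} where l0, ..., l3 are the input limbs,
and returns 2.
*/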
/*
Splits an mpn into segments of length _bits_ and stores in
zero padded coefficients of length output_limbs, for use in FFT
convolution code. Assumes that the input is total_limbs in length.
Returns the total number of coefficients written.
*/
ulong FFT_split_bits(mp_limb_t ** poly, mp_limb_t * limbs,
ulong total_limbs, ulong bits, ulong output_limbs)
{
ulong length = (GMP_LIMB_BITS*total_limbs - 1)/bits + 1;
ulong i, j;
ulong top_bits = ((GMP_LIMB_BITS - 1) & bits);
if (top_bits == 0)
{
return FFT_split(poly, limbs, total_limbs, bits/GMP_LIMB_BITS, output_limbs);
}
ulong coeff_limbs = (bits/GMP_LIMB_BITS) + 1;
ulong mask = (1UL<<top_bits) - 1UL;
ulong shift_bits = 0L;
ulong * limb_ptr = limbs;
for (i = 0; i < length - 1; i++)
{
//for (j = 0; j + 8 < output_limbs; j += 8) PREFETCH(poly->coeffs[i+1], j);
MPN_ZERO(poly[i], output_limbs + 1);
// convert a coefficient
if (!shift_bits)
{
MPN_COPY(poly[i], limb_ptr, coeff_limbs);
poly[i][coeff_limbs - 1] &= mask;
limb_ptr += (coeff_limbs - 1);
shift_bits += top_bits;
} else
{
mpn_rshift(poly[i], limb_ptr, coeff_limbs, shift_bits);
limb_ptr += (coeff_limbs - 1);
shift_bits += top_bits;
if (shift_bits >= GMP_LIMB_BITS)
{
limb_ptr++;
poly[i][coeff_limbs - 1] += (limb_ptr[0] << (GMP_LIMB_BITS - (shift_bits - top_bits)));
shift_bits -= GMP_LIMB_BITS;
}
poly[i][coeff_limbs - 1] &= mask;
}
}
MPN_ZERO(poly[i], output_limbs + 1);
ulong limbs_left = total_limbs - (limb_ptr - limbs);
if (!shift_bits)
{
MPN_COPY(poly[i], limb_ptr, limbs_left);
} else
{
mpn_rshift(poly[i], limb_ptr, limbs_left, shift_bits);
}
return length;
}
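/*
Worked example: with GMP_LIMB_BITS = 64, total_limbs = 2 and bits = 40,
FFT_split_bits produces (128 - 1)/40 + 1 = 4 coefficients holding bits
0-39, 40-79, 80-119 and 120-127 of the input respectively, each zero
padded to output_limbs + 1 limbs.
*/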
/*
Recombines coefficients after doing a convolution. Assumes each of the
coefficients of the poly of the given length is output_limbs long, that each
of the coefficients is being shifted by a multiple of coeff_limbs and added
to an mpn which is total_limbs long. It is assumed that the mpn has been
zeroed in advance.
*/
void FFT_combine(mp_limb_t * res, mp_limb_t ** poly, ulong length, ulong coeff_limbs,
ulong output_limbs, ulong total_limbs)
{
ulong skip, i, j;
for (skip = 0, i = 0; (i < length) && (skip + output_limbs <= total_limbs); i++, skip += coeff_limbs)
{
//for (j = 0; j < output_limbs; j += 8) PREFETCH(poly->coeffs[i+1], j);
mpn_add(res + skip, res + skip, output_limbs + 1, poly[i], output_limbs);
}
while ((skip < total_limbs) && (i < length))
{
mpn_add(res + skip, res + skip, total_limbs - skip, poly[i], MIN(total_limbs - skip, output_limbs));
i++;
skip += coeff_limbs;
}
}
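/*
Minimal round trip sketch (illustrative only; the function name and fixed
sizes are chosen just for this example and are not part of the library):
combining with the same coeff_limbs used for splitting, into a zeroed
result buffer, reassembles the original mpn when no carries occur.
*/
static void example_split_combine_roundtrip(void)
{
   mp_limb_t in[4] = { 1, 2, 3, 4 };
   mp_limb_t out[4];
   mp_limb_t c0[4], c1[4];
   mp_limb_t * poly[2] = { c0, c1 };
   ulong length;

   length = FFT_split(poly, in, 4, 2, 3); /* c0 = {1, 2, 0, 0}, c1 = {3, 4, 0, 0} */
   MPN_ZERO(out, 4);
   FFT_combine(out, poly, length, 2, 3, 4); /* out = {1, 2, 3, 4} again */
}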
/*
Recombines coefficients of a poly after doing a convolution. Assumes
each of the coefficients of the poly of the given length is output_limbs
long, that each is being shifted by a multiple of _bits_ and added
to an mpn which is total_limbs long. It is assumed that the mpn has been
zeroed in advance.
*/
void FFT_combine_bits(mp_limb_t * res, mp_limb_t ** poly, ulong length, ulong bits,
ulong output_limbs, ulong total_limbs)
{
ulong top_bits = ((GMP_LIMB_BITS - 1) & bits);
if (top_bits == 0)
{
FFT_combine(res, poly, length, bits/GMP_LIMB_BITS, output_limbs, total_limbs);
return;
}
TMP_DECL;
TMP_MARK;
ulong coeff_limbs = (bits/GMP_LIMB_BITS) + 1;
ulong i, j;
ulong * temp = (ulong *) TMP_BALLOC_LIMBS(output_limbs + 1);
ulong shift_bits = 0;
ulong * limb_ptr = res;
ulong * end = res + total_limbs;
for (i = 0; (i < length) && (limb_ptr + output_limbs < end); i++)
{
//for (j = 0; j < output_limbs; j += 8) PREFETCH(poly->coeffs[i+1], j);
if (shift_bits)
{
mpn_lshift(temp, poly[i], output_limbs + 1, shift_bits);
mpn_add_n(limb_ptr, limb_ptr, temp, output_limbs + 1);
} else
{
mpn_add(limb_ptr, limb_ptr, output_limbs + 1, poly[i], output_limbs);
}
shift_bits += top_bits;
limb_ptr += (coeff_limbs - 1);
if (shift_bits >= GMP_LIMB_BITS)
{
limb_ptr++;
shift_bits -= GMP_LIMB_BITS;
}
}
while ((limb_ptr < end) && (i < length))
{
if (shift_bits)
{
mpn_lshift(temp, poly[i], output_limbs + 1, shift_bits);
mpn_add_n(limb_ptr, limb_ptr, temp, end - limb_ptr);
} else
{
mpn_add_n(limb_ptr, limb_ptr, poly[i], end - limb_ptr);
}
shift_bits += top_bits;
limb_ptr += (coeff_limbs - 1);
if (shift_bits >= GMP_LIMB_BITS)
{
limb_ptr++;
shift_bits -= GMP_LIMB_BITS;
}
i++;
}
TMP_FREE;
}
/*
Normalise t to be in the range [0, 2^nw]
*/
void mpn_normmod_2expp1(mp_limb_t * t, ulong l)
{
mp_limb_signed_t hi = t[l];
if (hi)
{
t[l] = CNST_LIMB(0);
mpn_addmod_2expp1_1(t, l, -hi);
hi = t[l];
// hi will be in [-1,1]
if (t[l])
{
t[l] = CNST_LIMB(0);
mpn_addmod_2expp1_1(t, l, -hi);
if (t[l] == ~CNST_LIMB(0)) // if we now have -1 (very unlikely)
{
t[l] = CNST_LIMB(0);
mpn_addmod_2expp1_1(t, l, 1);
}
}
}
}
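/*
Note on the representation used throughout: an element modulo 2^wn+1 is
stored in l + 1 limbs, l = wn/GMP_LIMB_BITS, the top limb being a small
signed "carry". Since B^l = 2^wn = -1 mod 2^wn+1 (B = 2^GMP_LIMB_BITS),
a value with carry limb hi is congruent to its low l limbs minus hi,
which is exactly what the subtraction above computes.
*/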
/*
We are given two integers modulo 2^wn+1, i1 and i2, which are not
necessarily normalised. We compute t = sqrt(-1)*(i1 - i2) and are given
n and w. Note 2^(wn/2) = sqrt(-1) mod 2^wn+1. Requires wn to be divisible
by 2*GMP_LIMB_BITS.
*/
void mpn_submod_i_2expp1(mp_limb_t * t, mp_limb_t * i1, mp_limb_t * i2, ulong l)
{
mp_limb_t cy;
mp_limb_t l2 = l/2;
t[l] = -mpn_sub_n(t + l2, i1, i2, l2);
cy = i2[l] - i1[l] - mpn_sub_n(t, i2 + l2, i1 + l2, l2);
mpn_addmod_2expp1_1(t + l2, l2, cy);
}
/*
We are given two integers modulo 2^wn+1, i1 and i2, which are
not necessarily normalised and are given n and w. We compute
t = (i1 + i2)*B^x, u = (i1 - i2)*B^y. Aliasing between inputs and
outputs is not permitted. We require x and y to be less than the
number of limbs of i1 and i2.
*/
void mpn_lshB_sumdiffmod_2expp1(mp_limb_t * t, mp_limb_t * u, mp_limb_t * i1,
mp_limb_t * i2, ulong limbs, ulong x, ulong y)
{
mp_limb_t cy, cy1, cy2;
if (x == 0)
{
if (y == 0)
{
mpn_sumdiff_n(t + x, u + y, i1, i2, limbs + 1);
} else
{
cy = mpn_sumdiff_n(t, u + y, i1, i2, limbs - y);
u[limbs] = -(cy&1);
cy1 = cy>>1;
cy = mpn_sumdiff_n(t + limbs - y, u, i2 + limbs - y, i1 + limbs - y, y);
t[limbs] = cy>>1;
mpn_add_1(t + limbs - y, t + limbs - y, y + 1, cy1);
cy1 = -(cy&1) + (i2[limbs] - i1[limbs]);
mpn_addmod_2expp1_1(u + y, limbs - y, cy1);
cy1 = -(i1[limbs] + i2[limbs]);
mpn_addmod_2expp1_1(t, limbs, cy1);
}
} else if (y == 0)
{
cy = mpn_sumdiff_n(t + x, u, i1, i2, limbs - x);
t[limbs] = cy>>1;
cy1 = cy&1;
cy = mpn_sumdiff_n(t, u + limbs - x, i1 + limbs - x, i2 + limbs - x, x);
cy2 = mpn_neg_n(t, t, x);
u[limbs] = -(cy&1);
mpn_sub_1(u + limbs - x, u + limbs - x, x + 1, cy1);
cy1 = -(cy>>1) - cy2;
cy1 -= (i1[limbs] + i2[limbs]);
mpn_addmod_2expp1_1(t + x, limbs - x, cy1);
cy1 = (i2[limbs] - i1[limbs]);
mpn_addmod_2expp1_1(u, limbs, cy1);
} else if (x > y)
{
cy = mpn_sumdiff_n(t + x, u + y, i1, i2, limbs - x);
t[limbs] = cy>>1;
cy1 = cy&1;
cy = mpn_sumdiff_n(t, u + y + limbs - x, i1 + limbs - x, i2 + limbs - x, x - y);
cy2 = mpn_neg_n(t, t, x - y);
u[limbs] = -(cy&1);
mpn_sub_1(u + y + limbs - x, u + y + limbs - x, x - y + 1, cy1);
cy1 = (cy>>1) + cy2;
cy = mpn_sumdiff_n(t + x - y, u, i2 + limbs - y, i1 + limbs - y, y);
cy2 = mpn_neg_n(t + x - y, t + x - y, y);
cy1 = -(cy>>1) - mpn_sub_1(t + x - y, t + x - y, y, cy1) - cy2;
cy1 -= (i1[limbs] + i2[limbs]);
mpn_addmod_2expp1_1(t + x, limbs - x, cy1);
cy1 = -(cy&1) + (i2[limbs] - i1[limbs]);
mpn_addmod_2expp1_1(u + y, limbs - y, cy1);
} else if (x < y)
{
cy = mpn_sumdiff_n(t + x, u + y, i1, i2, limbs - y);
u[limbs] = -(cy&1);
cy1 = cy>>1;
cy = mpn_sumdiff_n(t + x + limbs - y, u, i2 + limbs - y, i1 + limbs - y, y - x);
t[limbs] = cy>>1;
mpn_add_1(t + x + limbs - y, t + x + limbs - y, y - x + 1, cy1);
cy1 = cy&1;
cy = mpn_sumdiff_n(t, u + y - x, i2 + limbs - x, i1 + limbs - x, x);
cy1 = -(cy&1) - mpn_sub_1(u + y - x, u + y - x, x, cy1);
cy1 += (i2[limbs] - i1[limbs]);
mpn_addmod_2expp1_1(u + y, limbs - y, cy1);
cy2 = mpn_neg_n(t, t, x);
cy1 = -(cy>>1) - (i1[limbs] + i2[limbs]) - cy2;
mpn_addmod_2expp1_1(t + x, limbs - x, cy1);
} else // x == y
{
cy = mpn_sumdiff_n(t + x, u + x, i1, i2, limbs - x);
t[limbs] = cy>>1;
u[limbs] = -(cy&1);
cy = mpn_sumdiff_n(t, u, i2 + limbs - x, i1 + limbs - x, x);
cy2 = mpn_neg_n(t, t, x);
cy1 = -(cy>>1) - (i1[limbs] + i2[limbs]) - cy2;
mpn_addmod_2expp1_1(t + x, limbs - x, cy1);
cy1 = -(cy&1) + i2[limbs] - i1[limbs];
mpn_addmod_2expp1_1(u + x, limbs - x, cy1);
}
}
/*
We are given two integers modulo 2^wn+1, i1 and i2, which are
not necessarily normalised and are given n and w. We compute
t = i1/B^x + i2/B^y, u = i1/B^x - i2/B^y. Aliasing between inputs and
outputs is not permitted. We require x and y to be less than the
number of limbs of i1 and i2.
*/
void mpn_sumdiff_rshBmod_2expp1(mp_limb_t * t, mp_limb_t * u, mp_limb_t * i1,
mp_limb_t * i2, ulong limbs, ulong x, ulong y)
{
mp_limb_t cy, cy1, cy2, cy3;
if (x == 0)
{
if (y == 0)
{
mpn_sumdiff_n(t, u, i1, i2, limbs + 1);
} else // y != 0
{
cy = mpn_sumdiff_n(t, u, i1, i2 + y, limbs - y);
cy1 = (cy>>1);
cy2 = -(cy&1);
cy = mpn_sumdiff_n(u + limbs - y, t + limbs - y, i1 + limbs - y, i2, y);
u[limbs] = (cy>>1) + i1[limbs];
t[limbs] = i1[limbs] - (cy&1);
mpn_addmod_2expp1_1(t + limbs - y, y, cy1 + i2[limbs]);
mpn_addmod_2expp1_1(u + limbs - y, y, cy2 - i2[limbs]);
}
} else if (y == 0) // x != 0
{
cy = mpn_sumdiff_n(t, u, i1 + x, i2, limbs - x);
cy1 = (cy>>1);
cy2 = -(cy&1);
cy3 = mpn_neg_n(i1, i1, x);
cy = mpn_sumdiff_n(t + limbs - x, u + limbs - x, i1, i2 + limbs - x, x);
u[limbs] = -cy3 - (cy&1) - i2[limbs];
t[limbs] = -cy3 + i2[limbs] + (cy>>1);
mpn_addmod_2expp1_1(t + limbs - x, x, cy1 + i1[limbs]);
mpn_addmod_2expp1_1(u + limbs - x, x, cy2 + i1[limbs]);
} else if (x == y)
{
cy = mpn_sumdiff_n(t, u, i1 + x, i2 + x, limbs - x);
cy1 = (cy>>1);
cy2 = -(cy&1);
cy = mpn_sumdiff_n(t + limbs - x, u + limbs - x, i2, i1, x);
cy3 = mpn_neg_n(t + limbs - x, t + limbs - x, x);
u[limbs] = -(cy&1);
t[limbs] = -(cy>>1) - cy3;
mpn_addmod_2expp1_1(t + limbs - x, x, cy1 + i1[limbs] + i2[limbs]);
mpn_addmod_2expp1_1(u + limbs - x, x, cy2 + i1[limbs] - i2[limbs]);
} else if (x > y)
{
cy = mpn_sumdiff_n(t + limbs - y, u + limbs - y, i2, i1 + x - y, y);
cy3 = mpn_neg_n(t + limbs - y, t + limbs - y, y);
t[limbs] = -(cy>>1) - cy3;
u[limbs] = -(cy&1);
cy3 = mpn_neg_n(i1, i1, x - y);
cy = mpn_sumdiff_n(t + limbs - x, u + limbs - x, i1, i2 + limbs - x + y, x - y);
mpn_addmod_2expp1_1(t + limbs - y, y, (cy>>1) + i2[limbs] - cy3);
mpn_addmod_2expp1_1(u + limbs - y, y, -(cy&1) - i2[limbs] - cy3);
cy = mpn_sumdiff_n(t, u, i1 + x, i2 + y, limbs - x);
mpn_addmod_2expp1_1(t + limbs - x, x, (cy>>1) + i1[limbs]);
mpn_addmod_2expp1_1(u + limbs - x, x, -(cy&1) + i1[limbs]);
} else //(x < y)
{
cy = mpn_sumdiff_n(t + limbs - x, u + limbs - x, i2 + y - x, i1, x);
cy3 = mpn_neg_n(t + limbs - x, t + limbs - x, x);
t[limbs] = -(cy>>1) - cy3;
u[limbs] = -(cy&1);
cy3 = mpn_neg_n(i2, i2, y - x);
cy = mpn_sumdiff_n(t + limbs - y, u + limbs - y, i1 + limbs - y + x, i2, y - x);
mpn_addmod_2expp1_1(t + limbs - x, x, (cy>>1) + i1[limbs] - cy3);
mpn_addmod_2expp1_1(u + limbs - x, x, -(cy&1) + i1[limbs] + cy3);
cy = mpn_sumdiff_n(t, u, i1 + x, i2 + y, limbs - y);
mpn_addmod_2expp1_1(t + limbs - y, y, (cy>>1) + i2[limbs]);
mpn_addmod_2expp1_1(u + limbs - y, y, -(cy&1) - i2[limbs]);
}
}
/*
Given an integer i1 modulo 2^wn+1, set t to 2^d*i1 modulo 2^wn+1.
We must have GMP_LIMB_BITS > d >= 0.
*/
void mpn_mul_2expmod_2expp1(mp_limb_t * t, mp_limb_t * i1, ulong limbs, ulong d)
{
mp_limb_signed_t hi, hi2;
if (d == 0)
{
if (t != i1)
MPN_COPY(t, i1, limbs + 1);
} else
{
hi = i1[limbs];
mpn_lshift(t, i1, limbs + 1, d);
hi2 = t[limbs];
t[limbs] = CNST_LIMB(0);
mpn_sub_1(t, t, limbs + 1, hi2);
hi >>= (GMP_LIMB_BITS - d);
mpn_addmod_2expp1_1(t + 1, limbs - 1, -hi);
}
}
/*
Given an integer i1 modulo 2^wn+1, set t to i1/2^d modulo 2^wn+1.
We must have GMP_LIMB_BITS > d >= 0.
*/
void mpn_div_2expmod_2expp1(mp_limb_t * t, mp_limb_t * i1, ulong limbs, ulong d)
{
mp_limb_t lo;
mp_limb_t * ptr;
mp_limb_signed_t hi;
if (d == 0)
{
if (t != i1)
MPN_COPY(t, i1, limbs + 1);
} else
{
hi = i1[limbs];
lo = mpn_rshift(t, i1, limbs + 1, d);
t[limbs] = (hi>>d);
ptr = t + limbs - 1;
sub_ddmmss(ptr[1], ptr[0], ptr[1], ptr[0], CNST_LIMB(0), lo);
}
}
/*
Set {s, t} to { z1^i*(s+t), z2^i*(s-t) } where
z1 = exp(2*Pi*I/(2*n)), z2 = exp(2*Pi*I*3/(2*n)), z1=>w bits
*/
void FFT_split_radix_butterfly(mp_limb_t * s, mp_limb_t * t, ulong i, ulong n, ulong w, mp_limb_t * u)
{
mp_limb_t size = (w*n)/GMP_LIMB_BITS + 1;
mp_limb_t * v;
ulong x, y, b1, b2;
int negate = 0;
int negate2 = 0;
v = u + size;
b1 = i;
while (b1 >= n)
{
negate2 = 1 - negate2;
b1 -= n;
}
b1 = b1*w;
x = b1/GMP_LIMB_BITS;
b1 -= x*GMP_LIMB_BITS;
b2 = 3*i;
while (b2 >= n)
{
negate = 1 - negate;
b2 -= n;
}
b2 = b2*w;
y = b2/GMP_LIMB_BITS;
b2 -= y*GMP_LIMB_BITS;
mpn_lshB_sumdiffmod_2expp1(u, v, s, t, size - 1, x, y);
mpn_mul_2expmod_2expp1(s, u, size - 1, b1);
mpn_mul_2expmod_2expp1(t, v, size - 1, b2);
if (negate) mpn_neg_n(t, t, size);
if (negate2) mpn_neg_n(s, s, size);
}
void FFT_split_radix_butterfly2(mp_limb_t * u, mp_limb_t * v, mp_limb_t * s, mp_limb_t * t, ulong i, ulong n, ulong w)
{
mp_limb_t size = (w*n)/GMP_LIMB_BITS + 1;
ulong x, y, b1, b2;
int negate = 0;
int negate2 = 0;
b1 = i;
while (b1 >= n)
{
negate2 = 1 - negate2;
b1 -= n;
}
b1 = b1*w;
x = b1/GMP_LIMB_BITS;
b1 -= x*GMP_LIMB_BITS;
b2 = 3*i;
while (b2 >= n)
{
negate = 1 - negate;
b2 -= n;
}
b2 = b2*w;
y = b2/GMP_LIMB_BITS;
b2 -= y*GMP_LIMB_BITS;
mpn_lshB_sumdiffmod_2expp1(u, v, s, t, size - 1, x, y);
mpn_mul_2expmod_2expp1(u, u, size - 1, b1);
if (negate2) mpn_neg_n(u, u, size);
mpn_mul_2expmod_2expp1(v, v, size - 1, b2);
if (negate) mpn_neg_n(v, v, size);
}
/*
Set u = 2^b1*(s + t), v = 2^b2*(s - t) modulo 2^NW + 1
*/
void FFT_radix2_twiddle_butterfly(mp_limb_t * u, mp_limb_t * v,
mp_limb_t * s, mp_limb_t * t, ulong NW, ulong b1, ulong b2)
{
mp_limb_t size = NW/GMP_LIMB_BITS + 1;
ulong x, y;
int negate = 0;
int negate2 = 0;
b1 %= (2*NW);
if (b1 >= NW)
{
negate2 = 1;
b1 -= NW;
}
x = b1/GMP_LIMB_BITS;
b1 -= x*GMP_LIMB_BITS;
b2 %= (2*NW);
if (b2 >= NW)
{
negate = 1;
b2 -= NW;
}
y = b2/GMP_LIMB_BITS;
b2 -= y*GMP_LIMB_BITS;
mpn_lshB_sumdiffmod_2expp1(u, v, s, t, size - 1, x, y);
mpn_mul_2expmod_2expp1(u, u, size - 1, b1);
if (negate2) mpn_neg_n(u, u, size);
mpn_mul_2expmod_2expp1(v, v, size - 1, b2);
if (negate) mpn_neg_n(v, v, size);
}
/*
Set s = i1 + i2, t = z1*(i1 - i2) where z1 = exp(2*Pi*I/m) => w bits
*/
void FFT_radix2_butterfly(mp_limb_t * s, mp_limb_t * t,
mp_limb_t * i1, mp_limb_t * i2, ulong i, ulong n, ulong w)
{
mp_limb_t size = (w*n)/GMP_LIMB_BITS + 1;
ulong x, y, b1;
int negate = 0;
x = 0;
b1 = i;
while (b1 >= n)
{
negate = 1 - negate;
b1 -= n;
}
b1 = b1*w;
y = b1/GMP_LIMB_BITS;
b1 -= y*GMP_LIMB_BITS;
mpn_lshB_sumdiffmod_2expp1(s, t, i1, i2, size - 1, x, y);
mpn_mul_2expmod_2expp1(t, t, size - 1, b1);
if (negate) mpn_neg_n(t, t, size);
}
/*
Set s = i1 + z1*i2, t = i1 - z1*i2 where z1 = exp(-2*Pi*I/m) => w bits
*/
void FFT_radix2_inverse_butterfly(mp_limb_t * s, mp_limb_t * t,
mp_limb_t * i1, mp_limb_t * i2, ulong i, ulong n, ulong w)
{
mp_limb_t limbs = (w*n)/GMP_LIMB_BITS;
ulong y, b1;
b1 = i*w;
y = b1/GMP_LIMB_BITS;
b1 -= y*GMP_LIMB_BITS;
mpn_div_2expmod_2expp1(i2, i2, limbs, b1);
mpn_sumdiff_rshBmod_2expp1(s, t, i1, i2, limbs, 0, y);
}
void FFT_radix2_twiddle_inverse_butterfly(mp_limb_t * s, mp_limb_t * t,
mp_limb_t * i1, mp_limb_t * i2, ulong NW, ulong b1, ulong b2)
{
mp_limb_t limbs = NW/GMP_LIMB_BITS;
ulong x, y;
int negate = 0;
int negate2 = 0;
b1 %= (2*NW);
if (b1 >= NW)
{
negate = 1;
b1 -= NW;
}
x = b1/GMP_LIMB_BITS;
b1 -= x*GMP_LIMB_BITS;
b2 %= (2*NW);
if (b2 >= NW)
{
negate2 = 1;
b2 -= NW;
}
y = b2/GMP_LIMB_BITS;
b2 -= y*GMP_LIMB_BITS;
if (negate) mpn_neg_n(i1, i1, limbs + 1);
mpn_div_2expmod_2expp1(i1, i1, limbs, b1);
if (negate2) mpn_neg_n(i2, i2, limbs + 1);
mpn_div_2expmod_2expp1(i2, i2, limbs, b2);
mpn_sumdiff_rshBmod_2expp1(s, t, i1, i2, limbs, x, y);
}
void FFT_radix2(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp);
/*
The split radix DIF FFT works as follows:
Given: inputs [i0, i1, ..., i{m-1}], for m a power of 2
Output: Fsplit[i0, i1, ..., i{m-1}] = [r0, r1, ..., r{m-1}]
(Inputs and outputs may be subject to a stride and not in consecutive locations.)
Algorithm: * Recursively compute [r0, r2, r4, ...., r{m-2}]
= Fsplit[i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
* Let [s0, s1, ..., s{m/4-1}]
= [i0-i{m/2}, i1-i{m/2+1}, ..., i{m/4-1}-i{3m/4-1}]
* Let [t0, t1, ..., t{m/4-1}]
= sqrt(-1)[i{m/4}-i{3m/4}, i{m/4+1}-i{3m/4+1}, ..., i{m/2-1}-i{m-1}]
* Let [u0, u1, ..., u{m/4-1}]
= [z1^0*(s0+t0), z1^1(s1+t1), ..., z1^{m/4-1}*(s{m/4-1}+t{m/4-1})]
where z1 = exp(2*Pi*I/m)
* Let [v0, v1, ..., v{m/4-1}]
= [z2^0*(s0-t0), z2^1(s1-t1), ..., z2^{m/4-1}*(s{m/4-1}-t{m/4-1})]
where z2 = exp(2*Pi*I*3/m)
* Recursively compute [r1, r5, ..., r{m-3}]
= Fsplit[u0, u1, ..., u{m/4-1}]
* Recursively compute [r3, r7, ..., r{m-1}]
= Fsplit[v0, v1, ..., v{m/4-1}]
The parameters are as follows:
* 2*n is the length of the input and output arrays (m in the above)
* w is such that 2^w is a 2n-th root of unity in the ring Z/pZ that
we are working in, i.e. p = 2^wn + 1 (here n is divisible by
GMP_LIMB_BITS)
* ii is the array of inputs (each input is an array of limbs of length
wn/GMP_LIMB_BITS + 1, the extra limb being a "carry limb")
* rr is the array of outputs (each being an array of limbs of length
wn/GMP_LIMB_BITS + 1)
* rs is the stride with which entries of rr should be written (e.g.
rs = 4 means that the output is to be written in every 4th entry of
rr)
* is is the stride with which entries of ii should be read
*/
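/*
Worked example (m = 8, i.e. n = 4): [r0, r2, r4, r6] is a length 4
transform of [i0+i4, i1+i5, i2+i6, i3+i7]; with s0 = i0-i4, s1 = i1-i5,
t0 = sqrt(-1)*(i2-i6) and t1 = sqrt(-1)*(i3-i7), [r1, r5] is a length 2
transform of [s0+t0, z1*(s1+t1)] and [r3, r7] is a length 2 transform of
[s0-t0, z2*(s1-t1)], where z1 = exp(2*Pi*I/8) and z2 = exp(2*Pi*I*3/8).
*/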
void FFT_split_radix(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
{
mp_limb_t * ptr;
ulong i;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (n < 4)
{
FFT_radix2(rr, rs, ii, n, w, t1, t2, temp);
return;
}
// [s0, s1, ..., s{m/4-1}] = [i0-i{m/2}, i1-i{m/2+1}, ..., i{m/4-1}-i{3m/4-1}]
for (i = 0; i < n/2; i++)
{
mpn_sub_n(temp[i], ii[i], ii[i+n], size);
mpn_add_n(ii[i+n], ii[i], ii[i+n], size);
}
// [t0, t1, ..., t{m/4-1}] = sqrt(-1)[i{m/4}-i{3m/4}, i{m/4+1}-i{3m/4+1}, ..., i{m/2-1}-i{m-1}]
for (i = 0; i < n/2; i++)
{
mpn_submod_i_2expp1(temp[i+n/2], ii[i+n/2], ii[i+3*n/2], size - 1);
mpn_add_n(ii[i+3*n/2], ii[i+n/2], ii[i+3*n/2], size);
}
// [u0, u1, ..., u{m/4-1}] = [z1^0*(s0+t0), z1^1(s1+t1), ..., z1^{m/4-1}*(s{m/4-1}+t{m/4-1})]
// [v0, v1, ..., v{m/4-1}] = [z2^0*(s0-t0), z2^1(s1-t1), ..., z2^{m/4-1}*(s{m/4-1}-t{m/4-1})]
// where z1 = exp(2*Pi*I/m), z2 = exp(2*Pi*I*3/m), z1 => w bits
// note {u, v} = {ss, tt}
for (i = 0; i < n/2; i++)
{
FFT_split_radix_butterfly2(*t1, *t2, temp[i], temp[i+n/2], i, n, w);
ptr = temp[i];
temp[i] = *t1;
*t1 = ptr;
ptr = temp[i+n/2];
temp[i+n/2] = *t2;
*t2 = ptr;
}
// [r0, r2, r4, ...., r{m-2}] = Fsplit[i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
FFT_split_radix(rr, 2*rs, ii+n, n/2, 2*w, t1, t2, temp + n);
// [r1, r5, ..., r{m-3}] = Fsplit[u0, u1, ..., u{m/4-1}]
FFT_split_radix(rr + rs, 4*rs, temp, n/4, 4*w, t1, t2, temp + n);
// [r3, r7, ..., r{m-1}] = Fsplit[v0, v1, ..., v{m/4-1}]
FFT_split_radix(rr + 3*rs, 4*rs, temp+n/2, n/4, 4*w, t1, t2, temp + n);
}
/*
The radix 2 DIF FFT works as follows:
Given: inputs [i0, i1, ..., i{m-1}], for m a power of 2
Output: Fradix2[i0, i1, ..., i{m-1}] = [r0, r1, ..., r{m-1}]
(Inputs and outputs may be subject to a stride and not in consecutive locations.)
Algorithm: * Recursively compute [r0, r2, r4, ...., r{m-2}]
= Fradix2[i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
* Let [t0, t1, ..., t{m/2-1}]
= [i0-i{m/2}, i1-i{m/2+1}, ..., i{m/2-1}-i{m-1}]
* Let [u0, u1, ..., u{m/2-1}]
= [z1^0*t0, z1^1*t1, ..., z1^{m/2-1}*t{m/2-1}]
where z1 = exp(2*Pi*I/m)
* Recursively compute [r1, r3, ..., r{m-1}]
= Fradix2[u0, u1, ..., u{m/2-1}]
The parameters are as follows:
* 2*n is the length of the input and output arrays (m in the above)
* w is such that 2^w is a 2n-th root of unity in the ring Z/pZ that
we are working in, i.e. p = 2^wn + 1 (here n is divisible by
GMP_LIMB_BITS)
* ii is the array of inputs (each input is an array of limbs of length
wn/GMP_LIMB_BITS + 1, the extra limb being a "carry limb")
* rr is the array of outputs (each being an array of limbs of length
wn/GMP_LIMB_BITS + 1)
* rs is the stride with which entries of rr should be written (e.g.
rs = 4 means that the output is to be written in every 4th entry of
rr)
* is is the stride with which entries of ii should be read
*/
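/*
Worked example (m = 4, i.e. n = 2): [r0, r2] is a length 2 transform of
[i0+i2, i1+i3] and [r1, r3] is a length 2 transform of
[i0-i2, z1*(i1-i3)], where z1 = exp(2*Pi*I/4).
*/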
void FFT_radix2(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (n == 1)
{
FFT_radix2_butterfly(*t1, *t2, ii[0], ii[1], 0, n, w);
ptr = rr[0];
rr[0] = *t1;
*t1 = ptr;
ptr = rr[rs];
rr[rs] = *t2;
*t2 = ptr;
return;
}
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_butterfly(*t1, temp[i], ii[i], ii[n+i], i, n, w);
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2(rr, 2*rs, ii, n/2, 2*w, t1, t2, temp + n);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2(rr + rs, 2*rs, temp, n/2, 2*w, t1, t2, temp + n);
}
void FFT_radix2_new(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (n == 1)
{
FFT_radix2_butterfly(*t1, *t2, ii[0], ii[1], 0, n, w);
ptr = rr[0];
rr[0] = *t1;
*t1 = ptr;
ptr = rr[rs];
rr[rs] = *t2;
*t2 = ptr;
return;
}
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
ptr = ii[n+i];
ii[n+i] = *t2;
*t2 = ptr;
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2_new(rr, 1, ii, n/2, 2*w, t1, t2, temp);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2_new(rr + n, 1, ii+n, n/2, 2*w, t1, t2, temp);
}
int FFT_negacyclic_twiddle(mp_limb_t * r, mp_limb_t * i1, ulong i, ulong n, ulong w)
{
mp_limb_t cy;
ulong limbs = (n*w)/GMP_LIMB_BITS;
int negate = 0;
while (i >= 2*n)
{
negate = 1 - negate;
i -= 2*n;
}
ulong b1 = (w*i)/2;
ulong x = b1/GMP_LIMB_BITS;
b1 -= x*GMP_LIMB_BITS;
//if ((!x) && (negate))
if (negate) mpn_neg_n(i1, i1, limbs + 1);
mpn_mul_2expmod_2expp1(i1, i1, limbs, b1);
if (x)
{
/*if (negate)
{
r[limbs] = -mpn_neg_n(r + x, i1, limbs - x);
MPN_COPY(r, i1 + limbs - x, x);
mpn_addmod_2expp1_1(r + x, limbs - x, i1[limbs]);
} else
{*/
MPN_COPY(r + x, i1, limbs - x);
r[limbs] = CNST_LIMB(0);
cy = mpn_neg_n(r, i1 + limbs - x, x);
mpn_addmod_2expp1_1(r + x, limbs - x, -cy - i1[limbs]);
//}
return 1;
}
return 0;
}
void FFT_twiddle(mp_limb_t * r, mp_limb_t * i1, ulong i, ulong n, ulong w)
{
mp_limb_t cy;
ulong limbs = (n*w)/GMP_LIMB_BITS;
int negate = 0;
while (i >= n)
{
negate = 1 - negate;
i -= n;
}
ulong b1 = w*i;
ulong x = b1/GMP_LIMB_BITS;
b1 -= x*GMP_LIMB_BITS;
if (x)
{
MPN_COPY(r + x, i1, limbs - x);
r[limbs] = CNST_LIMB(0);
cy = mpn_neg_n(r, i1 + limbs - x, x);
mpn_addmod_2expp1_1(r + x, limbs - x, -cy - i1[limbs]);
if (negate) mpn_neg_n(r, r, limbs + 1);
mpn_mul_2expmod_2expp1(r, r, limbs, b1);
} else
{
if (negate)
{
mpn_neg_n(r, i1, limbs + 1);
mpn_mul_2expmod_2expp1(r, r, limbs, b1);
} else
mpn_mul_2expmod_2expp1(r, i1, limbs, b1);
}
}
void FFT_radix2_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs);
/*
Truncate FFT to any length given by trunc, so long as trunc is
divisible by 8.
*/
void FFT_radix2_truncate1(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
FFT_radix2_new(rr, rs, ii, n, w, t1, t2, temp);
return;
}
if (trunc <= n)
{
for (i = 0; i < n; i++)
mpn_add_n(ii[i], ii[i], ii[i+n], size);
FFT_radix2_truncate1(rr, rs, ii, n/2, 2*w, t1, t2, temp, trunc);
} else
{
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
ptr = ii[n+i];
ii[n+i] = *t2;
*t2 = ptr;
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2_new(rr, 1, ii, n/2, 2*w, t1, t2, temp);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2_truncate1(rr + n, 1, ii+n, n/2, 2*w, t1, t2, temp, trunc - n);
}
}
void FFT_radix2_truncate1_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs, ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
FFT_radix2_twiddle(ii, is, n, w, t1, t2, temp, ws, r, c, rs);
return;
}
if (trunc <= n)
{
for (i = 0; i < n; i++)
mpn_add_n(ii[i*is], ii[i*is], ii[(i+n)*is], size);
FFT_radix2_truncate1_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs, trunc);
} else
{
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_butterfly(*t1, *t2, ii[i*is], ii[(n+i)*is], i, n, w);
ptr = ii[i*is];
ii[i*is] = *t1;
*t1 = ptr;
ptr = ii[(n+i)*is];
ii[(n+i)*is] = *t2;
*t2 = ptr;
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2_truncate1_twiddle(ii + n*is, is, n/2, 2*w, t1, t2, temp, ws, r + rs, c, 2*rs, trunc - n);
}
}
/*
Truncate FFT to any length given by trunc, so long as trunc is
divisible by 8. Assumes zeros from trunc to 2*n.
*/
void FFT_radix2_truncate(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
FFT_radix2_new(rr, rs, ii, n, w, t1, t2, temp);
return;
}
if (trunc <= n)
{
// PASS
FFT_radix2_truncate(rr, rs, ii, n/2, 2*w, t1, t2, temp, trunc);
} else
{
// PASS
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < trunc - n; i++)
{
FFT_radix2_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
ptr = ii[n+i];
ii[n+i] = *t2;
*t2 = ptr;
}
for (i = trunc; i < 2*n; i++)
{
FFT_twiddle(ii[i], ii[i-n], i - n, n, w);
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2_new(rr, 1, ii, n/2, 2*w, t1, t2, temp);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2_truncate1(rr + n, 1, ii+n, n/2, 2*w, t1, t2, temp, trunc - n);
}
}
void FFT_radix2_truncate_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs, ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
FFT_radix2_twiddle(ii, is, n, w, t1, t2, temp, ws, r, c, rs);
return;
}
if (trunc <= n)
{
// PASS
FFT_radix2_truncate_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs, trunc);
} else
{
// PASS
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < trunc - n; i++)
{
FFT_radix2_butterfly(*t1, *t2, ii[i*is], ii[(n+i)*is], i, n, w);
ptr = ii[i*is];
ii[i*is] = *t1;
*t1 = ptr;
ptr = ii[(n+i)*is];
ii[(n+i)*is] = *t2;
*t2 = ptr;
}
for (i = trunc; i < 2*n; i++)
{
FFT_twiddle(ii[i*is], ii[(i-n)*is], i - n, n, w);
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2_truncate1_twiddle(ii + n*is, is, n/2, 2*w, t1, t2, temp, ws, r + rs, c, 2*rs, trunc - n);
}
}
void FFT_radix2_negacyclic(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
// we first apply twiddle factors corresponding to shifts of w*i/2 bits
// this could partially be combined with the butterflies below
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
if (FFT_negacyclic_twiddle(*t1, ii[i], i, n, w))
{
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
}
if (FFT_negacyclic_twiddle(*t1, ii[n + i], n + i, n, w))
{
ptr = ii[n + i];
ii[n + i] = *t1;
*t1 = ptr;
}
FFT_radix2_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
ptr = ii[n+i];
ii[n+i] = *t2;
*t2 = ptr;
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2_new(rr, 1, ii, n/2, 2*w, t1, t2, temp);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2_new(rr + n, 1, ii+n, n/2, 2*w, t1, t2, temp);
j = 0; k = 1; l = n+1;
temp[0] = ii[1];
ii[1] = ii[n];
for (i = 2; i < n; i += 2)
{
temp[k++] = ii[i];
ii[i] = temp[j++];
temp[k++] = ii[i+1];
ii[i+1] = ii[l++];
}
for ( ; i < 2*n; i+=2)
{
ii[i] = temp[j++];
ii[i+1] = ii[l++];
}
}
/*
FFT of length 2*n on entries of ii with stride is.
Apply additional twists by z^{c*i} where i starts at r and increases by rs
for each coeff. Note z => ws bits.
*/
void FFT_radix2_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (n == 1)
{
ulong tw1, tw2;
tw1 = r*c;
tw2 = tw1 + rs*c;
FFT_radix2_twiddle_butterfly(*t1, *t2, ii[0], ii[is], n*w, tw1*ws, tw2*ws);
ptr = ii[0];
ii[0] = *t1;
*t1 = ptr;
ptr = ii[is];
ii[is] = *t2;
*t2 = ptr;
return;
}
// [s0, s1, ..., s{m/2}] = [i0+i{m/2}, i1+i{m/2+1}, ..., i{m/2-1}+i{m-1}]
// [t0, t1, ..., t{m/2-1}]
// = [z1^0*(i0-i{m/2}), z1^1*(i1-i{m/2+1}), ..., z1^{m/2-1}*(i{m/2-1}-i{m-1})]
// where z1 = exp(2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_butterfly(*t1, *t2, ii[i*is], ii[(n+i)*is], i, n, w);
ptr = ii[i*is];
ii[i*is] = *t1;
*t1 = ptr;
ptr = ii[(n+i)*is];
ii[(n+i)*is] = *t2;
*t2 = ptr;
}
// [r0, r2, ..., r{m-2}] = Fradix2[s0, s1, ..., s{m/2-1}]
FFT_radix2_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs);
// [r1, r3, ..., r{m-1}] = Fradix2[t0, t1, ..., t{m/2-1}]
FFT_radix2_twiddle(ii+n*is, is, n/2, 2*w, t1, t2, temp, ws, r + rs, c, 2*rs);
}
void IFFT_radix2_new(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (n == 1)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[0], ii[1], 0, n, w);
ptr = rr[0];
rr[0] = *t1;
*t1 = ptr;
ptr = rr[rs];
rr[rs] = *t2;
*t2 = ptr;
return;
}
// [s0, s1, ..., s{m/2-1}] = Fradix2_inverse[i0, i2, ..., i{m-2}]
IFFT_radix2_new(ii, 1, ii, n/2, 2*w, t1, t2, temp);
// [t{m/2}, t{m/2+1}, ..., t{m-1}] = Fradix2_inverse[i1, i3, ..., i{m-1}]
IFFT_radix2_new(ii+n, 1, ii+n, n/2, 2*w, t1, t2, temp);
// [r0, r1, ..., r{m/2-1}]
// = [s0+z1^0*t0, s1+z1^1*t1, ..., s{m/2-1}+z1^{m/2-1}*t{m-1}]
// [r{m/2}, t{m/2+1}, ..., r{m-1}]
// = [s0-z1^0*t{m/2}, s1-z1^1*t{m/2+1}, ..., s{m/2-1}-z1^{m/2-1}*t{m-1}]
// where z1 = exp(-2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = rr[i];
rr[i] = *t1;
*t1 = ptr;
ptr = rr[n+i];
rr[n+i] = *t2;
*t2 = ptr;
}
}
void IFFT_radix2_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs);
void IFFT_radix2_truncate1(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
IFFT_radix2_new(rr, rs, ii, n, w, t1, t2, temp);
return;
}
if (trunc <= n)
{
// PASS
for (i = trunc; i < n; i++)
{
mpn_add_n(ii[i], ii[i], ii[i+n], size);
mpn_div_2expmod_2expp1(ii[i], ii[i], size - 1, 1);
}
IFFT_radix2_truncate1(rr, rs, ii, n/2, 2*w, t1, t2, temp, trunc);
for (i = 0; i < trunc; i++)
mpn_addsub_n(ii[i], ii[i], ii[i], ii[n+i], size);
return;
}
// [s0, s1, ..., s{m/2-1}] = Fradix2_inverse[i0, i2, ..., i{m-2}]
IFFT_radix2_new(ii, 1, ii, n/2, 2*w, t1, t2, temp);
for (i = trunc; i < 2*n; i++)
{
mpn_sub_n(ii[i], ii[i-n], ii[i], size);
FFT_twiddle(*t1, ii[i], i - n, n, w);
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
}
for (i = trunc - n; i < n; i++)
{
FFT_twiddle(*t1, ii[i + n], 2*n - i, n, w);
mpn_add_n(ii[i], ii[i], *t1, size);
}
// [t{m/2}, t{m/2+1}, ..., t{m-1}] = Fradix2_inverse[i1, i3, ..., i{m-1}]
IFFT_radix2_truncate1(ii+n, 1, ii+n, n/2, 2*w, t1, t2, temp, trunc - n);
// [r0, r1, ..., r{m/2-1}]
// = [s0+z1^0*t0, s1+z1^1*t1, ..., s{m/2-1}+z1^{m/2-1}*t{m-1}]
// [r{m/2}, t{m/2+1}, ..., r{m-1}]
// = [s0-z1^0*t{m/2}, s1-z1^1*t{m/2+1}, ..., s{m/2-1}-z1^{m/2-1}*t{m-1}]
// where z1 = exp(-2*Pi*I/m), z1 => w bits
for (i = 0; i < trunc - n; i++)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = rr[i];
rr[i] = *t1;
*t1 = ptr;
ptr = rr[n+i];
rr[n+i] = *t2;
*t2 = ptr;
}
}
void IFFT_radix2_truncate1_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs, ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
IFFT_radix2_twiddle(ii, is, n, w, t1, t2, temp, ws, r, c, rs);
return;
}
if (trunc <= n)
{
// PASS
for (i = trunc; i < n; i++)
{
mpn_add_n(ii[i*is], ii[i*is], ii[(i+n)*is], size);
mpn_div_2expmod_2expp1(ii[i*is], ii[i*is], size - 1, 1);
}
IFFT_radix2_truncate1_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs, trunc);
for (i = 0; i < trunc; i++)
mpn_addsub_n(ii[i*is], ii[i*is], ii[i*is], ii[(n+i)*is], size);
return;
}
// [s0, s1, ..., s{m/2-1}] = Fradix2_inverse[i0, i2, ..., i{m-2}]
IFFT_radix2_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs);
for (i = trunc; i < 2*n; i++)
{
mpn_sub_n(ii[i*is], ii[(i-n)*is], ii[i*is], size);
FFT_twiddle(*t1, ii[i*is], i - n, n, w);
ptr = ii[i*is];
ii[i*is] = *t1;
*t1 = ptr;
}
for (i = trunc - n; i < n; i++)
{
FFT_twiddle(*t1, ii[(i+n)*is], 2*n - i, n, w);
mpn_add_n(ii[i*is], ii[i*is], *t1, size);
}
// [t{m/2}, t{m/2+1}, ..., t{m-1}] = Fradix2_inverse[i1, i3, ..., i{m-1}]
IFFT_radix2_truncate1_twiddle(ii + n*is, is, n/2, 2*w, t1, t2, temp, ws, r + rs, c, 2*rs, trunc - n);
// [r0, r1, ..., r{m/2-1}]
// = [s0+z1^0*t0, s1+z1^1*t1, ..., s{m/2-1}+z1^{m/2-1}*t{m-1}]
// [r{m/2}, t{m/2+1}, ..., r{m-1}]
// = [s0-z1^0*t{m/2}, s1-z1^1*t{m/2+1}, ..., s{m/2-1}-z1^{m/2-1}*t{m-1}]
// where z1 = exp(-2*Pi*I/m), z1 => w bits
for (i = 0; i < trunc - n; i++)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[i*is], ii[(n+i)*is], i, n, w);
ptr = ii[i*is];
ii[i*is] = *t1;
*t1 = ptr;
ptr = ii[(n+i)*is];
ii[(n+i)*is] = *t2;
*t2 = ptr;
}
}
/*
Truncate IFFT to given length. Requires trunc a multiple of 8.
Assumes (conceptually) zeroes from trunc to 2*n.
*/
void IFFT_radix2_truncate(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
IFFT_radix2_new(rr, rs, ii, n, w, t1, t2, temp);
return;
}
if (trunc <= n)
{
// PASS
IFFT_radix2_truncate(rr, rs, ii, n/2, 2*w, t1, t2, temp, trunc);
for (i = 0; i < trunc; i++)
mpn_add_n(ii[i], ii[i], ii[i], size);
return;
}
//PASS
// [s0, s1, ..., s{m/2-1}] = Fradix2_inverse[i0, i2, ..., i{m-2}]
IFFT_radix2_new(ii, 1, ii, n/2, 2*w, t1, t2, temp);
for (i = trunc; i < 2*n; i++)
{
FFT_twiddle(ii[i], ii[i-n], i - n, n, w);
}
// [t{m/2}, t{m/2+1}, ..., t{m-1}] = Fradix2_inverse[i1, i3, ..., i{m-1}]
IFFT_radix2_truncate1(ii+n, 1, ii+n, n/2, 2*w, t1, t2, temp, trunc - n);
// [r0, r1, ..., r{m/2-1}]
// = [s0+z1^0*t0, s1+z1^1*t1, ..., s{m/2-1}+z1^{m/2-1}*t{m-1}]
// [r{m/2}, t{m/2+1}, ..., r{m-1}]
// = [s0-z1^0*t{m/2}, s1-z1^1*t{m/2+1}, ..., s{m/2-1}-z1^{m/2-1}*t{m-1}]
// where z1 = exp(-2*Pi*I/m), z1 => w bits
for (i = 0; i < trunc - n; i++)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = rr[i];
rr[i] = *t1;
*t1 = ptr;
ptr = rr[n+i];
rr[n+i] = *t2;
*t2 = ptr;
}
for (i = trunc - n; i < n; i++)
mpn_add_n(ii[i], ii[i], ii[i], size);
}
void IFFT_radix2_truncate_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs, ulong trunc)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (trunc == 2*n)
{
IFFT_radix2_twiddle(ii, is, n, w, t1, t2, temp, ws, r, c, rs);
return;
}
if (trunc <= n)
{
// PASS
IFFT_radix2_truncate_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs, trunc);
for (i = 0; i < trunc; i++)
mpn_add_n(ii[i*is], ii[i*is], ii[i*is], size);
return;
}
//PASS
// [s0, s1, ..., s{m/2-1}] = Fradix2_inverse[i0, i2, ..., i{m-2}]
IFFT_radix2_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs);
for (i = trunc; i < 2*n; i++)
{
FFT_twiddle(ii[i*is], ii[(i-n)*is], i - n, n, w);
}
// [t{m/2}, t{m/2+1}, ..., t{m-1}] = Fradix2_inverse[i1, i3, ..., i{m-1}]
IFFT_radix2_truncate1_twiddle(ii+n*is, is, n/2, 2*w, t1, t2, temp, ws, r + rs, c, 2*rs, trunc - n);
// [r0, r1, ..., r{m/2-1}]
// = [s0+z1^0*t0, s1+z1^1*t1, ..., s{m/2-1}+z1^{m/2-1}*t{m-1}]
// [r{m/2}, t{m/2+1}, ..., r{m-1}]
// = [s0-z1^0*t{m/2}, s1-z1^1*t{m/2+1}, ..., s{m/2-1}-z1^{m/2-1}*t{m-1}]
// where z1 = exp(-2*Pi*I/m), z1 => w bits
for (i = 0; i < trunc - n; i++)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[i*is], ii[(n+i)*is], i, n, w);
ptr = ii[i*is];
ii[i*is] = *t1;
*t1 = ptr;
ptr = ii[(n+i)*is];
ii[(n+i)*is] = *t2;
*t2 = ptr;
}
for (i = trunc - n; i < n; i++)
mpn_add_n(ii[i*is], ii[i*is], ii[i*is], size);
}
void IFFT_radix2_negacyclic(mp_limb_t ** rr, ulong rs, mp_limb_t ** ii,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
j = 0; k = 0;
for (i = 0; i < n; i += 2)
{
ii[i] = ii[2*i];
temp[k++] = ii[i+1];
ii[i+1] = ii[2*(i+1)];
}
for ( ; i < 2*n; i+=2)
{
ii[i] = temp[j++];
temp[k++] = ii[i+1];
ii[i+1] = temp[j++];
}
// [s0, s1, ..., s{m/2-1}] = Fradix2_inverse[i0, i2, ..., i{m-2}]
IFFT_radix2_new(ii, 1, ii, n/2, 2*w, t1, t2, temp);
// [t{m/2}, t{m/2+1}, ..., t{m-1}] = Fradix2_inverse[i1, i3, ..., i{m-1}]
IFFT_radix2_new(ii+n, 1, ii+n, n/2, 2*w, t1, t2, temp);
// [r0, r1, ..., r{m/2-1}]
// = [s0+z1^0*t0, s1+z1^1*t1, ..., s{m/2-1}+z1^{m/2-1}*t{m-1}]
// [r{m/2}, t{m/2+1}, ..., r{m-1}]
// = [s0-z1^0*t{m/2}, s1-z1^1*t{m/2+1}, ..., s{m/2-1}-z1^{m/2-1}*t{m-1}]
// where z1 = exp(-2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[i], ii[n+i], i, n, w);
ptr = rr[i];
rr[i] = *t1;
*t1 = ptr;
ptr = rr[n+i];
rr[n+i] = *t2;
*t2 = ptr;
// we finally apply twiddle factors corresponding to shifts of w*i bits
// this could partially be combined with the butterflies above
if (FFT_negacyclic_twiddle(*t1, ii[i], 4*n - i, n, w))
{
ptr = ii[i];
ii[i] = *t1;
*t1 = ptr;
}
if (FFT_negacyclic_twiddle(*t1, ii[n + i], 3*n - i, n, w))
{
ptr = ii[n + i];
ii[n + i] = *t1;
*t1 = ptr;
}
}
}
void IFFT_radix2_twiddle(mp_limb_t ** ii, ulong is,
ulong n, ulong w, mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp,
ulong ws, ulong r, ulong c, ulong rs)
{
mp_limb_t ** ss, ** tt;
mp_limb_t * ptr;
ulong i, j, k, l;
ulong size = (w*n)/GMP_LIMB_BITS + 1;
if (n == 1)
{
ulong tw1, tw2;
tw1 = r*c;
tw2 = tw1 + rs*c;
FFT_radix2_twiddle_inverse_butterfly(*t1, *t2, ii[0], ii[is], n*w, tw1*ws, tw2*ws);
ptr = ii[0];
ii[0] = *t1;
*t1 = ptr;
ptr = ii[is];
ii[is] = *t2;
*t2 = ptr;
return;
}
// [s0, s1, ..., s{m/2-1}] = Fradix2_inverse[i0, i2, ..., i{m-2}]
IFFT_radix2_twiddle(ii, is, n/2, 2*w, t1, t2, temp, ws, r, c, 2*rs);
// [t{m/2}, t{m/2+1}, ..., t{m-1}] = Fradix2_inverse[i1, i3, ..., i{m-1}]
IFFT_radix2_twiddle(ii+n*is, is, n/2, 2*w, t1, t2, temp, ws, r + rs, c, 2*rs);
// [r0, r1, ..., r{m/2-1}]
// = [s0+z1^0*t0, s1+z1^1*t1, ..., s{m/2-1}+z1^{m/2-1}*t{m-1}]
// [r{m/2}, t{m/2+1}, ..., r{m-1}]
// = [s0-z1^0*t{m/2}, s1-z1^1*t{m/2+1}, ..., s{m/2-1}-z1^{m/2-1}*t{m-1}]
// where z1 = exp(-2*Pi*I/m), z1 => w bits
for (i = 0; i < n; i++)
{
FFT_radix2_inverse_butterfly(*t1, *t2, ii[i*is], ii[(n+i)*is], i, n, w);
ptr = ii[i*is];
ii[i*is] = *t1;
*t1 = ptr;
ptr = ii[(n+i)*is];
ii[(n+i)*is] = *t2;
*t2 = ptr;
}
}
/*
The matrix Fourier algorithm for a 1D Fourier transform of length m = 2*n,
works as follows:
* Split the coefficients into R rows of C columns (here C = n1, R = m/n1 = n2)
* Perform a length R FFT on each column, i.e. with an input stride of n1
* Multiply each coefficient by z^{r*c} where z = exp(2*Pi*I/m), note z=>w bits
* Perform a length C FFT on each row, i.e. with an input stride of 1
*/
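/*
Worked example: for a transform of length m = 2*n = 16 with n1 = 4 columns
and n2 = 4 rows, coefficient number 4*j + i sits in row j, column i. A
length 4 FFT is applied down each column i (input stride n1 = 4), the
entry in row j of column i is multiplied by z^{i*j}, and finally a length
4 FFT is applied along each row (stride 1).
*/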
void FFT_radix2_mfa(mp_limb_t ** ii, ulong n, ulong w,
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, ulong n1)
{
ulong i, j;
ulong n2 = (2*n)/n1;
ulong depth = 0;
ulong depth2 = 0;
mp_limb_t * ptr;
while ((1UL<<depth) < n2/2) depth++;
while ((1UL<<depth2) < n1/2) depth2++;
// n2 rows, n1 cols
for (i = 0; i < n1; i++)
{
// FFT of length n2 on column i, applying z^{r*i} for rows going up in steps
// of 1 starting at row 0, where z => w bits
FFT_radix2_twiddle(ii + i, n1, n2/2, w*n1, t1, t2, temp, w, 0, i, 1);
for (j = 0; j < n2; j++)
{
ulong s = revbin(j, depth);
if (j < s)
{
ptr = ii[i + j*n1];
ii[i + j*n1] = ii[i + s*n1];
ii[i + s*n1] = ptr;
}
}
}
for (i = 0; i < n2; i++)
{
FFT_radix2_new(ii + i*n1, 1, ii + i*n1, n1/2, w*n2, t1, t2, temp);
for (j = 0; j < n1; j++)
{
ulong s = revbin(j, depth2);
if (j < s)
{
ptr = ii[i*n1 + j];
ii[i*n1 + j] = ii[i*n1 + s];
ii[i*n1 + s] = ptr;
}
}
}
}
void FFT_radix2_mfa_truncate(mp_limb_t ** ii, ulong n, ulong w,
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, ulong n1, ulong trunc)
{
ulong i, j, s;
ulong n2 = (2*n)/n1;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong depth = 0;
ulong depth2 = 0;
mp_limb_t * ptr;
while ((1UL<<depth) < n2/2) depth++;
while ((1UL<<depth2) < n1/2) depth2++;
trunc /= n1;
// n2 rows, n1 cols
for (i = 0; i < n1; i++)
{
// FFT of length n2 on column i, applying z^{r*i} for rows going up in steps
// of 1 starting at row 0, where z => w bits
FFT_radix2_truncate_twiddle(ii + i, n1, n2/2, w*n1, t1, t2, temp, w, 0, i, 1, trunc);
for (j = 0; j < n2; j++)
{
s = revbin(j, depth);
if (j < s)
{
ptr = ii[i + j*n1];
ii[i + j*n1] = ii[i + s*n1];
ii[i + s*n1] = ptr;
}
}
}
for (s = 0; s < trunc; s++)
{
i = revbin(s, depth);
FFT_radix2_new(ii + i*n1, 1, ii + i*n1, n1/2, w*n2, t1, t2, temp);
for (j = 0; j < n1; j++)
{
ulong t = revbin(j, depth2);
if (j < t)
{
ptr = ii[i*n1 + j];
ii[i*n1 + j] = ii[i*n1 + t];
ii[i*n1 + t] = ptr;
}
mpn_normmod_2expp1(ii[i*n1 + j], limbs);
}
}
}
void IFFT_radix2_mfa(mp_limb_t ** ii, ulong n, ulong w,
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, ulong n1)
{
ulong i, j;
ulong n2 = (2*n)/n1;
ulong depth = 0;
ulong depth2 = 0;
mp_limb_t * ptr;
while ((1UL<<depth) < n2/2) depth++;
while ((1UL<<depth2) < n1/2) depth2++;
// n2 rows, n1 cols
for (i = 0; i < n2; i++)
{
for (j = 0; j < n1; j++)
{
ulong s = revbin(j, depth2);
if (j < s)
{
ptr = ii[i*n1 + j];
ii[i*n1 + j] = ii[i*n1 + s];
ii[i*n1 + s] = ptr;
}
}
IFFT_radix2_new(ii + i*n1, 1, ii + i*n1, n1/2, w*n2, t1, t2, temp);
}
for (i = 0; i < n1; i++)
{
for (j = 0; j < n2; j++)
{
ulong s = revbin(j, depth);
if (j < s)
{
ptr = ii[i + j*n1];
ii[i + j*n1] = ii[i + s*n1];
ii[i + s*n1] = ptr;
}
}
// IFFT of length n2 on column i, applying z^{r*i} for rows going up in steps
// of 1 starting at row 0, where z => w bits
IFFT_radix2_twiddle(ii + i, n1, n2/2, w*n1, t1, t2, temp, w, 0, i, 1);
}
}
void IFFT_radix2_mfa_truncate(mp_limb_t ** ii, ulong n, ulong w,
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, ulong n1, ulong trunc)
{
ulong i, j, s;
ulong n2 = (2*n)/n1;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong depth = 0;
ulong depth2 = 0;
mp_limb_t * ptr;
while ((1UL<<depth) < n2/2) depth++;
while ((1UL<<depth2) < n1/2) depth2++;
trunc /= n1;
// n2 rows, n1 cols
for (s = 0; s < trunc; s++)
{
i = revbin(s, depth);
for (j = 0; j < n1; j++)
{
ulong t = revbin(j, depth2);
if (j < t)
{
ptr = ii[i*n1 + j];
ii[i*n1 + j] = ii[i*n1 + t];
ii[i*n1 + t] = ptr;
}
}
IFFT_radix2_new(ii + i*n1, 1, ii + i*n1, n1/2, w*n2, t1, t2, temp);
}
for (i = 0; i < n1; i++)
{
for (j = 0; j < n2; j++)
{
s = revbin(j, depth);
if (j < s)
{
ptr = ii[i + j*n1];
ii[i + j*n1] = ii[i + s*n1];
ii[i + s*n1] = ptr;
}
}
// IFFT of length n2 on column i, applying z^{r*i} for rows going up in steps
// of 1 starting at row 0, where z => w bits
IFFT_radix2_truncate_twiddle(ii + i, n1, n2/2, w*n1, t1, t2, temp, w, 0, i, 1, trunc);
for (j = 0; j < trunc; j++)
mpn_normmod_2expp1(ii[i + j*n1], limbs);
}
}
void FFT_mulmod_2expp1(mp_limb_t * r1, mp_limb_t * i1, mp_limb_t * i2,
ulong r_limbs, ulong depth, ulong w)
{
ulong n = (1UL<<depth);
ulong bits1 = (r_limbs*GMP_LIMB_BITS)/(2*n);
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
jj = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
{
jj[i] = ptr;
}
u1 = ptr;
u2 = u1 + size;
s2 = (mp_limb_t **) u2 + size;
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
j = FFT_split_bits(ii, i1, r_limbs, bits1, limbs);
for ( ; j < 2*n; j++)
MPN_ZERO(ii[j], limbs + 1);
FFT_radix2_negacyclic(ii, 1, ii, n, w, &t1, &t2, s1);
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
j = FFT_split_bits(jj, i2, r_limbs, bits1, limbs);
for ( ; j < 2*n; j++)
MPN_ZERO(jj[j], limbs + 1);
FFT_radix2_negacyclic(jj, 1, jj, n, w, &u1, &u2, s2);
for (j = 0; j < 2*n; j++)
{
mpn_normmod_2expp1(jj[j], limbs);
c = ii[j][limbs] + 2*jj[j][limbs];
ii[j][limbs] = mpn_mulmod_2expp1(ii[j], ii[j], jj[j], c, n*w, tt);
}
IFFT_radix2_negacyclic(ii, 1, ii, n, w, &t1, &t2, s1);
for (j = 0; j < 2*n; j++)
{
mpn_div_2expmod_2expp1(ii[j], ii[j], limbs, depth + 1);
mpn_normmod_2expp1(ii[j], limbs);
}
MPN_ZERO(r1, r_limbs + 1);
FFT_combine_bits(r1, ii, 2*n - 1, bits1, limbs, r_limbs + 1);
// as the negacyclic convolution has effectively done subtractions
// some of the coefficients will be negative, so need to subtract p
// FIXME: the following is not very cache friendly
ulong b = 0;
ulong ll = 0;
mp_limb_t bit;
ulong limb_add = bits1/GMP_LIMB_BITS;
ulong bit_add = bits1 - limb_add*GMP_LIMB_BITS;
for (j = 0; j < 2*n - 1; j++)
{
if (b >= GMP_LIMB_BITS)
{
b -= GMP_LIMB_BITS;
ll++;
}
bit = (1UL<<b);
if (ii[j][limbs] || (mp_limb_signed_t) ii[j][limbs - 1] < 0) // coefficient was -ve
{
mpn_sub_1(r1 + ll, r1 + ll, r_limbs + 1 - ll, bit);
mpn_sub_1(r1 + ll + limbs, r1 + ll + limbs, r_limbs + 1 - limbs - ll, bit);
}
b += bit_add;
ll += limb_add;
}
// final coefficient wraps around
ulong l = (bits1 - 1)/GMP_LIMB_BITS + 1;
ulong shift = l*GMP_LIMB_BITS - bits1;
mp_limb_t cy = 0;
if (ii[2*n - 1][limbs] || (mp_limb_signed_t) ii[2*n - 1][limbs - 1] < 0)
{
if (ii[2*n - 1][limbs]) abort();
cy = mpn_sub_1(ii[2*n - 1], ii[2*n - 1], limbs + 1, 1);
cy += mpn_sub_1(ii[2*n - 1] + limbs, ii[2*n - 1] + limbs, 1, 1);
}
mpn_lshift(ii[2*n - 1], ii[2*n - 1], limbs + 1, shift);
r1[r_limbs] += mpn_add_n(r1 + r_limbs - l, r1 + r_limbs - l, ii[2*n - 1], l);
c = mpn_sub_n(r1, r1, ii[2*n - 1] + l, limbs + 1 - l);
mpn_addmod_2expp1_1(r1 + limbs + 1 - l, r_limbs - limbs - 1 + l, -c + cy);
mpn_normmod_2expp1(r1, r_limbs);
TMP_FREE;
}
/*
FIXME: currently doesn't use negacyclic fft as it is too inefficient.
*/
mp_limb_t new_mpn_mulmod_2expp1(mp_limb_t * r, mp_limb_t * i1, mp_limb_t * i2,
mp_limb_t c, mp_limb_t bits, mp_limb_t * tt)
{
if (1) // currently the negacyclic fft is too inefficient to use.
return mpn_mulmod_2expp1(r, i1, i2, c, bits, tt);
ulong w = 2;
ulong depth = 1;
ulong n = (1UL<<depth);
ulong bits1 = (n*w - depth - 2)/2;
ulong bits2 = 2*n*bits1;
while (bits2 <= bits && bits % (2*n) == 0)
{
depth++;
n = (1UL<<depth);
bits1 = (n*w - depth - 2)/2;
bits2 = 2*n*bits1;
}
depth--;
n = (1UL<<depth);
bits1 = (n*w - depth - 2)/2;
bits2 = 2*n*bits1;
while (bits2 < bits)
{
w += 2;
bits1 = (n*w - depth - 2)/2;
bits2 = 2*n*bits1;
}
//printf("bits = %ld, depth = %ld, n = %ld, w = %ld\n", bits, depth, 1UL<<depth, w);
FFT_mulmod_2expp1(r, i1, i2, bits/GMP_LIMB_BITS, depth, w);
return 0;
}
/*
The main integer multiplication routine. Multiplies i1 of n1 limbs by i2 of
n2 limbs and puts the result in r1, which must have space for n1 + n2 limbs.
Currently there is no sqrt2 trick, thus a convolution of depth d gives a
convolution length 2*n where n = 2^d with coefficients of size n*w, where
w = w2^2.
FIXME: there is no reason for w to be a square, nor for us to pass in w2.
Just pass in w.
Note the *output* polynomial must fit in that convolution size.
The computation, bits1 = (n*w - depth)/2, gives the maximum size of input
coefficients for a given n and w if you want the output to be meaningful.
The function automatically truncates the FFT, pointwise mults and IFFT,
thus the inputs can be different lengths. The function will just segfault
if n and w2 are not sufficiently large. Except for the smallest multiplications
(where w2 should be 2), the value of w2 should probably always just be 1.
*/
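/*
Illustrative sketch (the helper name is an assumption of this example and
not part of the library): a check, using the same formulas as new_mpn_mul
below, that a choice of depth and w2 gives a large enough convolution for
inputs of n1 and n2 limbs.
*/
static int example_new_mpn_mul_params_ok(ulong n1, ulong n2, ulong depth, ulong w2)
{
   ulong n = (1UL<<depth);
   ulong w = w2*w2;
   ulong bits1 = (n*w - depth)/2;
   ulong sqt = (1UL<<(depth/2));
   ulong j1 = (n1*GMP_LIMB_BITS - 1)/bits1 + 1;
   ulong j2 = (n2*GMP_LIMB_BITS - 1)/bits1 + 1;
   ulong trunc = ((j1 + j2 - 2 + 2*sqt)/(2*sqt))*2*sqt;

   if ((n*w) % GMP_LIMB_BITS != 0) return 0; /* coefficients must be a whole number of limbs */
   if (trunc > 2*n) return 0; /* output polynomial must fit in the length 2*n convolution */
   return 1;
}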
void new_mpn_mul(mp_limb_t * r1, mp_limb_t * i1, ulong n1, mp_limb_t * i2, ulong n2,
ulong depth, ulong w2)
{
ulong n = (1UL<<depth);
ulong w = w2*w2;
ulong bits1 = (n*w - depth)/2;
ulong sqrt = (1UL<<(depth/2));
ulong r_limbs = n1 + n2;
ulong j1 = (n1*GMP_LIMB_BITS - 1)/bits1 + 1;
ulong j2 = (n2*GMP_LIMB_BITS - 1)/bits1 + 1;
ulong trunc = ((j1 + j2 - 2 + 2*sqrt)/(2*sqrt)) * 2*sqrt;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s, t, u;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
jj = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
{
jj[i] = ptr;
}
u1 = ptr;
u2 = u1 + size;
s2 = (mp_limb_t **) u2 + size;
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
j = FFT_split_bits(ii, i1, n1, bits1, limbs);
for ( ; j < trunc; j++)
MPN_ZERO(ii[j], limbs + 1);
FFT_radix2_mfa_truncate(ii, n, w, &t1, &t2, s1, sqrt, trunc);
j = FFT_split_bits(jj, i2, n2, bits1, limbs);
for ( ; j < trunc; j++)
MPN_ZERO(jj[j], limbs + 1);
FFT_radix2_mfa_truncate(jj, n, w, &u1, &u2, s2, sqrt, trunc);
for (s = 0; s < trunc/sqrt; s++)
{
u = revbin(s, (depth + 1)/2)*sqrt;
for (t = 0; t < sqrt; t++)
{
j = u + t;
c = ii[j][limbs] + 2*jj[j][limbs];
ii[j][limbs] = new_mpn_mulmod_2expp1(ii[j], ii[j], jj[j], c, n*w, tt);
}
}
IFFT_radix2_mfa_truncate(ii, n, w, &t1, &t2, s1, sqrt, trunc);
for (j = 0; j < trunc; j++)
{
mpn_div_2expmod_2expp1(ii[j], ii[j], limbs, depth + 1);
mpn_normmod_2expp1(ii[j], limbs);
}
MPN_ZERO(r1, r_limbs);
FFT_combine_bits(r1, ii, j1 + j2 - 1, bits1, limbs, r_limbs);
TMP_FREE;
}
/************************************************************************************
Test code
************************************************************************************/
void mpn_to_mpz(mpz_t m, mp_limb_t * i, ulong limbs)
{
mp_limb_signed_t hi;
mpz_realloc(m, limbs + 1);
MPN_COPY(m->_mp_d, i, limbs + 1);
hi = i[limbs];
if (hi < 0L)
{
mpn_neg_n(m->_mp_d, m->_mp_d, limbs + 1);
m->_mp_size = limbs + 1;
while ((m->_mp_size) && (!m->_mp_d[m->_mp_size - 1]))
m->_mp_size--;
m->_mp_size = -m->_mp_size;
} else
{
m->_mp_size = limbs + 1;
while ((m->_mp_size) && (!m->_mp_d[m->_mp_size - 1]))
m->_mp_size--;
}
}
void ref_norm(mpz_t m, mpz_t p)
{
mpz_mod(m, m, p);
}
void ref_submod_i(mpz_t m, mpz_t i1, mpz_t i2, mpz_t p, ulong n, ulong w)
{
mpz_sub(m, i1, i2);
mpz_mul_2exp(m, m, (n*w)/2);
mpz_mod(m, m, p);
}
void ref_mul_2expmod(mpz_t m, mpz_t i2, mpz_t p, ulong n, ulong w, ulong d)
{
mpz_mul_2exp(m, i2, d);
mpz_mod(m, m, p);
}
void ref_div_2expmod(mpz_t m, mpz_t i2, mpz_t p, ulong n, ulong w, ulong d)
{
mpz_t temp;
mpz_init(temp);
mpz_set_ui(temp, 1);
mpz_mul_2exp(temp, temp, d);
mpz_invert(temp, temp, p);
mpz_mul(m, i2, temp);
mpz_mod(m, m, p);
mpz_clear(temp);
}
void ref_lshB_sumdiffmod(mpz_t t, mpz_t u, mpz_t i1,
mpz_t i2, mpz_t p, ulong n, ulong w, ulong x, ulong y)
{
mpz_add(t, i1, i2);
mpz_sub(u, i1, i2);
mpz_mul_2exp(t, t, x*GMP_LIMB_BITS);
mpz_mul_2exp(u, u, y*GMP_LIMB_BITS);
mpz_mod(t, t, p);
mpz_mod(u, u, p);
}
void ref_sumdiff_rshBmod(mpz_t t, mpz_t u, mpz_t i1,
mpz_t i2, mpz_t p, ulong n, ulong w, ulong x, ulong y)
{
mpz_t mult1, mult2;
mpz_init(mult1);
mpz_init(mult2);
mpz_set_ui(mult1, 1);
mpz_mul_2exp(mult1, mult1, x*GMP_LIMB_BITS);
mpz_invert(mult1, mult1, p);
mpz_set_ui(mult2, 1);
mpz_mul_2exp(mult2, mult2, y*GMP_LIMB_BITS);
mpz_invert(mult2, mult2, p);
mpz_mul(mult1, mult1, i1);
mpz_mul(mult2, mult2, i2);
mpz_add(t, mult1, mult2);
mpz_sub(u, mult1, mult2);
mpz_mod(t, t, p);
mpz_mod(u, u, p);
mpz_clear(mult1);
mpz_clear(mult2);
}
/* Reference split radix butterfly: (s, t) <- ((s + t)*2^(i*w), (s - t)*2^(3*i*w)) mod p. */
void ref_FFT_split_radix_butterfly(mpz_t s, mpz_t t, mpz_t p, ulong i, ulong n, ulong w)
{
mpz_t temp;
mpz_init(temp);
mpz_add(temp, s, t);
mpz_sub(t, s, t);
mpz_set(s, temp);
mpz_mul_2exp(s, s, i*w);
mpz_mul_2exp(t, t, 3*i*w);
mpz_mod(s, s, p);
mpz_mod(t, t, p);
mpz_clear(temp);
}
/* Set p = 2^(n*w) + 1, the modulus used throughout the tests. */
void set_p(mpz_t p, ulong n, ulong w)
{
mpz_set_ui(p, 1);
mpz_mul_2exp(p, p, n*w);
mpz_add_ui(p, p, 1);
}
/* Generate a random residue: limbs random limbs plus a small random signed value in the overflow limb. */
void rand_n(mp_limb_t * n, gmp_randstate_t state, ulong limbs)
{
mpn_rrandom(n, state, limbs);
n[limbs] = gmp_urandomm_ui(state, 10);
if (gmp_urandomm_ui(state, 2)) n[limbs] = -n[limbs];
}
void test_norm()
{
ulong i, j, k, l, n, w, limbs;
mpz_t p, m, m2;
mpz_init(p);
mpz_init(m);
mpz_init(m2);
mp_limb_t * nn;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = GMP_LIMB_BITS; i < 32*GMP_LIMB_BITS; i += GMP_LIMB_BITS)
{
for (j = 1; j < 32; j++)
{
for (k = 1; k <= GMP_NUMB_BITS; k <<= 1)
{
n = i/k;
w = j*k;
limbs = (n*w)/GMP_LIMB_BITS;
TMP_MARK;
nn = TMP_BALLOC_LIMBS(limbs + 1);
mpn_rrandom(nn, state, limbs + 1);
mpn_to_mpz(m, nn, limbs);
set_p(p, n, w);
mpn_normmod_2expp1(nn, limbs);
mpn_to_mpz(m2, nn, limbs);
ref_norm(m, p);
if (mpz_cmp(m, m2) != 0)
{
printf("mpn_normmod_2expp1 error\n");
gmp_printf("want %Zx\n\n", m);
gmp_printf("got %Zx\n", m2);
abort();
}
TMP_FREE;
}
}
}
mpz_clear(p);
mpz_clear(m);
mpz_clear(m2);
gmp_randclear(state);
}
void test_submod_i()
{
ulong i, j, k, l, n, w, limbs;
mpz_t p, m, m2, mn1, mn2;
mpz_init(p);
mpz_init(m);
mpz_init(m2);
mpz_init(mn1);
mpz_init(mn2);
mp_limb_t * nn1, * nn2, * r;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = 2*GMP_LIMB_BITS; i < 64*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 32; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
n = i/k;
w = j*k;
limbs = (n*w)/GMP_LIMB_BITS;
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
nn2 = TMP_BALLOC_LIMBS(limbs + 1);
r = TMP_BALLOC_LIMBS(limbs + 1);
rand_n(nn1, state, limbs);
rand_n(nn2, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
mpn_to_mpz(mn2, nn2, limbs);
set_p(p, n, w);
mpn_submod_i_2expp1(r, nn1, nn2, limbs);
mpn_to_mpz(m2, r, limbs);
ref_norm(m2, p);
ref_submod_i(m, mn1, mn2, p, n, w);
if (mpz_cmp(m, m2) != 0)
{
printf("mpn_submmod_i_2expp1 error\n");
gmp_printf("want %Zx\n\n", m);
gmp_printf("got %Zx\n", m2);
abort();
}
TMP_FREE;
}
}
}
mpz_clear(p);
mpz_clear(m);
mpz_clear(m2);
mpz_clear(mn1);
mpz_clear(mn2);
gmp_randclear(state);
}
void test_mul_2expmod()
{
ulong i, j, k, l, n, w, limbs, d;
mpz_t p, m, m2, mn1, mn2;
mpz_init(p);
mpz_init(m);
mpz_init(m2);
mpz_init(mn1);
mp_limb_t * nn1, * r;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = 2*GMP_LIMB_BITS; i < 64*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 32; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
for (d = 0; d < GMP_LIMB_BITS; d++)
{
n = i/k;
w = j*k;
limbs = (n*w)/GMP_LIMB_BITS;
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
r = TMP_BALLOC_LIMBS(limbs + 1);
rand_n(nn1, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
set_p(p, n, w);
mpn_mul_2expmod_2expp1(r, nn1, limbs, d);
mpn_to_mpz(m2, r, limbs);
ref_norm(m2, p);
ref_mul_2expmod(m, mn1, p, n, w, d);
if (mpz_cmp(m, m2) != 0)
{
printf("mpn_mul_2expmod_2expp1 error\n");
gmp_printf("want %Zx\n\n", m);
gmp_printf("got %Zx\n", m2);
abort();
}
TMP_FREE;
}
}
}
}
mpz_clear(p);
mpz_clear(m);
mpz_clear(m2);
mpz_clear(mn1);
gmp_randclear(state);
}
void test_FFT_negacyclic_twiddle()
{
ulong i, j, k, l, n, w, limbs, d;
mpz_t p, m, m2, mn1, mn2;
mpz_init(p);
mpz_init(m);
mpz_init(m2);
mpz_init(mn1);
mp_limb_t * nn1, * r;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = 2*GMP_LIMB_BITS; i < 20*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 10; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
n = i/k;
w = 2*j*k;
for (d = 0; d < 2*n; d++)
{
limbs = (n*w)/GMP_LIMB_BITS;
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
r = TMP_BALLOC_LIMBS(limbs + 1);
rand_n(nn1, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
set_p(p, n, w);
if (!FFT_negacyclic_twiddle(r, nn1, d, n, w))
MPN_COPY(r, nn1, limbs + 1);
mpn_to_mpz(m2, r, limbs);
ref_norm(m2, p);
ref_mul_2expmod(m, mn1, p, n, w, d*w/2);
if (mpz_cmp(m, m2) != 0)
{
printf("FFT_negacyclic_twiddle error\n");
gmp_printf("want %Zx\n\n", m);
gmp_printf("got %Zx\n", m2);
abort();
}
TMP_FREE;
}
}
}
}
for (i = 2*GMP_LIMB_BITS; i < 20*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 10; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
n = i/k;
w = 2*j*k;
for (d = 0; d < 2*n; d++)
{
limbs = (n*w)/GMP_LIMB_BITS;
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
r = TMP_BALLOC_LIMBS(limbs + 1);
rand_n(nn1, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
set_p(p, n, w);
if (!FFT_negacyclic_twiddle(r, nn1, 4*n - d, n, w))
MPN_COPY(r, nn1, limbs + 1);
mpn_to_mpz(m2, r, limbs);
ref_norm(m2, p);
ref_div_2expmod(m, mn1, p, n, w, d*w/2);
if (mpz_cmp(m, m2) != 0)
{
printf("FFT_negacyclic_twiddle error\n");
gmp_printf("want %Zx\n\n", m);
gmp_printf("got %Zx\n", m2);
abort();
}
TMP_FREE;
}
}
}
}
mpz_clear(p);
mpz_clear(m);
mpz_clear(m2);
mpz_clear(mn1);
gmp_randclear(state);
}
void test_div_2expmod()
{
ulong i, j, k, l, n, w, limbs, d;
mpz_t p, m, m2, mn1, mn2;
mpz_init(p);
mpz_init(m);
mpz_init(m2);
mpz_init(mn1);
mp_limb_t * nn1, * r;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = 2*GMP_LIMB_BITS; i < 64*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 32; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
for (d = 0; d < GMP_LIMB_BITS; d++)
{
n = i/k;
w = j*k;
limbs = (n*w)/GMP_LIMB_BITS;
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
r = TMP_BALLOC_LIMBS(limbs + 1);
rand_n(nn1, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
set_p(p, n, w);
mpn_div_2expmod_2expp1(r, nn1, limbs, d);
mpn_to_mpz(m2, r, limbs);
ref_norm(m2, p);
ref_norm(mn1, p);
ref_mul_2expmod(m, m2, p, n, w, d);
if (mpz_cmp(m, mn1) != 0)
{
printf("mpn_div_2expmod_2expp1 error\n");
gmp_printf("want %Zx\n\n", mn1);
gmp_printf("got %Zx\n", m);
abort();
}
TMP_FREE;
}
}
}
}
mpz_clear(p);
mpz_clear(m);
mpz_clear(m2);
mpz_clear(mn1);
gmp_randclear(state);
}
void test_lshB_sumdiffmod()
{
ulong c, i, j, k, l, x, y, n, w, limbs;
mpz_t p, ma, mb, m2a, m2b, mn1, mn2;
mpz_init(p);
mpz_init(ma);
mpz_init(mb);
mpz_init(m2a);
mpz_init(m2b);
mpz_init(mn1);
mpz_init(mn2);
mp_limb_t * nn1, * nn2, * r1, * r2;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = 2*GMP_LIMB_BITS; i < 20*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 10; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
n = i/k;
w = j*k;
limbs = (n*w)/GMP_LIMB_BITS;
for (c = 0; c < limbs; c++)
{
x = gmp_urandomm_ui(state, limbs + 1);
y = gmp_urandomm_ui(state, limbs + 1);
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
nn2 = TMP_BALLOC_LIMBS(limbs + 1);
r1 = TMP_BALLOC_LIMBS(limbs + 1);
r2 = TMP_BALLOC_LIMBS(limbs + 1);
rand_n(nn1, state, limbs);
rand_n(nn2, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
mpn_to_mpz(mn2, nn2, limbs);
set_p(p, n, w);
mpn_lshB_sumdiffmod_2expp1(r1, r2, nn1, nn2, limbs, x, y);
mpn_to_mpz(m2a, r1, limbs);
mpn_to_mpz(m2b, r2, limbs);
ref_norm(m2a, p);
ref_norm(m2b, p);
ref_lshB_sumdiffmod(ma, mb, mn1, mn2, p, n, w, x, y);
if (mpz_cmp(ma, m2a) != 0)
{
printf("mpn_lshB_sumdiffmod_2expp1 error a\n");
printf("x = %ld, y = %ld\n", x, y);
gmp_printf("want %Zx\n\n", ma);
gmp_printf("got %Zx\n", m2a);
abort();
}
if (mpz_cmp(mb, m2b) != 0)
{
printf("mpn_lshB_sumdiffmod_2expp1 error b\n");
printf("x = %ld, y = %ld\n", x, y);
gmp_printf("want %Zx\n\n", mb);
gmp_printf("got %Zx\n", m2b);
abort();
}
TMP_FREE;
}
}
}
}
mpz_clear(p);
mpz_clear(ma);
mpz_clear(mb);
mpz_clear(m2a);
mpz_clear(m2b);
mpz_clear(mn1);
mpz_clear(mn2);
gmp_randclear(state);
}
void test_sumdiff_rshBmod()
{
ulong c, i, j, k, l, x, y, n, w, limbs;
mpz_t p, ma, mb, m2a, m2b, mn1, mn2;
mpz_init(p);
mpz_init(ma);
mpz_init(mb);
mpz_init(m2a);
mpz_init(m2b);
mpz_init(mn1);
mpz_init(mn2);
mp_limb_t * nn1, * nn2, * r1, * r2;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = 2*GMP_LIMB_BITS; i < 20*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 10; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
n = i/k;
w = j*k;
limbs = (n*w)/GMP_LIMB_BITS;
for (c = 0; c < limbs; c++)
{
x = gmp_urandomm_ui(state, limbs);
y = gmp_urandomm_ui(state, limbs);
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
nn2 = TMP_BALLOC_LIMBS(limbs + 1);
r1 = TMP_BALLOC_LIMBS(limbs + 1);
r2 = TMP_BALLOC_LIMBS(limbs + 1);
rand_n(nn1, state, limbs);
rand_n(nn2, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
mpn_to_mpz(mn2, nn2, limbs);
set_p(p, n, w);
mpn_sumdiff_rshBmod_2expp1(r1, r2, nn1, nn2, limbs, x, y);
mpn_to_mpz(m2a, r1, limbs);
mpn_to_mpz(m2b, r2, limbs);
ref_norm(m2a, p);
ref_norm(m2b, p);
ref_sumdiff_rshBmod(ma, mb, mn1, mn2, p, n, w, x, y);
if (mpz_cmp(ma, m2a) != 0)
{
printf("mpn_sumdiff_rshBmod_2expp1 error a\n");
printf("x = %ld, y = %ld, limbs = %ld\n", x, y, limbs);
gmp_printf("want %Zx\n\n", ma);
gmp_printf("got %Zx\n", m2a);
abort();
}
if (mpz_cmp(mb, m2b) != 0)
{
printf("mpn_sumdiff_rshBmod_2expp1 error b\n");
printf("x = %ld, y = %ld, limbs = %ld\n", x, y, limbs);
gmp_printf("want %Zx\n\n", mb);
gmp_printf("got %Zx\n", m2b);
abort();
}
TMP_FREE;
}
}
}
}
mpz_clear(p);
mpz_clear(ma);
mpz_clear(mb);
mpz_clear(m2a);
mpz_clear(m2b);
mpz_clear(mn1);
mpz_clear(mn2);
gmp_randclear(state);
}
void test_FFT_split_radix_butterfly()
{
ulong i, j, k, x, n, w, limbs;
mpz_t p, ma, mb, m2a, m2b, mn1, mn2;
mpz_init(p);
mpz_init(ma);
mpz_init(mb);
mpz_init(m2a);
mpz_init(m2b);
mpz_init(mn1);
mpz_init(mn2);
mp_limb_t * nn1, * nn2, * temp;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (i = 2*GMP_LIMB_BITS; i < 20*GMP_LIMB_BITS; i += 2*GMP_LIMB_BITS)
{
for (j = 1; j < 10; j++)
{
for (k = 1; k <= 2*GMP_NUMB_BITS; k <<= 1)
{
n = i/k;
w = j*k;
limbs = (n*w)/GMP_LIMB_BITS;
for (x = 0; x < n + 1; x++)
{
TMP_MARK;
nn1 = TMP_BALLOC_LIMBS(limbs + 1);
nn2 = TMP_BALLOC_LIMBS(limbs + 1);
temp = TMP_BALLOC_LIMBS(2*(limbs + 1));
rand_n(nn1, state, limbs);
rand_n(nn2, state, limbs);
mpn_to_mpz(mn1, nn1, limbs);
mpn_to_mpz(mn2, nn2, limbs);
set_p(p, n, w);
FFT_split_radix_butterfly(nn1, nn2, x, n, w, temp);
mpn_to_mpz(m2a, nn1, limbs);
mpn_to_mpz(m2b, nn2, limbs);
ref_norm(m2a, p);
ref_norm(m2b, p);
ref_FFT_split_radix_butterfly(mn1, mn2, p, x, n, w);
if (mpz_cmp(m2a, mn1) != 0)
{
printf("FFT_split_radix_butterfly error a\n");
printf("x = %ld\n", x);
gmp_printf("want %Zx\n\n", mn1);
gmp_printf("got %Zx\n", m2a);
abort();
}
if (mpz_cmp(m2b, mn2) != 0)
{
printf("FFT_split_radix_butterfly error b\n");
printf("x = %ld\n", x);
gmp_printf("want %Zx\n\n", mn2);
gmp_printf("got %Zx\n", m2b);
abort();
}
TMP_FREE;
}
}
}
}
mpz_clear(p);
mpz_clear(ma);
mpz_clear(mb);
mpz_clear(m2a);
mpz_clear(m2b);
mpz_clear(mn1);
mpz_clear(mn2);
gmp_randclear(state);
}
void test_mul()
{
ulong depth = 13UL;
ulong w2 = 3UL;
ulong iters = 1;
ulong n = (1UL<<depth);
ulong w = w2*w2;
ulong bits1 = (n*w - depth)/2;
ulong bits = n*bits1;
ulong int_limbs = (bits - 1UL)/GMP_LIMB_BITS + 1;
ulong i, j;
mp_limb_t *i1, *i2, *r1, *r2;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
i1 = TMP_BALLOC_LIMBS(6*int_limbs);
i2 = i1 + int_limbs;
r1 = i2 + int_limbs;
r2 = r1 + 2*int_limbs;
for (i = 0; i < iters; i++)
{
mpn_urandomb(i1, state, bits);
mpn_urandomb(i2, state, bits);
mpn_mul_n(r2, i1, i2, int_limbs);
new_mpn_mul(r1, i1, int_limbs, i2, int_limbs, depth, w2);
for (j = 0; j < 2*int_limbs; j++)
{
if (r1[j] != r2[j])
{
printf("error in limb %ld, %lx != %lx\n", j, r1[j], r2[j]);
abort();
}
}
}
TMP_FREE;
gmp_randclear(state);
}
void time_mul_with_negacyclic()
{
ulong depth = 17UL;
ulong w2 = 1UL;
ulong iters = 1;
ulong n = (1UL<<depth);
ulong w = w2*w2;
ulong bits1 = (n*w - depth)/2;
ulong bits = n*bits1;
ulong int_limbs = (bits - 1UL)/GMP_LIMB_BITS + 1;
ulong i, j;
mp_limb_t *i1, *i2, *r1;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
i1 = TMP_BALLOC_LIMBS(4*int_limbs);
i2 = i1 + int_limbs;
r1 = i2 + int_limbs;
mpn_urandomb(i1, state, bits);
mpn_urandomb(i2, state, bits);
for (i = 0; i < iters; i++)
{
new_mpn_mul(r1, i1, int_limbs, i2, int_limbs, depth, w2);
}
TMP_FREE;
gmp_randclear(state);
}
void test_mulmod()
{
ulong depth = 7UL; //should be at least 5 (or 4 on 32 bit machine)
ulong w = 2UL; // should be even
ulong iters = 1000;
ulong n = (1UL<<depth);
ulong bits1 = (n*w - depth - 2)/2; // the output has 2*n = 2^(depth + 1) coefficients, and the
// negacyclic convolution both adds and subtracts coefficient
// products, so depth + 1 bits of headroom plus a sign bit are reserved
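/*
   An illustrative check of the headroom bound above (a worked instance of the
   parameters used by this test, not executed by it): with depth = 7 and w = 2
   we work modulo p = 2^(n*w) + 1 = 2^256 + 1 on 2*n = 256 coefficients. Each
   output coefficient of the negacyclic convolution is a signed sum of 256
   products of two bits1-bit inputs, and bits1 = (256 - 7 - 2)/2 = 123, so its
   magnitude is below 2^(2*123 + 8) = 2^254, which fits in the 256-bit residue
   with the sign bit to spare.
*/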
ulong bits = 2*n*bits1;
ulong int_limbs = bits/GMP_LIMB_BITS;
ulong i, j;
mp_limb_t *i1, *i2, *r1, *r2, *tt;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
i1 = TMP_BALLOC_LIMBS(6*(int_limbs+1));
i2 = i1 + int_limbs + 1;
r1 = i2 + int_limbs + 1;
r2 = r1 + int_limbs + 1;
tt = r2 + int_limbs + 1;
for (i = 0; i < iters; i++)
{
mpn_rrandom(i1, state, int_limbs);
i1[int_limbs] = CNST_LIMB(0);
mpn_rrandom(i2, state, int_limbs);
i2[int_limbs] = CNST_LIMB(0);
mpn_mulmod_2expp1(r2, i1, i2, 0, bits, tt);
FFT_mulmod_2expp1(r1, i1, i2, int_limbs, depth, w);
ulong wrong = 0;
for (j = 0; j < int_limbs; j++)
{
if (r1[j] != r2[j])
{
if (wrong < 10)
printf("error in limb %ld, %lx != %lx\n", j, r1[j], r2[j]);
wrong++;
}
}
if (wrong) printf("%ld limbs wrong\n", wrong);
}
TMP_FREE;
gmp_randclear(state);
}
void test_fft_ifft()
{
ulong depth = 10UL;
ulong n = (1UL<<depth);
ulong w = 1;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, ** kk, *tt, *t1, *t2, *u1, *u2, *v1, *v2, **s1, **s2, **s3;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
rand_n(ii[i], state, limbs);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
jj = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
{
jj[i] = ptr;
MPN_COPY(jj[i], ii[i], limbs + 1);
}
u1 = ptr;
u2 = u1 + size;
s2 = (mp_limb_t **) u2 + size;
kk = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) kk + 2*n; i < 2*n; i++, ptr += size)
{
kk[i] = ptr;
}
v1 = ptr;
v2 = v1 + size;
s3 = (mp_limb_t **) v2 + size;
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < 1; i++)
{
//FFT_split_radix(ii, 1, ii, n, w, &t1, &t2, s1);
FFT_radix2_new(ii, 1, ii, n, w, &t1, &t2, s1);
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (j = 0; j < 2*n; j++)
{
s = revbin(j, depth);
MPN_COPY(kk[j], ii[j], limbs + 1);
}
IFFT_radix2_new(kk, 1, kk, n, w, &v1, &v2, s3);
for (j = 0; j < 2*n; j++)
{
mpn_mul_2expmod_2expp1(jj[j], jj[j], limbs, depth + 1);
mpn_normmod_2expp1(jj[j], limbs);
mpn_normmod_2expp1(kk[j], limbs);
}
for (j = 0; j < 2*n; j++)
{
if (mpn_cmp(kk[j], jj[j], limbs + 1) != 0)
{
printf("Error in entry %ld\n", j);
abort();
}
}
}
TMP_FREE;
gmp_randclear(state);
}
void test_fft_ifft_truncate()
{
ulong depth = 10UL;
ulong n = (1UL<<depth);
ulong w = 1;
ulong iter = 1000;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s, count;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, ** kk, *tt, *t1, *t2, *u1, *u2, *v1, *v2, **s1, **s2, **s3;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*n + 2*n*size + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
rand_n(ii[i], state, limbs);
}
t1 = (mp_limb_t *) ii + 2*n + 2*n*size;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
jj = (mp_limb_t **) TMP_BALLOC_LIMBS(2*n + 2*n*size + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
{
jj[i] = ptr;
MPN_COPY(jj[i], ii[i], limbs + 1);
}
u1 = (mp_limb_t *) jj + 2*n + 2*n*size;
u2 = u1 + size;
s2 = (mp_limb_t **) u2 + size;
kk = (mp_limb_t **) TMP_BALLOC_LIMBS(2*n + 2*n*size + 2*n + 2*size);
for (i = 0, j = 0, ptr = (mp_limb_t *) kk + 2*n; i < 2*n; i++, ptr += size)
{
kk[i] = ptr;
}
v1 = (mp_limb_t *) kk + 2*n + 2*n*size;
v2 = v1 + size;
s3 = (mp_limb_t **) v2 + size;
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (count = 0; count < iter; count++)
{
for (i = 0; i < 2*n; i++)
{
rand_n(ii[i], state, limbs);
}
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
{
MPN_COPY(jj[i], ii[i], limbs + 1);
}
ulong trunc = gmp_urandomm_ui(state, 2*n) + 1;
trunc = ((trunc + 1)/2)*2;
FFT_radix2_truncate(ii, 1, ii, n, w, &t1, &t2, s1, trunc);
for (j = 0; j < trunc; j++)
{
mpn_normmod_2expp1(ii[j], limbs);
MPN_COPY(kk[j], ii[j], limbs + 1);
}
IFFT_radix2_truncate(kk, 1, kk, n, w, &v1, &v2, s3, trunc);
for (j = 0; j < trunc; j++)
{
mpn_mul_2expmod_2expp1(jj[j], jj[j], limbs, depth + 1);
mpn_normmod_2expp1(jj[j], limbs);
mpn_normmod_2expp1(kk[j], limbs);
if (mpn_cmp(kk[j], jj[j], limbs + 1) != 0)
{
gmp_printf("Error in entry %ld, %Nx != %Nx\n", j, kk[j], limbs + 1, jj[j], limbs + 1);
abort();
}
}
}
TMP_FREE;
gmp_randclear(state);
}
void test_fft_ifft_mfa()
{
ulong depth = 12UL;
ulong n = (1UL<<depth);
ulong w = 1;
ulong sqrt = (1UL<<(depth/2));
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, ** kk, *tt, *t1, *t2, *u1, *u2, *v1, *v2, **s1, **s2, **s3;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
rand_n(ii[i], state, limbs);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
jj = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
{
jj[i] = ptr;
MPN_COPY(jj[i], ii[i], limbs + 1);
}
u1 = ptr;
u2 = u1 + size;
s2 = (mp_limb_t **) u2 + size;
kk = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) kk + 2*n; i < 2*n; i++, ptr += size)
{
kk[i] = ptr;
}
v1 = ptr;
v2 = v1 + size;
s3 = (mp_limb_t **) v2 + size;
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < 1; i++)
{
FFT_radix2_mfa(ii, n, w, &t1, &t2, s1, sqrt);
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (j = 0; j < 2*n; j++)
{
MPN_COPY(kk[j], ii[j], limbs + 1);
}
IFFT_radix2_mfa(kk, n, w, &v1, &v2, s3, sqrt);
for (j = 0; j < 2*n; j++)
{
mpn_mul_2expmod_2expp1(jj[j], jj[j], limbs, depth + 1);
mpn_normmod_2expp1(jj[j], limbs);
mpn_normmod_2expp1(kk[j], limbs);
}
for (j = 0; j < 2*n; j++)
{
if (mpn_cmp(kk[j], jj[j], limbs + 1) != 0)
{
gmp_printf("Error in entry %ld, %Nx != %Nx\n", j, kk[j], limbs + 1, jj[j], limbs + 1);
abort();
}
}
}
TMP_FREE;
gmp_randclear(state);
}
void test_fft_ifft_mfa_truncate()
{
ulong depth = 12UL;
ulong n = (1UL<<depth);
ulong w = 1;
ulong iters = 100;
ulong sqrt = (1UL<<(depth/2));
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s, count, trunc;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, ** kk, *tt, *t1, *t2, *u1, *u2, *v1, *v2, **s1, **s2, **s3;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
rand_n(ii[i], state, limbs);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
jj = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
{
jj[i] = ptr;
MPN_COPY(jj[i], ii[i], limbs + 1);
}
u1 = ptr;
u2 = u1 + size;
s2 = (mp_limb_t **) u2 + size;
kk = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) kk + 2*n; i < 2*n; i++, ptr += size)
{
kk[i] = ptr;
}
v1 = ptr;
v2 = v1 + size;
s3 = (mp_limb_t **) v2 + size;
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (count = 0; count < iters; count++)
{
ulong trunc = (gmp_urandomm_ui(state, (n/sqrt)/2) + 1)*sqrt*2;
for (i = 0; i < 2*n; i++)
{
rand_n(ii[i], state, limbs);
}
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
{
MPN_COPY(jj[i], ii[i], limbs + 1);
}
FFT_radix2_mfa_truncate(ii, n, w, &t1, &t2, s1, sqrt, trunc);
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (j = 0; j < 2*n; j++)
{
MPN_COPY(kk[j], ii[j], limbs + 1);
}
IFFT_radix2_mfa_truncate(kk, n, w, &v1, &v2, s3, sqrt, trunc);
for (j = 0; j < 2*n; j++)
{
mpn_mul_2expmod_2expp1(jj[j], jj[j], limbs, depth + 1);
mpn_normmod_2expp1(jj[j], limbs);
mpn_normmod_2expp1(kk[j], limbs);
}
for (j = 0; j < trunc; j++)
{
if (mpn_cmp(kk[j], jj[j], limbs + 1) != 0)
{
gmp_printf("Error in entry %ld, %Nx != %Nx\n", j, kk[j], limbs + 1, jj[j], limbs + 1);
abort();
}
}
}
TMP_FREE;
gmp_randclear(state);
}
void test_fft_truncate()
{
ulong depth = 10UL;
ulong n = (1UL<<depth);
ulong w = 1;
ulong iter = 1000;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s, count;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, ** kk, *tt, *t1, *t2, *u1, *u2, *v1, *v2, **s1, **s2, **s3;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
for (count = 0; count < iter; count++)
{
ulong trunc = gmp_urandomm_ui(state, 2*n) + 1;
trunc = ((trunc + 7)/8)*8;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*n + 2*n*size + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < trunc) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = (mp_limb_t *) ii + 2*n + 2*n*size;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
jj = (mp_limb_t **) TMP_BALLOC_LIMBS(2*n + 2*n*size + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
{
jj[i] = ptr;
MPN_COPY(jj[i], ii[i], limbs + 1);
}
u1 = (mp_limb_t *) jj + 2*n + 2*n*size;
u2 = u1 + size;
s2 = (mp_limb_t **) u2 + size;
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < 1; i++)
{
FFT_radix2_truncate(ii, 1, ii, n, w, &t1, &t2, s1, trunc);
FFT_radix2_new(jj, 1, jj, n, w, &u1, &u2, s2);
for (j = 0; j < trunc; j++)
{
mpn_normmod_2expp1(jj[j], limbs);
mpn_normmod_2expp1(ii[j], limbs);
if (mpn_cmp(ii[j], jj[j], limbs + 1) != 0)
{
gmp_printf("Error in entry %ld, %Nx != %Nx\n", j, ii[j], limbs + 1, jj[j], limbs + 1);
abort();
}
}
}
TMP_FREE;
}
gmp_randclear(state);
}
void test_flint_fft()
{
ulong depth = 10L;
ulong iters = 1;
ulong w = 1;
ulong n = (1UL<<depth);
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < n) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
ZmodF_poly_t poly;
ZmodF_poly_init(poly, depth + 1, limbs, 2);
poly->length = 2*n;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
MPN_COPY(poly->coeffs[i], ii[i], limbs + 1);
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < 1; i++)
{
FFT_radix2_new(ii, 1, ii, n, w, &t1, &t2, s1);
ZmodF_poly_FFT(poly, 2*n);
for (j = 0; j < 2*n; j++)
{
mpn_normmod_2expp1(ii[j], limbs);
ZmodF_normalise(poly->coeffs[j], limbs);
if (mpn_cmp(ii[j], poly->coeffs[j], limbs + 1) != 0)
{
printf("Coeff %ld incorrect\n", j);
}
}
}
ZmodF_poly_clear(poly);
TMP_FREE;
gmp_randclear(state);
}
void time_flint_mfa()
{
#define NEWMFA 1
ulong depth = 12L;
ulong iters = 1000;
ulong w2 = 1;
ulong n = (1UL<<depth)/w2;
ulong w = w2*w2;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < n) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
ZmodF_poly_t poly;
ZmodF_poly_init(poly, depth + 1, limbs, 2);
poly->length = 2*n;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
MPN_COPY(poly->coeffs[i], ii[i], limbs + 1);
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < iters; i++)
{
#if NEWMFA
FFT_radix2_mfa(ii, n, w, &t1, &t2, s1, (1UL<<(depth/2))/w2);
#else
ZmodF_poly_FFT(poly, 2*n);
#endif
}
ZmodF_poly_clear(poly);
TMP_FREE;
gmp_randclear(state);
}
void test_mfa_fft()
{
ulong depth = 12L;
ulong iters = 1;
ulong w = 1;
ulong n = (1UL<<depth);
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s, n1, n2;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2, **trans;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < n) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
ZmodF_poly_t poly;
ZmodF_poly_init(poly, depth + 1, limbs, 2);
poly->length = 2*n;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
MPN_COPY(poly->coeffs[i], ii[i], limbs + 1);
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
trans = (mp_limb_t **) TMP_BALLOC_LIMBS(2*n);
n1 = (1UL<<(depth/2));
n2 = (2*n)/n1;
FFT_radix2_mfa(ii, n, w, &t1, &t2, s1, n1);
// transpose matrix
for (i = 0; i < n2; i++)
{
for (j = 0; j < n1; j++)
{
trans[j*n2 + i] = ii[i*n1 + j];
}
}
for (i = 0; i < 2*n; i++)
ii[i] = trans[i];
ZmodF_poly_FFT(poly, 2*n);
for (j = 0; j < 2*n; j++)
{
s = revbin(j, depth);
if (j < s)
{
ptr = ii[j];
ii[j] = ii[s];
ii[s] = ptr;
}
mpn_normmod_2expp1(ii[j], limbs);
ZmodF_normalise(poly->coeffs[j], limbs);
if (mpn_cmp(ii[j], poly->coeffs[j], limbs + 1) != 0)
{
printf("Coeff %ld incorrect\n", j);
}
}
ZmodF_poly_clear(poly);
TMP_FREE;
gmp_randclear(state);
}
void test_flint_ifft()
{
#define NEWFFT 1
#define IFFT 1
ulong depth = 10L;
ulong iters = 1;
ulong w = 1;
ulong n = (1UL<<depth);
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < n) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
ZmodF_poly_t poly;
ZmodF_poly_init(poly, depth + 1, limbs, 2);
poly->length = 2*n;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
MPN_COPY(poly->coeffs[i], ii[i], limbs + 1);
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < 1; i++)
{
IFFT_radix2_new(ii, 1, ii, n, w, &t1, &t2, s1);
ZmodF_poly_IFFT(poly);
for (j = 0; j < 2*n; j++)
{
mpn_normmod_2expp1(ii[j], limbs);
ZmodF_normalise(poly->coeffs[j], limbs);
if (mpn_cmp(ii[j], poly->coeffs[j], limbs) != 0)
{
gmp_printf("%Nx\n", ii[j], limbs + 1);
gmp_printf("%Nx\n", poly->coeffs[j], limbs + 1);
printf("Coeff %ld incorrect\n", j);
}
}
}
ZmodF_poly_clear(poly);
TMP_FREE;
gmp_randclear(state);
}
void time_flint_ifft()
{
#define NEWIFFT 0
ulong depth = 16L;
ulong iters = 1;
ulong w = 1;
ulong n = (1UL<<depth);
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < n) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
ZmodF_poly_t poly;
ZmodF_poly_init(poly, depth + 1, limbs, 2);
poly->length = 2*n;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
MPN_COPY(poly->coeffs[i], ii[i], limbs + 1);
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < iters; i++)
{
#if NEWIFFT
IFFT_radix2_new(ii, 1, ii, n, w, &t1, &t2, s1);
#else
ZmodF_poly_IFFT(poly);
#endif
}
ZmodF_poly_clear(poly);
TMP_FREE;
gmp_randclear(state);
}
void time_flint_negacyclic_fft()
{
#define NEGFFT 1
ulong depth = 10L;
ulong iters = 10000;
ulong w = 4;
ulong n = 512;//(1UL<<depth);
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < n) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
ZmodF_poly_t poly;
ZmodF_poly_init(poly, depth + 1, limbs, 2);
poly->length = 2*n;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
MPN_COPY(poly->coeffs[i], ii[i], limbs + 1);
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < iters; i++)
{
#if NEGFFT
FFT_radix2_negacyclic(ii, 1, ii, n, w, &t1, &t2, s1);
#else
ZmodF_poly_IFFT(poly);
#endif
}
ZmodF_poly_clear(poly);
TMP_FREE;
gmp_randclear(state);
}
void time_flint_imfa()
{
#define NEWIMFA 0
ulong depth = 16L;
ulong iters = 1;
ulong w2 = 1;
ulong w = w2*w2;
ulong n = (1UL<<depth)/w2;
ulong limbs = (n*w)/GMP_LIMB_BITS;
ulong size = limbs + 1;
ulong i, j, s;
mp_limb_t * ptr;
mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *u1, *u2, **s1, **s2;
mp_limb_t c;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
ii = (mp_limb_t **) TMP_BALLOC_LIMBS(2*(n + n*size) + 2*n + 2*size);
for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
{
ii[i] = ptr;
if (i < n) rand_n(ii[i], state, limbs);
else MPN_ZERO(ii[i], limbs + 1);
}
t1 = ptr;
t2 = t1 + size;
s1 = (mp_limb_t **) t2 + size;
ZmodF_poly_t poly;
ZmodF_poly_init(poly, depth + 1, limbs, 2);
poly->length = 2*n;
for (j = 0; j < 2*n; j++)
mpn_normmod_2expp1(ii[j], limbs);
for (i = 0; i < 2*n; i++)
MPN_COPY(poly->coeffs[i], ii[i], limbs + 1);
tt = (mp_limb_t *) TMP_BALLOC_LIMBS(2*size);
for (i = 0; i < iters; i++)
{
#if NEWIMFA
IFFT_radix2_mfa(ii, n, w, &t1, &t2, s1, (1UL<<(depth/2))/w2);
#else
ZmodF_poly_IFFT(poly);
#endif
}
ZmodF_poly_clear(poly);
TMP_FREE;
gmp_randclear(state);
}
void time_mul()
{
ulong depth = 13UL;
ulong w2 = 1UL;
ulong iters = 100;
ulong n = (1UL<<depth);
ulong w = w2*w2;
ulong bits1 = (n*w - depth)/2;
ulong bits = (8364032*8)/8; //n*bits1;
printf("bits = %ld\n", bits);
ulong int_limbs = (bits - 1UL)/GMP_LIMB_BITS + 1;
ulong i, j;
mp_limb_t *i1, *i2, *r1, *r2;
gmp_randstate_t state;
gmp_randinit_default(state);
TMP_DECL;
TMP_MARK;
i1 = TMP_BALLOC_LIMBS(6*int_limbs);
i2 = i1 + int_limbs;
r1 = i2 + int_limbs;
r2 = r1 + 2*int_limbs;
mpn_urandomb(i1, state, bits);
mpn_urandomb(i2, state, bits);
for (i = 0; i < iters; i++)
{
new_mpn_mul(r1, i1, int_limbs, i2, int_limbs, depth, w2);
}
TMP_FREE;
gmp_randclear(state);
}
int main(void)
{
test_norm(); printf("mpn_normmod_2expp1...PASS\n");
test_submod_i(); printf("mpn_submod_i_2expp1...PASS\n");
test_mul_2expmod(); printf("mpn_mul_2expmod_2expp1...PASS\n");
test_div_2expmod(); printf("mpn_div_2expmod_2expp1...PASS\n");
test_lshB_sumdiffmod(); printf("mpn_lshB_sumdiffmod_2expp1...PASS\n");
test_sumdiff_rshBmod(); printf("mpn_sumdiff_rshBmod_2expp1...PASS\n");
test_FFT_split_radix_butterfly(); printf("FFT_split_radix_butterfly...PASS\n");
test_flint_fft(); printf("FFT_radix2...PASS\n");
test_flint_ifft(); printf("IFFT_radix2...PASS\n");
test_fft_ifft_mfa_truncate(); printf("FFT_IFFT_MFA_TRUNCATE...PASS\n");
test_mfa_fft(); printf("FFT_MFA...PASS\n");
test_FFT_negacyclic_twiddle(); printf("FFT_negacyclic_twiddle...PASS\n");
test_fft_ifft(); printf("FFT_IFFT...PASS\n");
test_fft_ifft_mfa(); printf("FFT_IFFT_MFA...PASS\n");
test_fft_truncate(); printf("FFT_TRUNCATE...PASS\n");
test_fft_ifft_truncate(); printf("FFT_IFFT_TRUNCATE...PASS\n");
//time_flint_ifft();
//time_flint_mfa();
//time_flint_imfa();
//test_mulmod();
//test_mul();
//time_mul_with_negacyclic(); // negacyclic is currently *disabled*
//time_flint_negacyclic_fft();
time_mul();
return 0;
}