/* floating-point Newton, with inversion in 3M(n) */

/* mpn_invert

Copyright 2009 Paul Zimmermann
Copyright 2009 William Hart

This file is part of the MPIR Library.

The MPIR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The MPIR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the MPIR Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"
#include "fft/fft_tuning.h"

#define ZERO (mp_limb_t) 0
#define ONE  (mp_limb_t) 1
#define WRAP_AROUND_BOUND 1500

int
mpn_is_invert (mp_srcptr xp, mp_srcptr ap, mp_size_t n)
{
  int res = 1;
  mp_size_t i;
  mp_ptr tp, up;
  mp_limb_t cy;
  TMP_DECL;

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * n);
  up = TMP_ALLOC_LIMBS (2 * n);

  /* first check X*A < B^(2*n) */
  mpn_mul_n (tp, xp, ap, n);
  cy = mpn_add_n (tp + n, tp + n, ap, n); /* A * msb(X) */
  if (cy != 0)
    res = 0;

  /* now check B^(2n) - X*A <= A */
  mpn_not (tp, 2 * n);
  mpn_add_1 (tp, tp, 2 * n, 1); /* B^(2n) - X*A */
  MPN_ZERO (up, 2 * n);
  MPN_COPY (up, ap, n);
  res = mpn_cmp (tp, up, 2 * n) <= 0;
  TMP_FREE;
  return res;
}

/* Input: A = {ap, n} with most significant bit set.
   Output: X = B^n + {xp, n} where B = 2^GMP_NUMB_BITS.

   X is a lower approximation of B^(2n)/A with implicit msb.
   More precisely, one has:

              A*X < B^(2n) <= A*(X+1)

   or X = ceil(B^(2n)/A) - 1.
*/
void
mpn_invert (mp_ptr xp, mp_srcptr ap, mp_size_t n)
{
  if (n == 1)
    {
      /* invert_limb returns min(B-1, floor(B^2/ap[0])-B),
	 which is B-1 when ap[0]=B/2, and 1 when ap[0]=B-1.
	 For X=B+xp[0], we have A*X < B^2 <= A*(X+1) where
	 the equality holds only when A=B/2.

	 We thus have A*X < B^2 <= A*(X+1).
      */
      invert_limb (xp[0], ap[0]);
    }
  else if (n == 2)
    {
      mp_limb_t tp[4], up[2], sp[2], cy;

      tp[0] = ZERO;
      invert_limb (xp[1], ap[1]);
      tp[3] = mpn_mul_1 (tp + 1, ap, 2, xp[1]);
      cy = mpn_add_n (tp + 2, tp + 2, ap, 2);
      while (cy) /* Xh is too large */
	{
	  xp[1] --;
	  cy -= mpn_sub (tp + 1, tp + 1, 3, ap, 2);
	}
      /* tp[3] should be 111...111 */

      mpn_com_n (sp, tp + 1, 2);
      cy = mpn_add_1 (sp, sp, 2, ONE);
      /* cy should be 0 */

      up[1] = mpn_mul_1 (up, sp + 1, 1, xp[1]);
      cy = mpn_add_1 (up + 1, up + 1, 1, sp[1]);
      /* cy should be 0 */
      xp[0] = up[1];

      /* update tp */
      cy = mpn_addmul_1 (tp, ap, 2, xp[0]);
      cy = mpn_add_1 (tp + 2, tp + 2, 2, cy);
      do
	{
	  cy = mpn_add (tp, tp, 4, ap, 2);
	  if (cy == ZERO)
	    mpn_add_1 (xp, xp, 2, ONE);
	}
      while (cy == ZERO);

      /* now A*X < B^4 <= A*(X+1) */
    }
  else
    {
      mp_size_t l, h;
      mp_ptr tp, up;
      mp_limb_t cy, th;
      int special = 0;
      TMP_DECL;

      l = (n - 1) / 2;
      h = n - l;

      mpn_invert (xp + l, ap + l, h);

      TMP_MARK;
      tp = TMP_ALLOC_LIMBS (n + h);
      up = TMP_ALLOC_LIMBS (2 * h);

      if (n <= WRAP_AROUND_BOUND)
	{
          mpn_mul (tp, ap, n, xp + l, h);
          cy = mpn_add_n (tp + h, tp + h, ap, n);
        }
      else
	{
          mp_size_t m = n + 1;
          mpir_ui k;
          int cc;

          if (m >= FFT_MULMOD_2EXPP1_CUTOFF)
             m = mpir_fft_adjust_limbs (m);
          /* we have m >= n + 1 by construction, thus m > h */
          ASSERT(m < n + h);
          cy = mpn_mulmod_Bexpp1_fft (tp, m, ap, n, xp + l, h);
          /* cy, {tp, m} = A * {xp + l, h} mod (B^m+1) */
          cy += mpn_add_n (tp + h, tp + h, ap, m - h);
          cc = mpn_sub_n (tp, tp, ap + m - h, n + h - m);
          cc = mpn_sub_1 (tp + n + h - m, tp + n + h - m, 2 * m - n - h, cc);
          if (cc > cy) /* can only occur if cc=1 and cy=0 */
            cy = mpn_add_1 (tp, tp, m, ONE);
          else
            cy -= cc;
          /* cy, {tp, m} = A * Xh */

          /* add B^(n+h) + B^(n+h-m) */
          MPN_ZERO (tp + m, n + h - m);
          tp[m] = cy;
          /* note: since tp[n+h-1] is either 0, or cy<=1 if m=n+h-1,
             the mpn_incr_u() below cannot produce a carry */
          mpn_incr_u (tp + n + h - m, ONE);
          cy = 1;
          do /* check if T >= B^(n+h) + 2*B^n */
            {
              mp_size_t i;

              if (cy == ZERO)
                break; /* surely T < B^(n+h) */
              if (cy == ONE)
                {
                  for (i = n + h - 1; tp[i] == ZERO && i > n; i--);
                  if (i == n && tp[i] < (mp_limb_t) 2)
                    break;
                }
              /* subtract B^m+1 */
              cy -= mpn_sub_1 (tp, tp, n + h, ONE);
              cy -= mpn_sub_1 (tp + m, tp + m, n + h - m, ONE);
            }
          while (1);
        }

      while (cy)
	{
	  mpn_sub_1 (xp + l, xp + l, h, ONE);
	  cy -= mpn_sub (tp, tp, n + h, ap, n);
	}

      mpn_not (tp, n);
      th = ~tp[n] + mpn_add_1 (tp, tp, n, ONE);
      mpn_mul_n (up, tp + l, xp + l, h);
      cy = mpn_add_n (up + h, up + h, tp + l, h);
      if (th != ZERO)
      {
         cy += ONE + mpn_add_n (up + h, up + h, xp + l, h);
      }
      if (up[2*h-l-1] + 4 <= CNST_LIMB(3)) special = 1;
      MPN_COPY (xp, up + 2 * h - l, l);
      mpn_add_1 (xp + l, xp + l, h, cy);
      TMP_FREE;
      if ((special) && !mpn_is_invert(xp, ap, n))
         mpn_add_1 (xp, xp, n, 1);
    }
}

void mpn_invert_trunc(mp_ptr x_new, mp_size_t m, mp_srcptr xp, mp_size_t n, mp_srcptr ap)
{
  mp_ptr tp;
  mp_limb_t cy;
  TMP_DECL;

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * m);
  
  MPN_COPY(x_new, xp + n - m, m);
  ap += (n - m);

  mpn_mul_n (tp, x_new, ap, m);
  mpn_add_n (tp + m, tp + m, ap, m); /* A * msb(X) */
  
  /* now check B^(2n) - X*A <= A */
  mpn_not (tp, 2 * m);
  mpn_add_1 (tp, tp, 2 * m, 1); /* B^(2m) - X*A */
  
  while (tp[m] || mpn_cmp (tp, ap, m) > 0)
  {
     mpn_add_1(x_new, x_new, m, 1);
     tp[m] -= mpn_sub_n(tp, tp, ap, m);
  }
  TMP_FREE;
}