/* TC89: experimental implementation of Toom-Cook method for 8, 9 way.

   Utility functions written by Paul Zimmermann, October 2006.
   8,9-way added by Alberto Zanoni, inspired by Paul Zimmermann's
   mpz_tc4, March 2009

   Reference[1]: What About Toom-Cook Matrices Optimality? Marco Bodrato
   and Alberto Zanoni, October 19, 2006, http://bodrato.it/papers/#CIVV2006

   Reference[2]: Integer and polynomial multiplication: Towards
   optimal Toom-Cook matrices. Marco Bodrato and Alberto Zanoni,
   ISSAC 2007, http://bodrato.it/papers/

   Last release: March 25, 2009

The TC89 program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The TC89 program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA.

gcc -g -l gmp Toom-Cook8.c -o Toom-Cook8 && ./Toom-Cook8 <n.limbs> <n. repetitions>

 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/resource.h>
#include "gmp.h"
#include "gmp-impl.h"

#ifdef FINDTHRESHOLD
#define RECURSE(i)    mpz_mul
#define TC8_THRESHOLD   8 /* must be >= 8 */
#define TC9_THRESHOLD   9 /* must be >= 9 */
#else
#define RECURSE(i)    i
#define TC8_THRESHOLD 1200 /* must be >= 8 */
#define TC9_THRESHOLD 1200 /* must be >= 9 */
#define TC8_SQR_THRESHOLD 1200 /* must be >= 8 */
#define TC9_SQR_THRESHOLD 1200 /* must be >= 9 */
#endif

int
cputime ()
{
  struct rusage rus;

  getrusage (0, &rus);
  return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000;
}

#define OUT(x) mpz_out_str (stdout, 10, x); printf ("\n")

/* initializes a "fake" mpz that points to {yp+offset, n} */
void
my_mpz_init (mpz_t x, mpz_t y, mp_size_t offset, mp_size_t n)
{
  mp_size_t yn = ABSIZ(y);

  if (offset >= yn)
    {
      ALLOC(x) = 1;
      PTR(x) = PTR(y);
      SIZ(x) = 0;
    }
  else
    {
      ALLOC(x) = (yn >= offset + n) ? n : yn - offset;
      PTR(x) = PTR(y) + offset;
      SIZ(x) = SIZ(y) > 0 ? ALLOC(x) : -ALLOC(x);
    }
}

/* adds x at {yp+offset}, assuming enough allocated space in y  */
void
mpz_copy (mpz_t y, mpz_t x, mp_size_t offset)
{
  mp_size_t yn = ABSIZ(y);
  mp_size_t xn = ABSIZ(x);
  mp_ptr xp = PTR(x);
  mp_ptr yp = PTR(y);
  mp_limb_t cy = 0;

  if (xn == 0)
    return;

  if (offset < yn) /* low part of x overlaps with y */
    {
      if (offset + xn <= yn) /* x entirely inside y */
        {
          cy = mpn_add_n (yp + offset, yp + offset, xp, xn);
          if (offset + xn < yn)
            cy = mpn_add_1 (yp + offset + xn, yp + offset + xn,
                            yn - (offset + xn), cy);
        }
      else
        cy = mpn_add_n (yp + offset, yp + offset, xp, yn - offset);
      /* now cy is the carry at yp + yn */
      if (xn + offset > yn) /* high part of x exceeds y */
        {
          MPN_COPY (yp + yn, xp + yn - offset, xn + offset - yn);
          cy = mpn_add_1 (yp + yn, yp + yn, xn + offset - yn, cy);
          yn = xn + offset;
        }
      /* now cy is the carry at yp + yn */
      if (cy)
        yp[yn++] = cy;
      MPN_NORMALIZE(yp, yn);
      SIZ(y) = yn;
    }
  else /* x does not overlap */
    {
      if (offset > yn)
        MPN_ZERO (yp + yn, offset - yn);
      MPN_COPY (yp + offset, xp, xn);
      SIZ(y) = offset + xn;
    }
}


/* Implementation of Toom-Cook algorithm 8-way.

  Written by Alberto Zanoni, March 2009.
 
  The (15x15) matrix to be inverted is given by the following gp code

  print1("[ 1 ");              \
  for(i=1,14,print1("0 "));    \
  forstep(i=6,0,-1,            \
   print1("]\n[ ");            \
   if (i<6,                    \
   forstep(j=16,0,-1,          \
    if (i*j == 0,              \
     print1((-1)^(j)," "),     \
       if(j == 1,              \
       print1("-",2^i," "),    \
       if ( j%2,               \
        print1("-"),           \
        print1(""));           \
       print1("2^(",i*j,") " );\
    )));                       \
   print1("]\n[ ");            \
   );                          \
   forstep(j=14,0,-1,          \
    if (i*j == 0,              \
     print1("1 "),             \
       if(j == 1,              \
       print1(" ",2^i," "),    \
       if ( j%2,               \
        print1(" "));          \
       print1("2^(",i*j,") " );\
    )));                       \
  );                           \
  print1("]\n[");              \
  for(i=1,14,print1(" 0"));    \
  print1(" 1 ]");

[1       0      0       0      0       0      0       0      0       0      0       0      0       0  0]
[2^(84)  2^(78) 2^(72)  2^(66) 2^(60)  2^(54) 2^(48)  2^(42) 2^(36)  2^(30) 2^(24)  2^(18) 2^(12)  64 1]
[2^(70) -2^(65) 2^(60) -2^(55) 2^(50) -2^(45) 2^(40) -2^(35) 2^(30) -2^(25) 2^(20) -2^(15) 2^(10) -32 1]
[2^(70)  2^(65) 2^(60)  2^(55) 2^(50)  2^(45) 2^(40)  2^(35) 2^(30)  2^(25) 2^(20)  2^(15) 2^(10)  32 1]
[2^(56) -2^(52) 2^(48) -2^(44) 2^(40) -2^(36) 2^(32) -2^(28) 2^(24) -2^(20) 2^(16) -2^(12) 2^(8)  -16 1]
[2^(56)  2^(52) 2^(48)  2^(44) 2^(40)  2^(36) 2^(32)  2^(28) 2^(24)  2^(20) 2^(16)  2^(12) 2^(8)   16 1]
[2^(42) -2^(39) 2^(36) -2^(33) 2^(30) -2^(27) 2^(24) -2^(21) 2^(18) -2^(15) 2^(12) -2^(9)  2^(6)   -8 1]
[2^(42)  2^(39) 2^(36)  2^(33) 2^(30)  2^(27) 2^(24)  2^(21) 2^(18)  2^(15) 2^(12)  2^(9)  2^(6)    8 1]
[2^(28) -2^(26) 2^(24) -2^(22) 2^(20) -2^(18) 2^(16) -2^(14) 2^(12) -2^(10) 2^(8)  -2^(6)  2^(4)   -4 1]
[2^(28)  2^(26) 2^(24)  2^(22) 2^(20)  2^(18) 2^(16)  2^(14) 2^(12)  2^(10) 2^(8)   2^(6)  2^(4)    4 1]
[2^(14) -2^(13) 2^(12) -2^(11) 2^(10) -2^(9)  2^(8)  -2^(7)  2^(6)  -2^(5)  2^(4)  -2^(3)  2^(2)   -2 1]
[2^(14)  2^(13) 2^(12)  2^(11) 2^(10)  2^(9)  2^(8)   2^(7)  2^(6)   2^(5)  2^(4)   2^(3)  2^(2)    2 1]
[1      -1      1      -1      1      -1      1      -1      1      -1      1      -1      1       -1 1]
[1       1      1       1      1       1      1       1      1       1      1       1      1        1 1]
[0       0      0       0      0       0      0       0      0       0      0       0      0        0 1]
*/

void
mpz_tc8 (mpz_t c, mpz_t a, mpz_t b)
{
  const unsigned int OFFSET = 1;
  mp_size_t  n = mpz_size(a), n8;
  mpz_t r[15], a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
  int sign,i=0,j=0,jp=0;

  if (mpz_size (b) > n)
    {
      mpz_tc8 (c, b, a);
      return;
    }

  if (n < TC8_THRESHOLD)
    {
      mpz_mul (c, a, b);
      return;
    }
  else
    n8 = (n - 1) / 8 + 1;

  sign = mpz_sgn (a) * mpz_sgn (b);
  for( i = 0; i < 15; i++ )
    mpz_init (r[i]);

  /***************************** Decomposition *******************************/
  my_mpz_init (a0, a,      0, n8);        my_mpz_init (b0, b,      0, n8);             
  my_mpz_init (a1, a,     n8, n8);        my_mpz_init (b1, b,     n8, n8); 
  my_mpz_init (a2, a, 2 * n8, n8);        my_mpz_init (b2, b, 2 * n8, n8); 
  my_mpz_init (a3, a, 3 * n8, n8);        my_mpz_init (b3, b, 3 * n8, n8); 
  my_mpz_init (a4, a, 4 * n8, n8);	  my_mpz_init (b4, b, 4 * n8, n8);
  my_mpz_init (a5, a, 5 * n8, n8);	  my_mpz_init (b5, b, 5 * n8, n8);
  my_mpz_init (a6, a, 6 * n8, n8);	  my_mpz_init (b6, b, 6 * n8, n8);
  my_mpz_init (a7, a, 7 * n8, n8);	  my_mpz_init (b7, b, 7 * n8, n8);

  /********************** Evaluation and recursive calls *********************/

  for(i=1; i <= 5; i++)  // Evaluation in powers of 2 and -2.
    {
      j++; jp += 2;
      mpz_set (r[15 - OFFSET], a1);   mpz_set (r[1 - OFFSET], a0); // a factor.

      mpz_addmul_ui (r[15 - OFFSET], a3, 1<<jp);
      mpz_addmul_ui (r[ 1 - OFFSET], a2, 1<<jp);

      mpz_addmul_ui (r[15 - OFFSET], a5, 1<<(2*jp));
      mpz_addmul_ui (r[ 1 - OFFSET], a4, 1<<(2*jp));
      
      mpz_addmul_ui (r[15 - OFFSET], a7, 1<<(3*jp));
      mpz_addmul_ui (r[ 1 - OFFSET], a6, 1<<(3*jp));

      mpz_mul_2exp (r[15 - OFFSET], r[15 - OFFSET], i);

      mpz_sub(r[2 - OFFSET], r[1-OFFSET], r[15 - OFFSET]);
      mpz_add(r[1 - OFFSET], r[1-OFFSET], r[15 - OFFSET]);
 			     
      ////////////////////////////////////////////////////////////////////////

      mpz_set (r[15 - OFFSET], b1);   mpz_set (r[14 - OFFSET], b0); // b factor.

      mpz_addmul_ui (r[15 - OFFSET], b3, 1<<jp);
      mpz_addmul_ui (r[14 - OFFSET], b2, 1<<jp);

      mpz_addmul_ui (r[15 - OFFSET], b5, 1<<(2*jp));
      mpz_addmul_ui (r[14 - OFFSET], b4, 1<<(2*jp));
      
      mpz_addmul_ui (r[15 - OFFSET], b7, 1<<(3*jp));
      mpz_addmul_ui (r[14 - OFFSET], b6, 1<<(3*jp));

      mpz_mul_2exp (r[15 - OFFSET], r[15 - OFFSET], i);

      mpz_add(r[3 - OFFSET], r[14-OFFSET], r[15 - OFFSET]);

      RECURSE(mpz_tc8) ( r[15-2-j - OFFSET], r[1 - OFFSET], r[ 3 - OFFSET] ); // Evaluation in 2^i

      mpz_sub(r[15 - OFFSET], r[14 - OFFSET], r[15 - OFFSET]);
      j++;
      RECURSE(mpz_tc8) ( r[15-2-j - OFFSET], r[2 - OFFSET], r[15 - OFFSET] ); // Evaluation in -2^i
  }

  // In 1 and -1.

  mpz_add(r[14 - OFFSET], a0, a2);              mpz_add(r[15 - OFFSET], a1, a3);
  mpz_add(r[14 - OFFSET], r[14 - OFFSET], a4);  mpz_add(r[15 - OFFSET], r[15 - OFFSET], a5);
  mpz_add(r[14 - OFFSET], r[14 - OFFSET], a6);  mpz_add(r[15 - OFFSET], r[15 - OFFSET], a7);

  mpz_add(r[ 1 - OFFSET], r[14 - OFFSET], r[15 - OFFSET]);
  mpz_sub(r[14 - OFFSET], r[14 - OFFSET], r[15 - OFFSET]);

  /////////////////////////////////////////////////////////

  mpz_add(r[15 - OFFSET], b0, b2);              mpz_add(r[13 - OFFSET], b1, b3);
  mpz_add(r[15 - OFFSET], r[15 - OFFSET], b4);  mpz_add(r[13 - OFFSET], r[13 - OFFSET], b5);
  mpz_add(r[15 - OFFSET], r[15 - OFFSET], b6);  mpz_add(r[13 - OFFSET], r[13 - OFFSET], b7);

  mpz_add(r[ 2 - OFFSET], r[15 - OFFSET], r[13 - OFFSET]);
  mpz_sub(r[15 - OFFSET], r[15 - OFFSET], r[13 - OFFSET]);
  
  RECURSE(mpz_tc8) ( r[13 - OFFSET], r[14 - OFFSET], r[15 - OFFSET] ); // Evaluation in -1.
  RECURSE(mpz_tc8) ( r[14 - OFFSET], r[ 1 - OFFSET], r[ 2 - OFFSET] ); // Evaluation in  1.

  // In 2^6 = 64.

  mpz_set(r[15 - OFFSET], a6);
  mpz_addmul_ui(r[15 - OFFSET], a7, 1<<6);
  mpz_mul_2exp(r[15 - OFFSET], r[15 - OFFSET], 6*6);

  mpz_set(r[2 - OFFSET], b6);
  mpz_addmul_ui(r[2 - OFFSET], b7, 1<<6);
  mpz_mul_2exp(r[2 - OFFSET], r[2 - OFFSET], 6*6);

  mpz_add(r[1 - OFFSET], r[15 - OFFSET], a0);  mpz_add(r[15 - OFFSET], r[2 - OFFSET], b0);
  mpz_addmul_ui(r[1 - OFFSET], a1, 1<<6);      mpz_addmul_ui(r[15 - OFFSET], b1, 1<<6);
  mpz_addmul_ui(r[1 - OFFSET], a2, 1<<(2*6));  mpz_addmul_ui(r[15 - OFFSET], b2, 1<<(2*6));
  mpz_addmul_ui(r[1 - OFFSET], a3, 1<<(3*6));  mpz_addmul_ui(r[15 - OFFSET], b3, 1<<(3*6));
  mpz_addmul_ui(r[1 - OFFSET], a4, 1<<(4*6));  mpz_addmul_ui(r[15 - OFFSET], b4, 1<<(4*6));
  mpz_addmul_ui(r[1 - OFFSET], a5, 1<<(5*6));  mpz_addmul_ui(r[15 - OFFSET], b5, 1<<(5*6));

  RECURSE(mpz_tc8) (r[2 - OFFSET], r[1 - OFFSET], r[15 - OFFSET]); // Evaluation in 2^6 = 64.

  // In (1/0).
  RECURSE(mpz_tc8) (r[1  - OFFSET], a7, b7); // Evaluation in 1/0 

  // Reordering defines
#define w1    r[0]
#define w2    r[1]
#define w3    r[2]
#define w4    r[3]
#define w5    r[4]
#define w6    r[5]
#define w7    r[6]
#define w8    r[7]
#define w9    r[8]
#define w10   r[9]
#define w11   r[10]
#define w12   r[11]
#define w13   r[12]
#define w14   r[13]
#define w15   r[14]

  /******************************* Interpolation *****************************/

 // Decoupling.

  mpz_sub(w4,  w4,  w3);  mpz_sub(w2,  w2,  w4);  mpz_div_2exp(w4,  w4,  1); mpz_add(w3,  w3,  w4);
  mpz_sub(w6,  w6,  w5);  mpz_sub(w4,  w4,  w6);  mpz_div_2exp(w6,  w6,  1); mpz_add(w5,  w5,  w6);
  mpz_sub(w8,  w8,  w7);  mpz_sub(w6,  w6,  w8);  mpz_div_2exp(w8,  w8,  1); mpz_add(w7,  w7,  w8);
  mpz_sub(w10, w10, w9);  mpz_sub(w8,  w8,  w10); mpz_div_2exp(w10, w10, 1); mpz_add(w9,  w9,  w10);
  mpz_sub(w12, w12, w11); mpz_sub(w10, w10, w12); mpz_div_2exp(w12, w12, 1); mpz_add(w11, w11, w12);
  mpz_sub(w14, w14, w13); mpz_sub(w12, w12, w14); mpz_div_2exp(w14, w14, 1); mpz_add(w13, w13, w14);

  // Remove left entries.

  mpz_sub(w13, w13, w1); mpz_submul_ui(w11, w1, 1<<14); mpz_submul_ui(w9, w1, 1<<28);

  mpz_mul_2exp(w15, w1, 42);
  mpz_sub(w7, w7, w15);
  mpz_mul_2exp(w15, w15, 14);    // w1<<56
  mpz_sub(w5, w5, w15);
  mpz_submul_ui(w3, w15, 1<<14); // w1<<70
  mpz_submul_ui(w2, w15, 1<<28); // w1<<84

  // Remove right entries.

  RECURSE(mpz_tc8) (r[15 - OFFSET], a0, b0); // Evaluation in 0.
  mpz_sub(w13, w13, w15); mpz_sub(w9, w9, w15);  mpz_sub(w5, w5, w15);  mpz_sub(w2, w2, w15);
  mpz_sub(w11, w11, w15); mpz_sub(w7, w7, w15);  mpz_sub(w3, w3, w15);

  mpz_submul_ui(w2, w3, 1<<2); mpz_submul_ui(w2, w4, 1<<3); // 2^64 line.

  // Left triangulation.

  mpz_submul_ui(w3,  w5,  1<<2);
  mpz_submul_ui(w5,  w7,  1<<2);
  mpz_submul_ui(w7,  w9,  1<<2);
  mpz_submul_ui(w9,  w11, 1<<2);
  mpz_submul_ui(w11, w13, 1<<2);   mpz_submul_ui(w2, w3, 1<<4);
  mpz_submul_ui(w3,  w5,  1<<4);
  mpz_submul_ui(w5,  w7,  1<<4);
  mpz_submul_ui(w7,  w9,  1<<4);
  mpz_submul_ui(w9,  w11, 1<<4);   mpz_submul_ui(w2, w3, 1<<6);
  mpz_submul_ui(w3,  w5,  1<<6);
  mpz_submul_ui(w5,  w7,  1<<6);
  mpz_submul_ui(w7,  w9,  1<<6);   mpz_submul_ui(w2, w3, 1<<8);
  mpz_submul_ui(w3,  w5,  1<<8);
  mpz_submul_ui(w5,  w7,  1<<8);   mpz_submul_ui(w2, w3, 1<<10);
  mpz_submul_ui(w3,  w5,  1<<10);  mpz_submul_ui(w2, w3, 1<<12);

  mpz_submul_ui(w4,  w6,  1<<3);
  mpz_submul_ui(w6,  w8,  1<<3);
  mpz_submul_ui(w8,  w10, 1<<3);
  mpz_submul_ui(w10, w12, 1<<3);   mpz_submul_ui(w2, w4, 1<<5);
  mpz_submul_ui(w4,  w6,  1<<5);
  mpz_submul_ui(w6,  w8,  1<<5);
  mpz_submul_ui(w8,  w10, 1<<5);   mpz_submul_ui(w2, w4, 1<<7);
  mpz_submul_ui(w4,  w6,  1<<7);
  mpz_submul_ui(w6,  w8,  1<<7);   mpz_submul_ui(w2, w4, 1<<9);
  mpz_submul_ui(w4,  w6,  1<<9);   mpz_submul_ui(w2, w4, 1<<11);

  // Divisions.

  mpz_divexact_ui(w12, w12, 6);
  mpz_divexact_ui(w11, w11, 12);
  mpz_divexact_ui(w10, w10, 720);
  mpz_divexact_ui(w9, w9, 2880);                                  // = 2^6*3*5
  mpz_divexact_ui(w8, w8, 1451520);                               // = 2^9*3^4*5*7
  mpz_divexact_ui(w7, w7, 11612160);                              // = 2^12*3^4*5*7

  mpz_div_2exp(w6, w6, 16);   mpz_divexact_ui(w6, w6, 722925);    // = 3^5*5^2*7*17
  mpz_div_2exp(w5, w5, 20);   mpz_divexact_ui(w5, w5, 722925);
  mpz_div_2exp(w4, w4, 25);   mpz_divexact_ui(w4, w4, 739552275); // = 3^6*5^2*7*11*17*31
  mpz_div_2exp(w3, w3, 30);   mpz_divexact_ui(w3, w3, 739552275);
  mpz_div_2exp(w2, w2, 36);   mpz_divexact_ui(w2, w2, 314645877); // = 3^8*7*13*17*31
                              mpz_divexact_ui(w2, w2, 9625);      // = 5^3*7*11
  // Backward substitution.			      

  mpz_sub(w11, w11, w5);       mpz_sub(w9, w9, w7);               // First submatrix.
  mpz_submul_ui(w5, w3, 341);  mpz_submul_ui(w7, w3, 5797);
  mpz_sub(w7, w7, w5);         mpz_sub(w11, w11, w7);
  mpz_submul_ui(w7, w5, 84);   mpz_submul_ui(w9, w5, 272);
  mpz_sub(w11, w11, w9);
  mpz_submul_ui(w9, w7, 20);   mpz_submul_ui(w11, w9, 1<<2);
  
  mpz_submul_ui(w8, w2, 376805);                                  // Second submatrix.
  mpz_sub(w12, w12, w4);       mpz_submul_ui(w4, w2, 1365);
  mpz_sub(w10, w10, w8);       mpz_submul_ui(w6, w4, 341);
  mpz_sub(w10, w10, w6);       mpz_submul_ui(w6, w2, 93093);
  mpz_submul_ui(w8, w4, 5797); mpz_sub(w12, w12, w8);
  mpz_submul_ui(w8, w6, 85);   mpz_submul_ui(w10, w6, 271);
  mpz_submul_ui(w12, w4, 340); mpz_sub(w12, w12, w10);
  mpz_submul_ui(w10, w8, 20);  mpz_submul_ui(w12, w10, 1<<2);

  mpz_sub(w13, w13, w3);       mpz_sub(w14, w14, w2);
  mpz_sub(w13, w13, w5);       mpz_sub(w14, w14, w4); 
  mpz_sub(w13, w13, w7);       mpz_sub(w14, w14, w6); 
  mpz_sub(w13, w13, w9);       mpz_sub(w14, w14, w8); 
  mpz_sub(w13, w13, w11);      mpz_sub(w14, w14, w10);    mpz_sub(w14, w14, w12);

  // Result reconstruction.

  _mpz_realloc (c, 2 * n);
  mpz_set  (c, w15);
  mpz_copy (c, w14,     n8);
  mpz_copy (c, w13, 2 * n8);
  mpz_copy (c, w12, 3 * n8);
  mpz_copy (c, w11, 4 * n8);
  mpz_copy (c, w10, 5 * n8);
  mpz_copy (c, w9,  6 * n8);
  mpz_copy (c, w8,  7 * n8);
  mpz_copy (c, w7,  8 * n8);
  mpz_copy (c, w6,  9 * n8);
  mpz_copy (c, w5, 10 * n8);
  mpz_copy (c, w4, 11 * n8);
  mpz_copy (c, w3, 12 * n8);
  mpz_copy (c, w2, 13 * n8);
  mpz_copy (c, w1, 14 * n8);

  /** Remove reordering defines **/
#undef w15
#undef w14
#undef w13
#undef w12
#undef w11
#undef w10
#undef w9
#undef w8
#undef w7
#undef w6
#undef w5
#undef w4
#undef w3
#undef w2
#undef w1

  if (sign < 0)
    mpz_neg (c, c);

  for ( i = 0; i < 15; i++ )
    mpz_clear(r[i]);
}

/* Toom-8 for squaring */

void
mpz_tc8_sqr (mpz_t c, mpz_t a)
{
  const unsigned int OFFSET = 1;
  mp_size_t  n = mpz_size(a), n8;
  mpz_t r[15], a0, a1, a2, a3, a4, a5, a6, a7;
  int sign,i=0,j=0,jp=0;

  if (n < TC8_SQR_THRESHOLD)
    {
      mpz_mul (c, a, a);
      return;
    }
  else
    n8 = (n - 1) / 8 + 1;

  sign = mpz_sgn (a) * mpz_sgn (a);
  for( i = 0; i < 15; i++ )
    mpz_init (r[i]);

  /***************************** Decomposition *******************************/
  my_mpz_init (a0, a,      0, n8);             
  my_mpz_init (a1, a,     n8, n8); 
  my_mpz_init (a2, a, 2 * n8, n8); 
  my_mpz_init (a3, a, 3 * n8, n8); 
  my_mpz_init (a4, a, 4 * n8, n8);
  my_mpz_init (a5, a, 5 * n8, n8);
  my_mpz_init (a6, a, 6 * n8, n8);
  my_mpz_init (a7, a, 7 * n8, n8);

  /********************** Evaluation and recursive calls *********************/

  for(i=1; i <= 5; i++)  // Evaluation in powers of 2 and -2.
    {
      j++; jp += 2;
      mpz_set (r[15 - OFFSET], a1);   mpz_set (r[1 - OFFSET], a0); // a factor.

      mpz_addmul_ui (r[15 - OFFSET], a3, 1<<jp);
      mpz_addmul_ui (r[ 1 - OFFSET], a2, 1<<jp);

      mpz_addmul_ui (r[15 - OFFSET], a5, 1<<(2*jp));
      mpz_addmul_ui (r[ 1 - OFFSET], a4, 1<<(2*jp));
      
      mpz_addmul_ui (r[15 - OFFSET], a7, 1<<(3*jp));
      mpz_addmul_ui (r[ 1 - OFFSET], a6, 1<<(3*jp));

      mpz_mul_2exp (r[15 - OFFSET], r[15 - OFFSET], i);

      mpz_sub(r[2 - OFFSET], r[1-OFFSET], r[15 - OFFSET]);
      mpz_add(r[1 - OFFSET], r[1-OFFSET], r[15 - OFFSET]);
 			     
      ////////////////////////////////////////////////////////////////////////

      RECURSE(mpz_tc8_sqr) ( r[15-2-j - OFFSET], r[1 - OFFSET] ); // Evaluation in 2^i
      j++;
      RECURSE(mpz_tc8_sqr) ( r[15-2-j - OFFSET], r[2 - OFFSET] ); // Evaluation in -2^i
  }

  // In 1 and -1.

  mpz_add(r[14 - OFFSET], a0, a2);              mpz_add(r[15 - OFFSET], a1, a3);
  mpz_add(r[14 - OFFSET], r[14 - OFFSET], a4);  mpz_add(r[15 - OFFSET], r[15 - OFFSET], a5);
  mpz_add(r[14 - OFFSET], r[14 - OFFSET], a6);  mpz_add(r[15 - OFFSET], r[15 - OFFSET], a7);

  mpz_add(r[ 1 - OFFSET], r[14 - OFFSET], r[15 - OFFSET]);
  mpz_sub(r[14 - OFFSET], r[14 - OFFSET], r[15 - OFFSET]);

  /////////////////////////////////////////////////////////

  RECURSE(mpz_tc8_sqr) ( r[13 - OFFSET], r[14 - OFFSET] ); // Evaluation in -1.
  RECURSE(mpz_tc8_sqr) ( r[14 - OFFSET], r[ 1 - OFFSET] ); // Evaluation in  1.

  // In 2^6 = 64.

  mpz_set(r[15 - OFFSET], a6);
  mpz_addmul_ui(r[15 - OFFSET], a7, 1<<6);
  mpz_mul_2exp(r[15 - OFFSET], r[15 - OFFSET], 6*6);

  mpz_add(r[1 - OFFSET], r[15 - OFFSET], a0);
  mpz_addmul_ui(r[1 - OFFSET], a1, 1<<6);
  mpz_addmul_ui(r[1 - OFFSET], a2, 1<<(2*6));
  mpz_addmul_ui(r[1 - OFFSET], a3, 1<<(3*6));
  mpz_addmul_ui(r[1 - OFFSET], a4, 1<<(4*6));
  mpz_addmul_ui(r[1 - OFFSET], a5, 1<<(5*6));

  RECURSE(mpz_tc8_sqr) (r[2 - OFFSET], r[1 - OFFSET]); // Evaluation in 2^6 = 64.

  // In (1/0).
  RECURSE(mpz_tc8_sqr) (r[1  - OFFSET], a7); // Evaluation in 1/0 

  // Reordering defines
#define w1    r[0]
#define w2    r[1]
#define w3    r[2]
#define w4    r[3]
#define w5    r[4]
#define w6    r[5]
#define w7    r[6]
#define w8    r[7]
#define w9    r[8]
#define w10   r[9]
#define w11   r[10]
#define w12   r[11]
#define w13   r[12]
#define w14   r[13]
#define w15   r[14]

  /******************************* Interpolation *****************************/

 // Decoupling.

  mpz_sub(w4,  w4,  w3);  mpz_sub(w2,  w2,  w4);  mpz_div_2exp(w4,  w4,  1); mpz_add(w3,  w3,  w4);
  mpz_sub(w6,  w6,  w5);  mpz_sub(w4,  w4,  w6);  mpz_div_2exp(w6,  w6,  1); mpz_add(w5,  w5,  w6);
  mpz_sub(w8,  w8,  w7);  mpz_sub(w6,  w6,  w8);  mpz_div_2exp(w8,  w8,  1); mpz_add(w7,  w7,  w8);
  mpz_sub(w10, w10, w9);  mpz_sub(w8,  w8,  w10); mpz_div_2exp(w10, w10, 1); mpz_add(w9,  w9,  w10);
  mpz_sub(w12, w12, w11); mpz_sub(w10, w10, w12); mpz_div_2exp(w12, w12, 1); mpz_add(w11, w11, w12);
  mpz_sub(w14, w14, w13); mpz_sub(w12, w12, w14); mpz_div_2exp(w14, w14, 1); mpz_add(w13, w13, w14);

  // Remove left entries.

  mpz_sub(w13, w13, w1); mpz_submul_ui(w11, w1, 1<<14); mpz_submul_ui(w9, w1, 1<<28);

  mpz_mul_2exp(w15, w1, 42);
  mpz_sub(w7, w7, w15);
  mpz_mul_2exp(w15, w15, 14);    // w1<<56
  mpz_sub(w5, w5, w15);
  mpz_submul_ui(w3, w15, 1<<14); // w1<<70
  mpz_submul_ui(w2, w15, 1<<28); // w1<<84

  // Remove right entries.

  RECURSE(mpz_tc8_sqr) (r[15 - OFFSET], a0); // Evaluation in 0.
  mpz_sub(w13, w13, w15); mpz_sub(w9, w9, w15);  mpz_sub(w5, w5, w15);  mpz_sub(w2, w2, w15);
  mpz_sub(w11, w11, w15); mpz_sub(w7, w7, w15);  mpz_sub(w3, w3, w15);

  mpz_submul_ui(w2, w3, 1<<2); mpz_submul_ui(w2, w4, 1<<3); // 2^64 line.

  // Left triangulation.

  mpz_submul_ui(w3,  w5,  1<<2);
  mpz_submul_ui(w5,  w7,  1<<2);
  mpz_submul_ui(w7,  w9,  1<<2);
  mpz_submul_ui(w9,  w11, 1<<2);
  mpz_submul_ui(w11, w13, 1<<2);   mpz_submul_ui(w2, w3, 1<<4);
  mpz_submul_ui(w3,  w5,  1<<4);
  mpz_submul_ui(w5,  w7,  1<<4);
  mpz_submul_ui(w7,  w9,  1<<4);
  mpz_submul_ui(w9,  w11, 1<<4);   mpz_submul_ui(w2, w3, 1<<6);
  mpz_submul_ui(w3,  w5,  1<<6);
  mpz_submul_ui(w5,  w7,  1<<6);
  mpz_submul_ui(w7,  w9,  1<<6);   mpz_submul_ui(w2, w3, 1<<8);
  mpz_submul_ui(w3,  w5,  1<<8);
  mpz_submul_ui(w5,  w7,  1<<8);   mpz_submul_ui(w2, w3, 1<<10);
  mpz_submul_ui(w3,  w5,  1<<10);  mpz_submul_ui(w2, w3, 1<<12);

  mpz_submul_ui(w4,  w6,  1<<3);
  mpz_submul_ui(w6,  w8,  1<<3);
  mpz_submul_ui(w8,  w10, 1<<3);
  mpz_submul_ui(w10, w12, 1<<3);   mpz_submul_ui(w2, w4, 1<<5);
  mpz_submul_ui(w4,  w6,  1<<5);
  mpz_submul_ui(w6,  w8,  1<<5);
  mpz_submul_ui(w8,  w10, 1<<5);   mpz_submul_ui(w2, w4, 1<<7);
  mpz_submul_ui(w4,  w6,  1<<7);
  mpz_submul_ui(w6,  w8,  1<<7);   mpz_submul_ui(w2, w4, 1<<9);
  mpz_submul_ui(w4,  w6,  1<<9);   mpz_submul_ui(w2, w4, 1<<11);

  // Divisions.

  mpz_divexact_ui(w12, w12, 6);
  mpz_divexact_ui(w11, w11, 12);
  mpz_divexact_ui(w10, w10, 720);
  mpz_divexact_ui(w9, w9, 2880);                                  // = 2^6*3*5
  mpz_divexact_ui(w8, w8, 1451520);                               // = 2^9*3^4*5*7
  mpz_divexact_ui(w7, w7, 11612160);                              // = 2^12*3^4*5*7

  mpz_div_2exp(w6, w6, 16);   mpz_divexact_ui(w6, w6, 722925);    // = 3^5*5^2*7*17
  mpz_div_2exp(w5, w5, 20);   mpz_divexact_ui(w5, w5, 722925);
  mpz_div_2exp(w4, w4, 25);   mpz_divexact_ui(w4, w4, 739552275); // = 3^6*5^2*7*11*17*31
  mpz_div_2exp(w3, w3, 30);   mpz_divexact_ui(w3, w3, 739552275);
  mpz_div_2exp(w2, w2, 36);   mpz_divexact_ui(w2, w2, 314645877); // = 3^8*7*13*17*31
                              mpz_divexact_ui(w2, w2, 9625);      // = 5^3*7*11
  // Backward substitution.			      

  mpz_sub(w11, w11, w5);       mpz_sub(w9, w9, w7);               // First submatrix.
  mpz_submul_ui(w5, w3, 341);  mpz_submul_ui(w7, w3, 5797);
  mpz_sub(w7, w7, w5);         mpz_sub(w11, w11, w7);
  mpz_submul_ui(w7, w5, 84);   mpz_submul_ui(w9, w5, 272);
  mpz_sub(w11, w11, w9);
  mpz_submul_ui(w9, w7, 20);   mpz_submul_ui(w11, w9, 1<<2);
  
  mpz_submul_ui(w8, w2, 376805);                                  // Second submatrix.
  mpz_sub(w12, w12, w4);       mpz_submul_ui(w4, w2, 1365);
  mpz_sub(w10, w10, w8);       mpz_submul_ui(w6, w4, 341);
  mpz_sub(w10, w10, w6);       mpz_submul_ui(w6, w2, 93093);
  mpz_submul_ui(w8, w4, 5797); mpz_sub(w12, w12, w8);
  mpz_submul_ui(w8, w6, 85);   mpz_submul_ui(w10, w6, 271);
  mpz_submul_ui(w12, w4, 340); mpz_sub(w12, w12, w10);
  mpz_submul_ui(w10, w8, 20);  mpz_submul_ui(w12, w10, 1<<2);

  mpz_sub(w13, w13, w3);       mpz_sub(w14, w14, w2);
  mpz_sub(w13, w13, w5);       mpz_sub(w14, w14, w4); 
  mpz_sub(w13, w13, w7);       mpz_sub(w14, w14, w6); 
  mpz_sub(w13, w13, w9);       mpz_sub(w14, w14, w8); 
  mpz_sub(w13, w13, w11);      mpz_sub(w14, w14, w10);    mpz_sub(w14, w14, w12);

  // Result reconstruction.

  _mpz_realloc (c, 2 * n);
  mpz_set  (c, w15);
  mpz_copy (c, w14,     n8);
  mpz_copy (c, w13, 2 * n8);
  mpz_copy (c, w12, 3 * n8);
  mpz_copy (c, w11, 4 * n8);
  mpz_copy (c, w10, 5 * n8);
  mpz_copy (c, w9,  6 * n8);
  mpz_copy (c, w8,  7 * n8);
  mpz_copy (c, w7,  8 * n8);
  mpz_copy (c, w6,  9 * n8);
  mpz_copy (c, w5, 10 * n8);
  mpz_copy (c, w4, 11 * n8);
  mpz_copy (c, w3, 12 * n8);
  mpz_copy (c, w2, 13 * n8);
  mpz_copy (c, w1, 14 * n8);

  /** Remove reordering defines **/
#undef w15
#undef w14
#undef w13
#undef w12
#undef w11
#undef w10
#undef w9
#undef w8
#undef w7
#undef w6
#undef w5
#undef w4
#undef w3
#undef w2
#undef w1

  if (sign < 0)
    mpz_neg (c, c);

  for ( i = 0; i < 15; i++ )
    mpz_clear(r[i]);
}

/* Implementation of Toom-Cook algorithm 9-way.

  Written by Alberto Zanoni, April 2009.
 
  The (17x17) matrix to be inverted is given by the following gp code

  print1("[ 1 ");              \
  for(i=1,16,print1("0 "));    \
  forstep(i=7,0,-1,            \
   print1("]\n[ ");            \
   if (i<7,                    \
   forstep(j=16,0,-1,          \
    if (i*j == 0,              \
     print1((-1)^(j)," "),     \
       if(j == 1,              \
       print1("-",2^i," "),    \
       if ( j%2,               \
        print1("-"),           \
        print1(""));           \
       print1("2^(",i*j,") " );\
    )));                       \
   print1("]\n[ ");            \
   );                          \
   forstep(j=16,0,-1,          \
    if (i*j == 0,              \
     print1("1 "),             \
       if(j == 1,              \
       print1(" ",2^i," "),    \
       if ( j%2,               \
        print1(" "));          \
       print1("2^(",i*j,") " );\
    )));                       \
  );                           \
  print1("]\n[");              \
  for(i=1,16,print1(" 0"));    \
  print1(" 1 ]");

[1       0      0       0      0       0      0       0      0       0      0       0      0       0      0        0 0]
[2^(112) 2^(105) 2^(98) 2^(91) 2^(84)  2^(77) 2^(70)  2^(63) 2^(56)  2^(49) 2^(42)  2^(35) 2^(28)  2^(21) 2^(14) 128 1]
[2^(96) -2^(90) 2^(84) -2^(78) 2^(72) -2^(66) 2^(60) -2^(54) 2^(48) -2^(42) 2^(36) -2^(30) 2^(24) -2^(18) 2^(12) -64 1]
[2^(96)  2^(90) 2^(84)  2^(78) 2^(72)  2^(66) 2^(60)  2^(54) 2^(48)  2^(42) 2^(36)  2^(30) 2^(24)  2^(18) 2^(12)  64 1]
[2^(80) -2^(75) 2^(70) -2^(65) 2^(60) -2^(55) 2^(50) -2^(45) 2^(40) -2^(35) 2^(30) -2^(25) 2^(20) -2^(15) 2^(10) -32 1]
[2^(80)  2^(75) 2^(70)  2^(65) 2^(60)  2^(55) 2^(50)  2^(45) 2^(40)  2^(35) 2^(30)  2^(25) 2^(20)  2^(15) 2^(10)  32 1]
[2^(64) -2^(60) 2^(56) -2^(52) 2^(48) -2^(44) 2^(40) -2^(36) 2^(32) -2^(28) 2^(24) -2^(20) 2^(16) -2^(12) 2^(8)  -16 1]
[2^(64)  2^(60) 2^(56)  2^(52) 2^(48)  2^(44) 2^(40)  2^(36) 2^(32)  2^(28) 2^(24)  2^(20) 2^(16)  2^(12) 2^(8)   16 1]
[2^(48) -2^(45) 2^(42) -2^(39) 2^(36) -2^(33) 2^(30) -2^(27) 2^(24) -2^(21) 2^(18) -2^(15) 2^(12) -2^(9)  2^(6)   -8 1]
[2^(48)  2^(45) 2^(42)  2^(39) 2^(36)  2^(33) 2^(30)  2^(27) 2^(24)  2^(21) 2^(18)  2^(15) 2^(12)  2^(9)  2^(6)    8 1]
[2^(32) -2^(30) 2^(28) -2^(26) 2^(24) -2^(22) 2^(20) -2^(18) 2^(16) -2^(14) 2^(12) -2^(10) 2^(8)  -2^(6)  2^(4)   -4 1]
[2^(32)  2^(30) 2^(28)  2^(26) 2^(24)  2^(22) 2^(20)  2^(18) 2^(16)  2^(14) 2^(12)  2^(10) 2^(8)   2^(6)  2^(4)    4 1]
[2^(16) -2^(15) 2^(14) -2^(13) 2^(12) -2^(11) 2^(10) -2^(9)  2^(8)  -2^(7)  2^(6)  -2^(5)  2^(4)  -2^(3)  2^(2)   -2 1]
[2^(16)  2^(15) 2^(14)  2^(13) 2^(12)  2^(11) 2^(10)  2^(9)  2^(8)   2^(7)  2^(6)   2^(5)  2^(4)   2^(3)  2^(2)    2 1]
[1      -1      1      -1      1      -1      1      -1      1      -1      1      -1      1      -1      1       -1 1]
[1       1      1       1      1       1      1       1      1       1      1       1      1       1      1        1 1]
[0       0      0       0      0       0      0       0      0       0      0       0      0       0      0        0 1]
*/

void
mpz_tc9 (mpz_t c, mpz_t a, mpz_t b)
{
  const unsigned int OFFSET = 1;
  mp_size_t n = mpz_size(a), n9;
  mpz_t r[17], a0, a1, a2, a3, a4, a5, a6, a7, a8, b0, b1, b2, b3, b4, b5, b6, b7,b8;
  int sign, i=0, j=0, jp=0;

  if (mpz_size (b) > n)
    {
      mpz_tc9 (c, b, a);
      return;
    }

  if (n < TC9_THRESHOLD)
    {
      mpz_mul (c, a, b);
      return;
    }
  else
    n9 = (n - 1) / 9 + 1;

  sign = mpz_sgn (a) * mpz_sgn (b);
  for( i = 0; i < 17; i++ )
    mpz_init (r[i]);

  /***************************** Decomposition *******************************/
  my_mpz_init (a0, a,      0, n9);        my_mpz_init (b0, b,      0, n9);             
  my_mpz_init (a1, a,     n9, n9);        my_mpz_init (b1, b,     n9, n9); 
  my_mpz_init (a2, a, 2 * n9, n9);        my_mpz_init (b2, b, 2 * n9, n9); 
  my_mpz_init (a3, a, 3 * n9, n9);        my_mpz_init (b3, b, 3 * n9, n9); 
  my_mpz_init (a4, a, 4 * n9, n9);	  my_mpz_init (b4, b, 4 * n9, n9);
  my_mpz_init (a5, a, 5 * n9, n9);	  my_mpz_init (b5, b, 5 * n9, n9);
  my_mpz_init (a6, a, 6 * n9, n9);	  my_mpz_init (b6, b, 6 * n9, n9);
  my_mpz_init (a7, a, 7 * n9, n9);	  my_mpz_init (b7, b, 7 * n9, n9);
  my_mpz_init (a8, a, 8 * n9, n9);	  my_mpz_init (b8, b, 8 * n9, n9);

  /********************** Evaluation and recursive calls *********************/

  for(i=1; i <= 6; i++)  // Evaluation in some powers of 2 and -2.
    {
      j++; jp += 2;
      mpz_set (r[17 - OFFSET], a1);                     // a factor.
      mpz_addmul_ui(r[17 - OFFSET], a3, 1<<jp);
      mpz_addmul_ui(r[17 - OFFSET], a5, 1<<(2*jp));
      if ( i <= 5 )
	mpz_addmul_ui(r[17 - OFFSET], a7, 1<<(3*jp));
      else
	{
	  mpz_mul_2exp(r[2 - OFFSET], a7, 3*jp);
	  mpz_add( r[17 - OFFSET], r[17 - OFFSET], r[2 - OFFSET]);
	}
      mpz_mul_2exp (r[17 - OFFSET], r[17 - OFFSET], i);

      mpz_set (r[1 - OFFSET], a0);
      mpz_addmul_ui(r[1 - OFFSET], a2, 1<<jp);
      mpz_addmul_ui(r[1 - OFFSET], a4, 1<<(2*jp));
      if ( i <= 3 )
	{
	  mpz_addmul_ui(r[ 1 - OFFSET], a6, 1<<(3*jp));
	  mpz_addmul_ui(r[ 1 - OFFSET], a8, 1<<(4*jp));
	}
      else
	{
	  mpz_mul_2exp(r[2 - OFFSET], a6, 3*jp);
	  mpz_add( r[1 - OFFSET], r[1 - OFFSET], r[2 - OFFSET]);
	  mpz_mul_2exp(r[2 - OFFSET], a8, 4*jp);
	  mpz_add( r[1 - OFFSET], r[1 - OFFSET], r[2 - OFFSET]);
	}

      mpz_sub(r[2 - OFFSET], r[1-OFFSET], r[17 - OFFSET]); // -2^i
      mpz_add(r[1 - OFFSET], r[1-OFFSET], r[17 - OFFSET]); //  2^i

      ////////////////////////////////////////////////////////////////////////

      mpz_set (r[17 - OFFSET], b1);                     // b factor.
      mpz_addmul_ui(r[17 - OFFSET], b3, 1<<jp);
      mpz_addmul_ui(r[17 - OFFSET], b5, 1<<(2*jp));
       if ( i <= 5 )
	 mpz_addmul_ui(r[17 - OFFSET], b7, 1<<(3*jp));
       else
	{
	  mpz_mul_2exp(r[3 - OFFSET], b7, 3*jp);
	  mpz_add( r[17 - OFFSET], r[17 - OFFSET], r[3 - OFFSET]);
	}

      mpz_mul_2exp (r[17 - OFFSET], r[17 - OFFSET], i);

      mpz_set (r[16 - OFFSET], b0);
      mpz_addmul_ui(r[16 - OFFSET], b2, 1<<jp);
      mpz_addmul_ui(r[16 - OFFSET], b4, 1<<(2*jp));
      if ( i <= 3 )
	{
	  mpz_addmul_ui(r[16 - OFFSET], b6, 1<<(3*jp));
	  mpz_addmul_ui(r[16 - OFFSET], b8, 1<<(4*jp));
	}
      else
	{
	  mpz_mul_2exp(r[3 - OFFSET], b6, 3*jp);
	  mpz_add( r[16 - OFFSET], r[16 - OFFSET], r[3 - OFFSET]);
	  mpz_mul_2exp(r[3 - OFFSET], b8, 4*jp);
	  mpz_add( r[16 - OFFSET], r[16 - OFFSET], r[3 - OFFSET]);
	}

      mpz_add(r[3 - OFFSET], r[16-OFFSET], r[17 - OFFSET]);
      RECURSE(mpz_tc9) ( r[17-2-j - OFFSET], r[1 - OFFSET], r[ 3 - OFFSET] ); // Evaluation in 2^i

      mpz_sub(r[17 - OFFSET], r[16 - OFFSET], r[17 - OFFSET]);
      j++;
      RECURSE(mpz_tc9) ( r[17-2-j - OFFSET], r[2 - OFFSET], r[17 - OFFSET] ); // Evaluation in -2^i
  }

  // In 1 and -1.

  mpz_add(r[16 - OFFSET], a0, a2);              mpz_add(r[17 - OFFSET], a1, a3);
  mpz_add(r[16 - OFFSET], r[16 - OFFSET], a4);  mpz_add(r[17 - OFFSET], r[17 - OFFSET], a5);
  mpz_add(r[16 - OFFSET], r[16 - OFFSET], a6);  mpz_add(r[17 - OFFSET], r[17 - OFFSET], a7);
  mpz_add(r[16 - OFFSET], r[16 - OFFSET], a8);

  mpz_add(r[ 1 - OFFSET], r[16 - OFFSET], r[17 - OFFSET]);
  mpz_sub(r[16 - OFFSET], r[16 - OFFSET], r[17 - OFFSET]);

  /////////////////////////////////////////////////////////

  mpz_add(r[17 - OFFSET], b0, b2);              mpz_add(r[15 - OFFSET], b1, b3);
  mpz_add(r[17 - OFFSET], r[17 - OFFSET], b4);  mpz_add(r[15 - OFFSET], r[15 - OFFSET], b5);
  mpz_add(r[17 - OFFSET], r[17 - OFFSET], b6);  mpz_add(r[15 - OFFSET], r[15 - OFFSET], b7);
  mpz_add(r[17 - OFFSET], r[17 - OFFSET], b8);

  mpz_add(r[ 2 - OFFSET], r[17 - OFFSET], r[15 - OFFSET]);
  mpz_sub(r[17 - OFFSET], r[17 - OFFSET], r[15 - OFFSET]);
  
  RECURSE(mpz_tc9) ( r[15 - OFFSET], r[16 - OFFSET], r[17 - OFFSET] ); // Evaluation in -1.
  RECURSE(mpz_tc9) ( r[16 - OFFSET], r[ 1 - OFFSET], r[ 2 - OFFSET] ); // Evaluation in  1.

  // In 2^7 = 128.

  mpz_set(r[17 - OFFSET], a5);
  mpz_addmul_ui(r[17 - OFFSET], a6, 1<<   7);
  mpz_addmul_ui(r[17 - OFFSET], a7, 1<<(2*7));
  mpz_addmul_ui(r[17 - OFFSET], a8, 1<<(3*7));
  mpz_mul_2exp(r[17 - OFFSET], r[17 - OFFSET], 5*7);

  mpz_set(r[2 - OFFSET], b5);
  mpz_addmul_ui(r[2 - OFFSET], b6, 1<<7);
  mpz_addmul_ui(r[2 - OFFSET], b7, 1<<(2*7));
  mpz_addmul_ui(r[2 - OFFSET], b8, 1<<(3*7));
  mpz_mul_2exp(r[2 - OFFSET], r[2 - OFFSET], 5*7);

  mpz_add(r[1 - OFFSET], r[17 - OFFSET], a0);  mpz_add(r[17 - OFFSET], r[2 - OFFSET], b0);
  mpz_addmul_ui(r[1 - OFFSET], a1, 1<<7);      mpz_addmul_ui(r[17 - OFFSET], b1, 1<<7);
  mpz_addmul_ui(r[1 - OFFSET], a2, 1<<(2*7));  mpz_addmul_ui(r[17 - OFFSET], b2, 1<<(2*7));
  mpz_addmul_ui(r[1 - OFFSET], a3, 1<<(3*7));  mpz_addmul_ui(r[17 - OFFSET], b3, 1<<(3*7));
  mpz_addmul_ui(r[1 - OFFSET], a4, 1<<(4*7));  mpz_addmul_ui(r[17 - OFFSET], b4, 1<<(4*7));

  RECURSE(mpz_tc9) (r[2 - OFFSET], r[1 - OFFSET], r[17 - OFFSET]); // Evaluation in 2^7 = 128.

  // In (1/0).
  RECURSE(mpz_tc9) (r[1  - OFFSET], a8, b8); // Evaluation in 1/0 

  // Reordering defines
#define w1    r[0]
#define w2    r[1]
#define w3    r[2]
#define w4    r[3]
#define w5    r[4]
#define w6    r[5]
#define w7    r[6]
#define w8    r[7]
#define w9    r[8]
#define w10   r[9]
#define w11   r[10]
#define w12   r[11]
#define w13   r[12]
#define w14   r[13]
#define w15   r[14]
#define w16   r[15]
#define w17   r[16]

  /******************************* Interpolation *****************************/

 // Decoupling.

  mpz_sub(w4,  w4,  w3);  mpz_sub(w2,  w2,  w4);  mpz_div_2exp(w4,  w4,  1); mpz_add(w3,  w3,  w4);
  mpz_sub(w6,  w6,  w5);  mpz_sub(w4,  w4,  w6);  mpz_div_2exp(w6,  w6,  1); mpz_add(w5,  w5,  w6);
  mpz_sub(w8,  w8,  w7);  mpz_sub(w6,  w6,  w8);  mpz_div_2exp(w8,  w8,  1); mpz_add(w7,  w7,  w8);
  mpz_sub(w10, w10, w9);  mpz_sub(w8,  w8,  w10); mpz_div_2exp(w10, w10, 1); mpz_add(w9,  w9,  w10);
  mpz_sub(w12, w12, w11); mpz_sub(w10, w10, w12); mpz_div_2exp(w12, w12, 1); mpz_add(w11, w11, w12);
  mpz_sub(w14, w14, w13); mpz_sub(w12, w12, w14); mpz_div_2exp(w14, w14, 1); mpz_add(w13, w13, w14);
  mpz_sub(w16, w16, w15); mpz_sub(w14, w14, w16); mpz_div_2exp(w16, w16, 1); mpz_add(w15, w15, w16);

  // Remove left entries.

  mpz_sub(w15, w15, w1);
  mpz_submul_ui(w13, w1, 1<<16);                       // w1<<16
  mpz_mul_2exp(w17,  w1,  32); mpz_sub(w11, w11, w17); // w1<<32
  mpz_submul_ui(w9,  w17, 1<<16);                      // w1<<48
  mpz_mul_2exp(w17,  w17, 32); mpz_sub(w7,  w7,  w17); // w1<<64
  mpz_submul_ui(w5,  w17, 1<<16);                      // w1<<80
  mpz_mul_2exp(w17,  w17, 32); mpz_sub(w3,  w3,  w17); // w1<<96
  mpz_submul_ui(w2,  w17, 1<<16);                      // w1<<112

  // Remove right entries.

  RECURSE(mpz_tc9) (r[17 - OFFSET], a0, b0); // Evaluation in 0.
  mpz_sub(w15, w15, w17); mpz_sub(w11, w11, w17); mpz_sub(w7, w7, w17);  mpz_sub(w3, w3, w17);
  mpz_sub(w13, w13, w17); mpz_sub(w9,  w9,  w17); mpz_sub(w5, w5, w17);  mpz_sub(w2, w2, w17);

  mpz_submul_ui(w2, w3, 1<<2); mpz_submul_ui(w2, w4, 1<<3); // 2^128 line.

  // Left triangulation.

  mpz_submul_ui(w3,  w5,  1<<2);
  mpz_submul_ui(w5,  w7,  1<<2);
  mpz_submul_ui(w7,  w9,  1<<2);
  mpz_submul_ui(w9,  w11, 1<<2);
  mpz_submul_ui(w11, w13, 1<<2);
  mpz_submul_ui(w13, w15, 1<<2);   mpz_submul_ui(w2, w3, 1<<4);
  mpz_submul_ui(w3,  w5,  1<<4);
  mpz_submul_ui(w5,  w7,  1<<4);
  mpz_submul_ui(w7,  w9,  1<<4);
  mpz_submul_ui(w9,  w11, 1<<4);
  mpz_submul_ui(w11, w13, 1<<4);   mpz_submul_ui(w2, w3, 1<<6);
  mpz_submul_ui(w3,  w5,  1<<6);
  mpz_submul_ui(w5,  w7,  1<<6);
  mpz_submul_ui(w7,  w9,  1<<6);
  mpz_submul_ui(w9,  w11, 1<<6);   mpz_submul_ui(w2, w3, 1<<8);
  mpz_submul_ui(w3,  w5,  1<<8);
  mpz_submul_ui(w5,  w7,  1<<8);
  mpz_submul_ui(w7,  w9,  1<<8);   mpz_submul_ui(w2, w3, 1<<10);
  mpz_submul_ui(w3,  w5,  1<<10);
  mpz_submul_ui(w5,  w7,  1<<10);  mpz_submul_ui(w2, w3, 1<<12);
  mpz_submul_ui(w3,  w5,  1<<12);  mpz_submul_ui(w2, w3, 1<<14);

  mpz_submul_ui(w4,  w6,  1<<3);
  mpz_submul_ui(w6,  w8,  1<<3);
  mpz_submul_ui(w8,  w10, 1<<3);
  mpz_submul_ui(w10, w12, 1<<3);
  mpz_submul_ui(w12, w14, 1<<3);   mpz_submul_ui(w2, w4, 1<<5);
  mpz_submul_ui(w4,  w6,  1<<5);
  mpz_submul_ui(w6,  w8,  1<<5);
  mpz_submul_ui(w8,  w10, 1<<5);
  mpz_submul_ui(w10, w12, 1<<5);   mpz_submul_ui(w2, w4, 1<<7);
  mpz_submul_ui(w4,  w6,  1<<7);
  mpz_submul_ui(w6,  w8,  1<<7);
  mpz_submul_ui(w8,  w10, 1<<7);   mpz_submul_ui(w2, w4, 1<<9);
  mpz_submul_ui(w4,  w6,  1<<9);
  mpz_submul_ui(w6,  w8,  1<<9);   mpz_submul_ui(w2, w4, 1<<11);
  mpz_submul_ui(w4,  w6,  1<<11);  mpz_submul_ui(w2, w4, 1<<13);

  // Divisions.

  mpz_divexact_ui(w14, w14, 6);
  mpz_divexact_ui(w13, w13, 12);
  mpz_divexact_ui(w12, w12, 720);
  mpz_divexact_ui(w11, w11, 2880);                                 // = 2^6*3*5
  mpz_divexact_ui(w10, w10, 1451520);                              // = 2^9*3^4*5*7
  mpz_divexact_ui(w9,  w9,  11612160);                             // = 2^12*3^4*5*7
								  
  mpz_div_2exp(w8, w8, 16);   mpz_divexact_ui(w8, w8, 722925);     // = 3^5*5^2*7*17
  mpz_div_2exp(w7, w7, 20);   mpz_divexact_ui(w7, w7, 722925);	  
  mpz_div_2exp(w6, w6, 25);   mpz_divexact_ui(w6, w6, 739552275);  // = 3^6*5^2*7*11*17*31
  mpz_div_2exp(w5, w5, 30);   mpz_divexact_ui(w5, w5, 739552275); 
  mpz_div_2exp(w4, w4, 36);   mpz_divexact_ui(w4, w4, 314645877);  // = 3^8*7*13*17*31
                              mpz_divexact_ui(w4, w4, 9625);       // = 5^3*7*11
  mpz_div_2exp(w3, w3, 42);   mpz_divexact_ui(w3, w3, 314645877);  // = 3^8*7*13*17*31
                              mpz_divexact_ui(w3, w3, 9625);       // = 5^3*7*11
  mpz_div_2exp(w2, w2, 49);   mpz_divexact_ui(w2, w2, 1326142125); // = 3^9*5^3*7^2*11
                              mpz_divexact_ui(w2, w2, 37413311);   // = 13*17*31*43*127

  // Backward substitution.			      

  mpz_submul_ui(w9, w3, 376805);
  mpz_sub(w11, w11, w9);          mpz_sub(w13, w13, w5);           // First submatrix.
  mpz_submul_ui(w5, w3, 1365);    mpz_submul_ui(w7, w5, 341);
  mpz_sub(w11, w11, w7);          mpz_submul_ui(w7, w3, 93093);
  mpz_submul_ui(w9, w5, 5797);    mpz_submul_ui(w13, w5, 340);
  mpz_sub(w13, w13, w9);          mpz_submul_ui(w9, w7, 85);
  mpz_submul_ui(w11, w7, 271);    mpz_sub(w13, w13, w11);
  mpz_submul_ui(w11, w9, 20);     mpz_submul_ui(w13, w11, 1<<2);

  mpz_sub(w10, w10, w8);          mpz_sub(w12, w12,  w6);          // Second submatrix.
  mpz_sub(w14, w14, w4);          mpz_submul_ui(w8,  w4, 4433);
  mpz_submul_ui(w6,  w4, 273);    mpz_submul_ui(w4,  w2, 5461);
  mpz_submul_ui(w6,  w4, 1092);   mpz_submul_ui(w8,  w4, 88660);
  mpz_submul_ui(w10, w4, 283712); mpz_submul_ui(w12, w4, 91728);
  mpz_submul_ui(w14, w4, 1364);   mpz_sub(w14, w14, w8);
  mpz_submul_ui(w10, w6, 5456);   mpz_submul_ui(w8,  w6, 341);
  mpz_submul_ui(w12, w6, 5796);

  mpz_sub(w14, w14, w10);         mpz_submul_ui(w12, w8, 357);
  mpz_submul_ui(w10, w8,  84);    mpz_submul_ui(w12, w10, 21);
  mpz_submul_ui(w14, w10, 20);    mpz_submul_ui(w14, w12,  5);

  mpz_sub(w15, w15, w3);          mpz_sub(w16, w16, w2);           // Last subtractions.
  mpz_sub(w15, w15, w5);          mpz_sub(w16, w16, w4); 
  mpz_sub(w15, w15, w7);          mpz_sub(w16, w16, w6); 
  mpz_sub(w15, w15, w9);          mpz_sub(w16, w16, w8); 
  mpz_sub(w15, w15, w11);         mpz_sub(w16, w16, w10);
  mpz_sub(w15, w15, w13);         mpz_sub(w16, w16, w12);    mpz_sub(w16, w16, w14);

  // Result reconstruction.

  _mpz_realloc (c, 2 * n);
  mpz_set  (c, w17);
  mpz_copy (c, w16,     n9);
  mpz_copy (c, w15, 2 * n9);
  mpz_copy (c, w14, 3 * n9);
  mpz_copy (c, w13, 4 * n9);
  mpz_copy (c, w12, 5 * n9);
  mpz_copy (c, w11, 6 * n9);
  mpz_copy (c, w10, 7 * n9);
  mpz_copy (c, w9,  8 * n9);
  mpz_copy (c, w8,  9 * n9);
  mpz_copy (c, w7, 10 * n9);
  mpz_copy (c, w6, 11 * n9);
  mpz_copy (c, w5, 12 * n9);
  mpz_copy (c, w4, 13 * n9);
  mpz_copy (c, w3, 14 * n9);
  mpz_copy (c, w2, 15 * n9);
  mpz_copy (c, w1, 16 * n9);

  /** Remove reordering defines **/
#undef w17
#undef w16
#undef w15
#undef w14
#undef w13
#undef w12
#undef w11
#undef w10
#undef w9
#undef w8
#undef w7
#undef w6
#undef w5
#undef w4
#undef w3
#undef w2
#undef w1

  if (sign < 0)
    mpz_neg (c, c);

  for ( i = 0; i < 17; i++ )
    mpz_clear(r[i]);
}


/* Main! compute the timings. */

int
main (int argc, char *argv[])
{
  mp_size_t n1 = atoi (argv[1]),
            n2 = n1;
  mpz_t a, b, c, d, e, f;
  int st, k, k1 , i, range, time_mpz, time_mpz_sqr,
      time_tc8, time_tc9, time_tc8_sqr, time_tc8_sqr_bis;

  if (argc > 3)
    n2 = atoi (argv[2]);

  k = (argc > 2) ? atoi (argv[argc-1]) : 1;
  k1 = k;

  mpz_init(a); mpz_init(b); mpz_init(c); mpz_init(d); mpz_init(e); mpz_init(f);

  printf ("Testing sizes from a: %d , b: %d\n", (int) n1, (int) n1);
  printf ("TC8_THRESHOLD = %d\tRepetitions = %d\t\tFFT Threshold = %d\n", TC8_THRESHOLD,k, MUL_FFT_THRESHOLD);
  printf ("TC9_THRESHOLD = %d\tKaratsuba Threshold = %d\tToom-3 Threshold = %d\tToom-4 Threshold = %d\n", TC9_THRESHOLD, MUL_KARATSUBA_THRESHOLD, MUL_TOOM3_THRESHOLD, MUL_TOOM44_THRESHOLD);
  printf ("Kar. Sqr Thr. = %d\tToom-3 Sqr Threshold = %d\tToom-4 Sqr Thr. = %d\tFFT Sqr Threshold = %d\n", SQR_KARATSUBA_THRESHOLD, SQR_TOOM3_THRESHOLD, SQR_TOOM4_THRESHOLD, SQR_FFT_THRESHOLD);
  printf("Limbs\tmpz_mul\tmpz_tc8\tmpz_tc9\tmpz_sqr\ttc8_sqr\tToom-8\tToom-9\tT8-sqr\n");

  // NOTE: PUT k1 = 0 HERE, TO DO ONLY SQUARE COMPUTATION. COMMENT
  //       IT TO DO ALSO MULTIPLICATION TESTS.
  //  k1 = 0;
  
  for (range = TC8_SQR_THRESHOLD - 100; range <= 10200; range += 10)
    {
      mpz_random (a, range);
      mpz_random (b, range);

      printf ("%d\t", range);
      st = cputime ();
      for (i = 0; i < k1; i++)
	mpz_mul (c, a, b);
      //  printf ("mpz_mul took %d ms\n", cputime () - st);
      time_mpz = cputime () - st;
      printf ("%d\t", time_mpz);
   
      st = cputime ();
      for (i = 0; i < k1; i++)
	mpz_tc8 (d, a, b);
      //  printf ("mpz_tc8 took %d ms\n", cputime () - st);
      time_tc8 = cputime () - st;
      printf ("%d\t", time_tc8);

      if (k1 && mpz_cmp (c, d)) // Check correctness of mpz_tc8
	{
	  printf ("a="); OUT(a);
	  printf ("b="); OUT(b);
	  printf ("mpz_mul gives "); OUT(c);
	  printf ("mpz_tc8 gives "); OUT(d);
	  exit (1);
	}

      st = cputime ();
      for (i = 0; i < k1; i++)
	mpz_tc9 (e, a, b);
      //  printf ("mpz_tc9 took %d ms\n", cputime () - st);
      time_tc9 = cputime () - st;
      printf ("%d\t", time_tc9);

      if (k1 && mpz_cmp (c, e)) // Check correctness of mpz_tc9
	{
	  printf ("a="); OUT(a);
	  printf ("b="); OUT(b);
	  printf ("mpz_mul gives "); OUT(c);
	  printf ("mpz_tc9 gives "); OUT(e);
	  exit (1);
	}

      // mpz squaring.
      st = cputime ();
      for (i = 0; i < k; i++)
	mpz_mul (c, a, a);
      //  printf ("mpz_mul took %d ms\n", cputime () - st);
      time_mpz_sqr = cputime () - st;
      printf ("%d\t", time_mpz_sqr);
   
      // Toom-8 squaring.
      st = cputime ();
      for (i = 0; i < k; i++)
	mpz_tc8_sqr (f, a);
      time_tc8_sqr = cputime () - st;
      printf ("%d\t", time_tc8_sqr);

      if (mpz_cmp (c, f)) // Check correctness of mpz_tc8_sqr
	{
	  printf ("a="); OUT(a);
	  printf ("b="); OUT(b);
	  printf ("mpz_mul     gives "); OUT(c);
	  printf ("mpz_tc8_sqr gives "); OUT(f);
	  exit (1);
	}

      // Stampa guadagno percentuale per tc8.
      if ( time_tc8 == 0 )
	printf("0\t");
      else if ( time_mpz == 0 )
	printf("2000\t");
      else
	printf ("%.2f\t", 100*((float)time_tc8/time_mpz));
   
      // Stampa guadagno percentuale per tc9.
      if ( time_tc9 == 0 )
	printf("0\t");
      else if ( time_mpz == 0 )
	printf("2000\t");
      else
	printf ("%.2f\t", 100*((float)time_tc9/time_mpz));
   
      // Stampa guadagno percentuale per tc8_sqr.
      if ( time_tc8_sqr == 0 )
	printf("0\n");
      else if ( time_mpz_sqr == 0 )
	printf("2000\n");
      else
	printf ("%.2f\n", 100*((float)time_tc8_sqr/time_mpz_sqr));
      
    }
  mpz_clear(a); mpz_clear(b); mpz_clear(c); mpz_clear(d); mpz_clear(e); mpz_clear(f);

  return 0;
}


