LCOV - code coverage report
Current view: top level - ecm - ks-multiply.c (source / functions) Hit Total Coverage
Test: unnamed Lines: 279 306 91.2 %
Date: 2022-03-21 11:19:20 Functions: 14 14 100.0 %

          Line data    Source code
       1             : /* Polynomial multiplication using GMP's integer multiplication code
       2             : 
       3             : Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Dave Newman,
       4             : Paul Zimmermann, Alexander Kruppa.
       5             : 
       6             : This file is part of the ECM Library.
       7             : 
       8             : The ECM Library is free software; you can redistribute it and/or modify
       9             : it under the terms of the GNU Lesser General Public License as published by
      10             : the Free Software Foundation; either version 3 of the License, or (at your
      11             : option) any later version.
      12             : 
      13             : The ECM Library is distributed in the hope that it will be useful, but
      14             : WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      15             : or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
      16             : License for more details.
      17             : 
      18             : You should have received a copy of the GNU Lesser General Public License
      19             : along with the ECM Library; see the file COPYING.LIB.  If not, see
      20             : http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
      21             : 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
      22             : 
      23             : #include <stdlib.h>
      24             : #include "ecm-gmp.h" /* for MPZ_REALLOC and MPN_COPY */
      25             : #include "ecm-impl.h"
      26             : 
      27             : #if defined(HAVE___GMPN_MULMOD_BNM1) && defined(HAVE___GMPN_MULMOD_BNM1_NEXT_SIZE)
      28             : #define FFT_WRAP /* use the wrap-around trick */
      29             : #endif
      30             : 
      31             : /* Copy at r+i*s the content of A[i*stride] for 0 <= i < l
      32             :    Assume all A[i*stride] are non-negative, and their size is <= s.
      33             :  */
      34             : static void
      35    13378606 : pack (mp_ptr r, mpz_t *A, mp_size_t l, mp_size_t stride, mp_size_t s)
      36             : {
      37             :   mp_size_t i, j, m;
      38             : 
      39    83959952 :   for (i = 0, j = 0; i < l; i++, j += stride, r += s)
      40             :     {
      41    70581346 :       m = SIZ(A[j]);
      42             :       ASSERT((0 <= m) && (m <= s));
      43    70581346 :       if (m)
      44    70564129 :         MPN_COPY (r, PTR(A[j]), m);
      45    70581346 :       if (m < s)
      46   788761600 :         MPN_ZERO (r + m, s - m);
      47             :     }
      48    13378606 : }
      49             : 
      50             : /* put in R[i*stride] for 0 <= i < l the content of {t+i*s, s} */
      51             : void
      52     6689303 : unpack (mpz_t *R, mp_size_t stride, mp_ptr t, mp_size_t l, mp_size_t s)
      53             : {
      54             :   mp_size_t i, j, size_tmp;
      55             :   mp_ptr r_ptr;
      56             : 
      57    72553853 :   for (i = 0, j = 0; i < l; i++, t += s, j += stride)
      58             :     {
      59    65864550 :       size_tmp = s;
      60   108211472 :       MPN_NORMALIZE(t, size_tmp); /* compute the actual size */
      61    65864550 :       r_ptr = MPZ_REALLOC (R[j], size_tmp);
      62    65864550 :       if (size_tmp)
      63    65847333 :         MPN_COPY (r_ptr, t, size_tmp);
      64    65864550 :       SIZ(R[j]) = size_tmp;
      65             :     }
      66     6689303 : }
      67             : 
      68             : /* R <- A * B where A = A[0] + A[1]*x + ... + A[n-1]*x^(n-1), idem for B */
      69             : void
      70    14996192 : list_mul_n_basecase (listz_t R, listz_t A, listz_t B, unsigned int n)
      71             : {
      72             :   unsigned int i, j;
      73             : 
      74    14996192 :   if (n == 1)
      75             :     {
      76    10150554 :       mpz_mul (R[0], A[0], B[0]);
      77    10150554 :       return;
      78             :     }
      79             : 
      80    28554736 :   for (i = 0; i < n; i++)
      81   243644756 :     for (j = 0; j < n; j++)
      82             :       {
      83   219935658 :         if (i == 0 || j == n - 1)
      84    42572558 :           mpz_mul (R[i+j], A[i], B[j]);
      85             :         else
      86   177363100 :           mpz_addmul (R[i+j], A[i], B[j]);
      87             :       }
      88             : }
      89             : 
      90             : static void
      91    23379158 : list_mul_n_kara2 (listz_t R, listz_t A, listz_t B)
      92             : {
      93    23379158 :   mpz_add (R[0], A[0], A[1]);
      94    23379158 :   mpz_add (R[2], B[0], B[1]);
      95    23379158 :   mpz_mul (R[1], R[0], R[2]);
      96    23379158 :   mpz_mul (R[0], A[0], B[0]);
      97    23379158 :   mpz_mul (R[2], A[1], B[1]);
      98    23379158 :   mpz_sub (R[1], R[1], R[0]);
      99    23379158 :   mpz_sub (R[1], R[1], R[2]);
     100    23379158 : }
     101             : 
     102             : /* R[0..4] <- A[0..2] * B[0..2] in 7 multiplies */
     103             : static void
     104     7503613 : list_mul_n_kara3 (listz_t R, listz_t A, listz_t B, listz_t T)
     105             : {
     106     7503613 :   mpz_add (T[0], A[0], A[2]);
     107     7503613 :   mpz_add (R[0], B[0], B[2]);
     108     7503613 :   mpz_mul (R[2], T[0], R[0]); /* (A0+A2)*(B0+B2) */
     109     7503613 :   mpz_mul (R[3], T[0], B[1]); /* (A0+A2)*B1 */
     110     7503613 :   mpz_mul (R[4], A[1], R[0]); /* A1*(B0+B2) */
     111     7503613 :   mpz_add (R[3], R[3], R[4]); /* (A0+A2)*B1+A1*(B0+B2) */
     112     7503613 :   list_mul_n_kara2 (T, A, B);
     113     7503613 :   mpz_sub (R[2], R[2], T[0]); /* A0*A2+A2*B0+A2*B2 */
     114     7503613 :   mpz_sub (R[3], R[3], T[1]); /* A2*B1+A1*B2 */
     115     7503613 :   mpz_add (R[2], R[2], T[2]); /* A0*A2+A2*B0+A2*B2+A1*B1 */
     116     7503613 :   mpz_swap (R[0], T[0]);      /* A0*B0 */
     117     7503613 :   mpz_swap (R[1], T[1]);      /* A0*B1+A1*B0 */
     118     7503613 :   mpz_mul (R[4], A[2], B[2]); /* A2*B2 */
     119     7503613 :   mpz_sub (R[2], R[2], R[4]); /* A0*A2+A2*B0+A1*B1 */
     120     7503613 : }
     121             : 
     122             : /* Assume n >= 2. T is a scratch space of enough entries. */
     123             : static void
     124    37550927 : list_mul_n_karatsuba_aux (listz_t R, listz_t A, listz_t B, unsigned int n,
     125             :                           listz_t T)
     126             : {
     127             :   unsigned int h, l;
     128             : 
     129    37550927 :   if (n == 1)
     130             :     {
     131     4194303 :       list_mul_n_basecase (R, A, B, n);
     132     4194303 :       return;
     133             :     }
     134             : 
     135    33356624 :   if (n == 2)
     136             :     {
     137    15875545 :       list_mul_n_kara2 (R, A, B);
     138    15875545 :       return;
     139             :     }
     140             : 
     141    17481079 :   if (n == 3)
     142             :     {
     143     7503613 :       list_mul_n_kara3 (R, A, B, T);
     144     7503613 :       return;
     145             :     }
     146             : 
     147     9977466 :   h = n / 2;
     148     9977466 :   l = n - h;
     149     9977466 :   list_add (R, A, A + l, h);
     150     9977466 :   list_add (R + l, B, B + l, h);
     151     9977466 :   if (h < l)
     152             :     {
     153     3145606 :       mpz_set (R[h], A[h]);
     154     3145606 :       mpz_set (R[l + h], B[h]);
     155             :     }
     156     9977466 :   list_mul_n_karatsuba_aux (T, R, R + l, l, T + 2 * l - 1);
     157     9977466 :   list_mul_n_karatsuba_aux (R, A, B, l, T + 2 * l - 1);
     158             :   /* {R,2l-1} = Al * Bl */
     159     9977466 :   list_mul_n_karatsuba_aux (R + 2 * l, A + l, B + l, h, T + 2 * l - 1);
     160             :   /* {R+2l,2h-1} = Ah * Bh */
     161             :   /* T will contain Al*Bh+Ah*Bl, it thus suffices to compute its low n-1
     162             :      coefficients */
     163     9977466 :   list_sub (T, T, R, n - 1);
     164     9977466 :   list_sub (T, T, R + 2 * l, 2 * h - 1);
     165     9977466 :   mpz_set_ui (R[2 * l - 1], 0);
     166     9977466 :   list_add (R + l, R + l, T, n - 1);
     167             : }
     168             : 
     169             : static unsigned int
     170    15695717 : list_mul_n_mem (unsigned int n)
     171             : {
     172    15695717 :   if (n == 1)
     173     7618529 :     return 0;
     174             :   else
     175             :     {
     176     8077188 :       unsigned int k = (n + 1) / 2;
     177     8077188 :       return 2 * k - 1 + list_mul_n_mem (k);
     178             :     }
     179             : }
     180             : 
     181             : void
     182     7618529 : list_mul_n_karatsuba (listz_t R, listz_t A, listz_t B, unsigned int n)
     183             : {
     184             :   listz_t T;
     185             :   unsigned int s;
     186             : 
     187     7618529 :   s = list_mul_n_mem (n);
     188     7618529 :   T = init_list (s);
     189     7618529 :   list_mul_n_karatsuba_aux (R, A, B, n, T);
     190     7618529 :   clear_list (T, s);
     191     7618529 : }
     192             : 
     193             : /* Classical one-point Kronecker-Schoenhage substitution.
     194             :    Notes:
     195             :     - this code aligns the coeffs at limb boundaries - if instead we aligned
     196             :       at byte boundaries then we could save up to 3*n bytes,
     197             :       but tests have shown this doesn't give any significant speed increase,
     198             :       even for large degree polynomials.
     199             :      - this code requires that all coefficients A[] and B[] are nonnegative. */
     200             : void
     201     2744289 : list_mul_n_KS1 (listz_t R, listz_t A, listz_t B, unsigned int l)
     202             : {
     203             :   unsigned long i;
     204     2744289 :   mp_size_t s, t = 0, size_t0;
     205             :   mp_ptr t0_ptr, t1_ptr, t2_ptr;
     206             : 
     207             :   /* compute the largest bit-size t of the A[i] and B[i] */
     208    15539697 :   for (i = 0; i < l; i++)
     209             :     {
     210    12795408 :       if ((s = mpz_sizeinbase (A[i], 2)) > t)
     211     3915716 :         t = s;
     212    12795408 :       if ((s = mpz_sizeinbase (B[i], 2)) > t)
     213     2744289 :         t = s;
     214             :     }
     215             :   /* For n > 0, s = sizeinbase (n, 2)     ==> n < 2^s. 
     216             :      For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
     217             :      Hence all A[i], B[i] < 2^t */
     218             :   
     219             :   /* Each coeff of A(x)*B(x) < l * 2^(2*t), so max number of bits in a 
     220             :      coeff of the product will be 2 * t + ceil(log_2(l)) */
     221     2744289 :   s = 2 * t;
     222     7028581 :   for (i = l; i > 1; s++, i = (i + 1) >> 1);
     223             :   
     224             :   /* work out the corresponding number of limbs */
     225     2744289 :   s = 1 + (s - 1) / GMP_NUMB_BITS;
     226             : 
     227     2744289 :   size_t0 = s * l;
     228             : 
     229             :   /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
     230     2744289 :   t0_ptr = (mp_ptr) malloc (4 * size_t0 * sizeof (mp_limb_t));
     231     2744289 :   if (t0_ptr == NULL)
     232             :     {
     233           0 :       outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
     234           0 :       exit (1);
     235             :     }
     236     2744289 :   t1_ptr = t0_ptr + size_t0;
     237     2744289 :   t2_ptr = t1_ptr + size_t0;
     238             :     
     239     2744289 :   pack (t0_ptr, A, l, 1, s);
     240     2744289 :   pack (t1_ptr, B, l, 1, s);
     241             : 
     242     2744289 :   mpn_mul_n (t2_ptr, t0_ptr, t1_ptr, size_t0);
     243             : 
     244     2744289 :   unpack (R, 1, t2_ptr, 2 * l - 1, s);
     245             : 
     246     2744289 :   free (t0_ptr);
     247     2744289 : }
     248             : 
     249             : /* Two-point Kronecker substitition.
     250             :    Reference: Algorithm 2 from "Faster polynomial multiplication via multipoint
     251             :    Kronecker substitution", David Harvey, Journal of Symbolic Computation,
     252             :    number 44 (2009), pages 1502-1510.
     253             :    Assume n >= 2.
     254             :    Notes:
     255             :     - this code aligns the coeffs at limb boundaries - if instead we aligned
     256             :       at byte boundaries then we could save up to 3*n bytes,
     257             :       but tests have shown this doesn't give any significant speed increase,
     258             :       even for large degree polynomials.
     259             :      - this code requires that all coefficients A[] and B[] are nonnegative.
     260             : */
     261             : void
     262     1972507 : list_mul_n_KS2 (listz_t R, listz_t A, listz_t B, unsigned int n)
     263             : {
     264             :   unsigned long i;
     265     1972507 :   mp_size_t s, s2, t = 0, l, h, ns2;
     266             :   mp_ptr tmp, A0, A1, B0, B1, C0, C1;
     267             :   int sA, sB;
     268             : 
     269     1972507 :   ASSERT_ALWAYS (n >= 2);
     270             : 
     271             :   /* compute the largest bit-size t of the A[i] and B[i] */
     272    24467772 :   for (i = 0; i < n; i++)
     273             :     {
     274    22495265 :       if ((s = mpz_sizeinbase (A[i], 2)) > t)
     275     3349528 :         t = s;
     276    22495265 :       if ((s = mpz_sizeinbase (B[i], 2)) > t)
     277     1926605 :         t = s;
     278             :     }
     279             :   /* For n > 0, s = sizeinbase (n, 2)     ==> n < 2^s. 
     280             :      For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
     281             :      Hence all A[i], B[i] < 2^t */
     282             :   
     283             :   /* Each coeff of A(x)*B(x) < n * 2^(2*t), so max number of bits in a 
     284             :      coeff of the product will be 2 * t + ceil(log_2(n)) */
     285     1972507 :   s = 2 * t;
     286     7520683 :   for (i = n; i > 1; s++, i = (i + 1) >> 1);
     287             :   
     288             :   /* work out the corresponding number of limbs */
     289     1972507 :   s = 1 + (s - 1) / GMP_NUMB_BITS;
     290             : 
     291             :   /* ensure s is even */
     292     1972507 :   s = s + (s & 1);
     293     1972507 :   s2 = s >> 1;
     294     1972507 :   ns2 = n * s2;
     295             : 
     296     1972507 :   l = n / 2;
     297     1972507 :   h = n - l;
     298             : 
     299             :   /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
     300     1972507 :   tmp = (mp_ptr) malloc (8 * ns2 * sizeof (mp_limb_t));
     301     1972507 :   if (tmp == NULL)
     302             :     {
     303           0 :       outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
     304           0 :       exit (1);
     305             :     }
     306             : 
     307     1972507 :   A0 = tmp;
     308     1972507 :   A1 = A0 + ns2;
     309     1972507 :   B0 = A1 + ns2;
     310     1972507 :   B1 = B0 + ns2;
     311     1972507 :   C0 = B1 + ns2;
     312     1972507 :   C1 = C0 + 2 * ns2;
     313             : 
     314     1972507 :   pack (A0, A, h, 2, s); /* A0 = Aeven(S) where S = 2^(s*GMP_NUMB_BITS) */
     315             :   /* A0 has in fact only n * s2 significant limbs:
     316             :      if n=2h, h*s = n*s2
     317             :      if n=2h-1, the last chunk from A0 has at most s2 limbs */
     318    21696481 :   MPN_ZERO(B0, s2);
     319     1972507 :   pack (B0 + s2, A + 1, l, 2, s);
     320             :   /* for the same reason as above, we have at most l*s-s2 significant limbs
     321             :      at B0+s2, thus at most l*s <= n*s2 at B0 */
     322     1972507 :   if ((sA = mpn_cmp (A0, B0, ns2)) >= 0)
     323      753928 :     mpn_sub_n (A1, A0, B0, ns2);
     324             :   else
     325     1218579 :     mpn_sub_n (A1, B0, A0, ns2);
     326     1972507 :   mpn_add_n (A0, A0, B0, ns2);
     327             :   /* now A0 is X+ with the notations of Algorithm, A1 is sA*X- */
     328             : 
     329     1972507 :   pack (B0, B, h, 2, s);
     330    21696481 :   MPN_ZERO(C0, s2);
     331     1972507 :   pack (C0 + s2, B + 1, l, 2, s);
     332     1972507 :   if ((sB = mpn_cmp (B0, C0, ns2)) >= 0)
     333      753267 :     mpn_sub_n (B1, B0, C0, ns2);
     334             :   else
     335     1219240 :     mpn_sub_n (B1, C0, B0, ns2);
     336     1972507 :   mpn_add_n (B0, B0, C0, ns2);
     337             :   /* B0 is Y+, B1 is sB*Y- with the notations of Algorithm 2 */
     338             : 
     339     1972507 :   mpn_mul_n (C0, A0, B0, ns2); /* C0 is Z+ = X+ * Y+ */
     340     1972507 :   mpn_mul_n (C1, A1, B1, ns2); /* C1 is sA * sB * Z- */
     341             : 
     342     1972507 :   if (sA * sB >= 0)
     343             :     {
     344     1971698 :       mpn_add_n (A0, C0, C1, 2 * ns2);
     345     1971698 :       mpn_sub_n (B0, C0, C1, 2 * ns2);
     346             :     }
     347             :   else
     348             :     {
     349         809 :       mpn_sub_n (A0, C0, C1, 2 * ns2);
     350         809 :       mpn_add_n (B0, C0, C1, 2 * ns2);
     351             :     }
     352     1972507 :   mpn_rshift (A0, A0, 4 * ns2, 1); /* global division by 2 */
     353             : 
     354             :   /* If A[] and B[] have n coefficients, the product has 2n-1 coefficients.
     355             :      The even part has n coefficients and the odd part n-1 coefficients */
     356     1972507 :   unpack (R, 2, A0, n, s);
     357     1972507 :   unpack (R + 1, 2, B0 + s2, n - 1, s);
     358             : 
     359     1972507 :   free (tmp);
     360     1972507 : }
     361             : 
     362             : /* Puts in R[0..2n-2] the product of A[0..n-1] and B[0..n-1], seen as
     363             :    polynomials.
     364             : */
     365             : void
     366     3845177 : list_mult_n (listz_t R, listz_t A, listz_t B, unsigned int n)
     367             : {
     368     3845177 :   int T[TUNE_LIST_MUL_N_MAX_SIZE] = LIST_MUL_TABLE, best;
     369             : 
     370             :   /* See tune_list_mul_n() in tune.c:
     371             :      0 : list_mul_n_basecase
     372             :      2 : list_mul_n_KS1
     373             :      3 : list_mul_n_KS2 */
     374     3845177 :   best = (n < TUNE_LIST_MUL_N_MAX_SIZE) ? T[n] : 3;
     375             : 
     376     3845177 :   if (best == 0)
     377     3732224 :     list_mul_n_basecase (R, A, B, n);
     378      112953 :   else if (best == 1)
     379           0 :     list_mul_n_karatsuba (R, A, B, n);
     380      112953 :   else if (best == 2)
     381           0 :     list_mul_n_KS1 (R, A, B, n);
     382             :   else
     383      112953 :     list_mul_n_KS2 (R, A, B, n);
     384     3845177 : }
     385             : 
     386             : /* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients
     387             :    of degree m to n+m of rev(a)*c, i.e.
     388             :    b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l)
     389             :    ...
     390             :    b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k)
     391             :    ...
     392             :    b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n].
     393             : 
     394             :    If rev=0, consider a instead of rev(a).
     395             : 
     396             :    Assumes n <= l.
     397             : 
     398             :    Return non-zero if an error occurred.
     399             : 
     400             :    low(b) is the coefficients of degree 0 to m-1 of a*c (or rev(a)*c)
     401             :    mid(b) is the coefficients of degree m to m+n of a*c
     402             :    high(b) is the coefficients of degree m+n+1 to m+l+1 of a*c
     403             : */
     404             : 
     405             : int
     406        8092 : TMulKS (listz_t b, unsigned int n, listz_t a, unsigned int m,
     407             :         listz_t c, unsigned int l, mpz_t modulus, int rev)
     408             : {
     409        8092 :   unsigned long i, s = 0, t;
     410             :   mp_ptr ap, bp, cp;
     411             :   mp_size_t an, bn, cn;
     412        8092 :   int ret = 0; /* default return value */
     413             : #ifdef DEBUG
     414             :   long st = cputime ();
     415             :   fprintf (ECM_STDOUT, "n=%u m=%u l=%u bits=%u n*bits=%u: ", n, m, l,
     416             :            mpz_sizeinbase (modulus, 2), n * mpz_sizeinbase (modulus, 2));
     417             : #endif
     418             : 
     419             :   ASSERT (n <= l); /* otherwise the upper coefficients of b are 0 */
     420        8092 :   if (l > n + m)
     421        8028 :     l = n + m; /* otherwise, c has too many coeffs */
     422             : 
     423             :   /* make coefficients a[] and c[] non-negative and compute max #bits */
     424     2606501 :   for (i = 0; i <= m; i++)
     425             :     {
     426     2598409 :       if (mpz_sgn (a[i]) < 0)
     427           0 :         mpz_mod (a[i], a[i], modulus);
     428     2598409 :       if ((t = mpz_sizeinbase (a[i], 2)) > s)
     429       12761 :         s = t;
     430             :     }
     431     5540945 :   for (i = 0; i <= l; i++)
     432             :     {
     433     5532853 :       if (mpz_sgn (c[i]) < 0)
     434           0 :         mpz_mod (c[i], c[i], modulus);
     435     5532853 :       if ((t = mpz_sizeinbase (c[i], 2)) > s)
     436        4099 :         s = t;
     437             :     }
     438             : 
     439             : #ifdef FFT_WRAP
     440        8092 :   s ++; /* need one extra bit to prevent carry of low(b) + high(b) */
     441             : #endif
     442             : 
     443             :   /* max coeff has 2*s+ceil(log2(min(m+1,l+1))) bits,
     444             :    i.e. 2*s + 1 + floor(log2(min(m,l))) */
     445       39860 :   for (s = 2 * s, i = (m < l) ? m : l; i; s++, i >>= 1);
     446             : 
     447             :   /* corresponding number of limbs */
     448        8092 :   s = 1 + (s - 1) / GMP_NUMB_BITS;
     449             : 
     450        8092 :   an = (m + 1) * s;
     451        8092 :   cn = (l + 1) * s;
     452        8092 :   bn = an + cn;
     453             : 
     454             :   /* a[0..m] needs (m+1) * s limbs */
     455        8092 :   ap = (mp_ptr) malloc (an * sizeof (mp_limb_t));
     456        8092 :   if (ap == NULL)
     457             :     {
     458           0 :       ret = 1;
     459           0 :       goto TMulKS_end;
     460             :     }
     461        8092 :   cp = (mp_ptr) malloc (cn * sizeof (mp_limb_t));
     462        8092 :   if (cp == NULL)
     463             :     {
     464           0 :       ret = 1;
     465           0 :       goto TMulKS_free_ap;
     466             :     }
     467             : 
     468    99209111 :   MPN_ZERO (ap, an);
     469   199916421 :   MPN_ZERO (cp, cn);
     470             : 
     471             :   /* a is reverted */
     472     2606501 :   for (i = 0; i <= m; i++)
     473     2598409 :     if (SIZ(a[i]))
     474     2598397 :       MPN_COPY (ap + ((rev) ? (m - i) : i) * s, PTR(a[i]), SIZ(a[i]));
     475     5540945 :   for (i = 0; i <= l; i++)
     476     5532853 :     if (SIZ(c[i]))
     477     5532837 :       MPN_COPY (cp + i * s, PTR(c[i]), SIZ(c[i]));
     478             : 
     479             : #ifdef FFT_WRAP
     480             :   /* the product rev(a) * c has m+l+1 coefficients.
     481             :      We throw away the first m and the last l-n <= m.
     482             :      If we compute mod (m+n+1) * s limbs, we are ok */
     483        8092 :   bn = mpn_mulmod_bnm1_next_size ((m + n + 1) * s);
     484             :   
     485        8092 :   bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
     486        8092 :   if (bp == NULL)
     487             :     {
     488           0 :       ret = 1;
     489           0 :       goto TMulKS_free_cp;
     490             :     }
     491             :   {
     492             :     mp_ptr tp;
     493        8092 :     tp = (mp_ptr) malloc ((2 * bn + 4) * sizeof (mp_limb_t));
     494        8092 :     if (tp == NULL)
     495             :       {
     496           0 :         ret = 1;
     497           0 :         goto TMulKS_free_cp;
     498             :       }
     499             :     /* mpn_mulmod_bnm1 requires that the first operand is larger */
     500        8092 :     if (an >= cn)
     501        1158 :       mpn_mulmod_bnm1 (bp, bn, ap, an, cp, cn, tp);
     502             :     else
     503        6934 :       mpn_mulmod_bnm1 (bp, bn, cp, cn, ap, an, tp);
     504        8092 :     free (tp);
     505             :   }
     506             : #else /* FFT_WRAP is not defined */
     507             :   bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
     508             :   if (bp == NULL)
     509             :     {
     510             :       ret = 1;
     511             :       goto TMulKS_free_cp;
     512             :     }
     513             :   if (an >= cn)
     514             :     mpn_mul (bp, ap, an, cp, cn);
     515             :   else
     516             :     mpn_mul (bp, cp, cn, ap, an);
     517             : #endif
     518             : 
     519             :   /* recover coefficients of degree m to n+m of product in b[0..n] */
     520        8092 :   bp += m * s;
     521     2950628 :   for (i = 0; i <= n; i++)
     522             :     {
     523     2942536 :       t = s;
     524     3012678 :       MPN_NORMALIZE(bp, t);
     525     2942536 :       MPZ_REALLOC (b[i], (mp_size_t) t);
     526     2942536 :       if (t)
     527     2942536 :         MPN_COPY (PTR(b[i]), bp, t);
     528     2942536 :       SIZ(b[i]) = t;
     529     2942536 :       bp += s;
     530             :     }
     531        8092 :   bp -= (m + n + 1) * s;
     532             : 
     533        8092 :   free (bp);
     534        8092 :  TMulKS_free_cp:
     535        8092 :   free (cp);
     536        8092 :  TMulKS_free_ap:
     537        8092 :   free (ap);
     538             : 
     539             : #ifdef DEBUG
     540             :   fprintf (ECM_STDOUT, "%ldms\n", elltime (st, cputime ()));
     541             : #endif
     542             :   
     543        8092 :  TMulKS_end:
     544        8092 :   return ret;
     545             : }
     546             : 
     547             : unsigned int
     548       75877 : ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n)
     549             : {
     550             : #ifdef FFT_WRAP
     551             :   mp_size_t t, s;
     552             :   unsigned long i, m;
     553             : 
     554       75877 :   t = mpz_sizeinbase (n, 2);
     555       75877 :   s = t * 2 + 1;
     556      327724 :   for (i = k - 1; i; s++, i >>= 1);
     557       75877 :   s = 1 + (s - 1) / GMP_NUMB_BITS;
     558       75877 :   i = mpn_mulmod_bnm1_next_size (m0 * s);
     559      919780 :   while (i % s)
     560      843903 :     i = mpn_mulmod_bnm1_next_size (i + 1);
     561       75877 :   m = i / s;
     562       75877 :   return m;
     563             : #else
     564             :   return ~ (unsigned int) 0;
     565             : #endif
     566             : }
     567             : 
     568             : /* multiply in R[] A[0]+A[1]*x+...+A[k-1]*x^(k-1)
     569             :                 by B[0]+B[1]*x+...+B[l-1]*x^(l-1) modulo n,
     570             :    wrapping around coefficients of the product up from degree m >= m0.
     571             :    Assumes k >= l.
     572             :    R is assumed to have 2*m0-3+list_mul_mem(m0-1) allocated cells.
     573             :    Return m (or 0 if an error occurred).
     574             : */
     575             : unsigned int
     576       75829 : ks_wrapmul (listz_t R, unsigned int m0,
     577             :             listz_t A, unsigned int k,
     578             :             listz_t B, unsigned int l,
     579             :             mpz_t n)
     580             : {
     581             : #ifndef FFT_WRAP
     582             :   ASSERT_ALWAYS(0); /* ks_wrapmul should not be called in that case */
     583             :   return 0;
     584             : #else
     585             :   unsigned long i, m, t;
     586             :   mp_size_t s, size_t0, size_t1, size_tmp;
     587             :   mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr, tp;
     588             : 
     589             :   ASSERT(k >= l);
     590             : 
     591       75829 :   t = mpz_sizeinbase (n, 2);
     592     1061485 :   for (i = 0; i < k; i++)
     593      985656 :     if (mpz_sgn (A[i]) < 0 || mpz_sizeinbase (A[i], 2) > t)
     594           0 :       mpz_mod (A[i], A[i], n);
     595      909827 :   for (i = 0; i < l; i++)
     596      833998 :     if (mpz_sgn (B[i]) < 0 || mpz_sizeinbase (B[i], 2) > t)
     597           0 :       mpz_mod (B[i], B[i], n);
     598             :   
     599       75829 :   s = t * 2 + 1; /* one extra sign bit */
     600      327203 :   for (i = k - 1; i; s++, i >>= 1);
     601             :   
     602       75829 :   s = 1 + (s - 1) / GMP_NUMB_BITS;
     603             : 
     604       75829 :   size_t0 = s * k;
     605       75829 :   size_t1 = s * l;
     606             : 
     607             :   /* allocate one double-buffer to save malloc/MPN_ZERO/free calls */
     608       75829 :   t0_ptr = (mp_ptr) malloc (size_t0 * sizeof (mp_limb_t));
     609       75829 :   if (t0_ptr == NULL)
     610           0 :     return 0;
     611       75829 :   t1_ptr = (mp_ptr) malloc (size_t1 * sizeof (mp_limb_t));
     612       75829 :   if (t1_ptr == NULL)
     613             :     {
     614           0 :       free (t0_ptr);
     615           0 :       return 0;
     616             :     }
     617             :     
     618    26128645 :   MPN_ZERO (t0_ptr, size_t0);
     619    22961289 :   MPN_ZERO (t1_ptr, size_t1);
     620             : 
     621     1061485 :   for (i = 0; i < k; i++)
     622      985656 :     if (SIZ(A[i]))
     623      985656 :       MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i]));
     624      909827 :   for (i = 0; i < l; i++)
     625      833998 :     if (SIZ(B[i]))
     626      833998 :       MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i]));
     627             : 
     628       75829 :   i = mpn_mulmod_bnm1_next_size (m0 * s);
     629             :   /* the following loop ensures we don't cut in the middle of a
     630             :      coefficient */
     631      901205 :   while (i % s)
     632      825376 :     i = mpn_mulmod_bnm1_next_size (i + 1);
     633             :   ASSERT(i % s == 0);
     634       75829 :   m = i / s;
     635             :   ASSERT(m <= 2 * m0 - 3 + list_mul_mem (m0 - 1));
     636             : 
     637       75829 :   t2_ptr = (mp_ptr) malloc ((i + 1) * sizeof (mp_limb_t));
     638       75829 :   if (t2_ptr == NULL)
     639             :     {
     640           0 :       free (t0_ptr);
     641           0 :       free (t1_ptr);
     642           0 :       return 0;
     643             :     }
     644             : 
     645             :   {
     646       75829 :     mp_ptr tp = malloc ((2 * i + 4) * sizeof (mp_limb_t));
     647       75829 :     if (tp == NULL)
     648             :       {
     649           0 :         free (t0_ptr);
     650           0 :         free (t1_ptr);
     651           0 :         return 0;
     652             :       }
     653       75829 :     mpn_mulmod_bnm1 (t2_ptr, i, t0_ptr, size_t0, t1_ptr, size_t1, tp);
     654       75829 :     if ((mp_size_t) i > size_t0 + size_t1)
     655     1204192 :       MPN_ZERO(t2_ptr + size_t0 + size_t1, i - (size_t0 + size_t1));
     656       75829 :     free (tp);
     657             :   }
     658             : 
     659     1528068 :   for (t = 0, tp = t2_ptr; t < m; t++, tp += s)
     660             :     {
     661     1452239 :       size_tmp = s;
     662     4209389 :       MPN_NORMALIZE(tp, size_tmp);
     663     1452239 :       r_ptr = MPZ_REALLOC (R[t], size_tmp);
     664     1452239 :       if (size_tmp)
     665     1381462 :         MPN_COPY (r_ptr, tp, size_tmp);
     666     1452239 :       SIZ(R[t]) = size_tmp;
     667             :     }
     668             : 
     669       75829 :   free (t0_ptr);
     670       75829 :   free (t1_ptr);
     671       75829 :   free (t2_ptr);
     672             :   
     673       75829 :   return m;
     674             : #endif /* FFT_WRAP */
     675             : }

Generated by: LCOV version 1.14