LCOV - unnamed - ecm/pm1fs2.c

LCOV - code coverage report

Current view:	top level - ecm - pm1fs2.c (source / functions)		Hit	Total	Coverage
Test:	unnamed	Lines:	1528	1672	91.4 %
Date:	2022-03-21 11:19:20	Functions:	42	42	100.0 %

          Line data    Source code

       1             : /* Implementation of fast stage 2 for P-1 and P+1 as described in
       2             :    "Improved Stage 2 to $P\pm{}1$ Factoring Algorithms" by
       3             :    Peter L. Montgomery and Alexander Kruppa, ANTS 2008 (8th Algorithmic 
       4             :    Number Theory Symposium).
       5             :    
       6             : Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2016
       7             : Alexander Kruppa, Paul Zimmermann, David Cleaver
       8             : NTT functions are based on code Copyright 2005 Dave Newman.
       9             : 
      10             : This file is part of the ECM Library.
      11             : 
      12             : The ECM Library is free software; you can redistribute it and/or modify
      13             : it under the terms of the GNU Lesser General Public License as published by
      14             : the Free Software Foundation; either version 3 of the License, or (at your
      15             : option) any later version.
      16             : 
      17             : The ECM Library is distributed in the hope that it will be useful, but
      18             : WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      19             : or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
      20             : License for more details.
      21             : 
      22             : You should have received a copy of the GNU Lesser General Public License
      23             : along with the ECM Library; see the file COPYING.LIB.  If not, see
      24             : http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
      25             : 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
      26             : 
      27             : #include "config.h"
      28             : #include <stdio.h>
      29             : #include <stdlib.h>
      30             : #include <limits.h>
      31             : #include "ecm-impl.h"
      32             : #include "sp.h"
      33             : #include <math.h>
      34             : #ifdef HAVE_ALLOCA_H
      35             : #include <alloca.h>
      36             : #endif
      37             : #ifdef HAVE_STRING_H
      38             : #include <string.h>
      39             : #endif
      40             : #ifdef _OPENMP
      41             : #include <omp.h>
      42             : #endif
      43             : 
      44             : /* TODO:
      45             :    - move functions into their proper files (i.e. NTT functions etc.)
      46             :    - later: allow storing NTT vectors on disk
      47             : */
      48             : 
      49             : /* Define TEST_ZERO_RESULT to test if any result of the multipoint
      50             :    evaluation is equal to zero. If the modulus is composite, this
      51             :    happening might indicate a problem in the evalutaion code */
      52             : #define TEST_ZERO_RESULT
      53             : 
      54             : const int pari = 0;
      55             : 
      56             : const unsigned long Pvalues[] = {
      57             :     3UL, 5UL, 9UL, 15UL, 21UL, 17UL, 27UL, 33UL, 45UL, 51UL, 63UL, 75UL, 
      58             :     105UL, 99UL, 135UL, 165UL, 195UL, 189UL, 231UL, 255UL, 315UL, 345UL, 
      59             :     357UL, 375UL, 405UL, 435UL, 525UL, 585UL, 615UL, 735UL, 765UL, 825UL, 
      60             :     945UL, 1155UL, 1065UL, 1365UL, 1305UL, 1335UL, 1575UL, 1785UL, 1995UL, 
      61             :     2145UL, 2205UL, 2415UL, 2625UL, 2805UL, 3045UL, 3465UL, 3675UL, 4095UL, 
      62             :     4305UL, 4515UL, 4725UL, 4785UL, 5355UL, 5775UL, 5985UL, 5865UL, 6825UL, 
      63             :     7245UL, 8085UL, 8925UL, 9555UL, 10395UL, 10725UL, 11025UL, 12285UL, 
      64             :     12705UL, 15015UL, 14175UL, 15225UL, 16065UL, 17325UL, 19635UL, 21945UL, 
      65             :     23205UL, 24255UL, 25935UL, 26775UL, 28875UL, 31395UL, 33495UL, 35805UL, 
      66             :     36465UL, 38115UL, 39585UL, 40425UL, 45045UL, 45885UL, 49665UL, 51765UL, 
      67             :     58905UL, 65835UL, 69615UL, 75075UL, 77805UL, 82005UL, 84315UL, 86625UL, 
      68             :     88935UL, 94185UL, 98175UL, 105105UL, 109725UL, 116025UL, 118755UL, 
      69             :     121275UL, 135135UL, 137445UL, 137655UL, 144375UL, 153615UL, 165165UL, 
      70             :     167475UL, 176715UL, 179025UL, 185955UL, 197505UL, 208845UL, 215985UL, 
      71             :     225225UL, 255255UL, 250635UL, 285285UL, 277095UL, 294525UL, 315315UL, 
      72             :     345345UL, 373065UL, 368445UL, 405405UL, 435435UL, 451605UL, 465465UL, 
      73             :     454545UL, 504735UL, 525525UL, 555555UL, 569415UL, 596505UL, 645645UL, 
      74             :     647955UL, 672945UL, 687225UL, 765765UL, 770385UL, 805035UL, 855855UL, 
      75             :     858585UL, 915915UL, 945945UL, 962115UL, 1036035UL, 1066065UL, 1119195UL, 
      76             :     1156155UL, 1276275UL, 1306305UL, 1354815UL, 1426425UL, 1456455UL, 
      77             :     1514205UL, 1576575UL, 1666665UL, 1726725UL, 1786785UL, 1789515UL, 
      78             :     1865325UL, 1996995UL, 1983135UL, 2177175UL, 2297295UL, 2327325UL, 
      79             :     2417415UL, 2567565UL, 2611455UL, 2807805UL, 2847075UL, 2878785UL, 
      80             :     3048045UL, 3161235UL, 3258255UL, 3357585UL, 3401475UL, 3533145UL, 
      81             :     3828825UL, 3918915UL, 3985905UL, 4279275UL, 4849845UL, 4789785UL, 
      82             :     4967655UL, 5180175UL, 5360355UL, 5870865UL, 5990985UL, 6561555UL, 
      83             :     6531525UL, 6891885UL, 7402395UL, 7912905UL, 8273265UL, 8580495UL, 
      84             :     8843835UL, 9444435UL, 10015005UL, 10465455UL, 10705695UL, 10885875UL, 
      85             :     11696685UL, 12267255UL, 12507495UL, 12785955UL, 13498485UL, 14549535UL, 
      86             :     14849835UL, 15570555UL, 16111095UL, 16291275UL, 17612595UL, 18123105UL, 
      87             :     18633615UL, 19684665UL, 20255235UL, 20825805UL, 22207185UL, 22717695UL, 
      88             :     24249225UL, 24819795UL, 25741485UL, 26531505UL, 28333305UL, 29354325UL, 
      89             :     30045015UL, 31396365UL, 32807775UL, 33948915UL, 33528495UL, 34879845UL, 
      90             :     37011975UL, 37522485UL, 39564525UL, 41096055UL, 43648605UL, 44219175UL, 
      91             :     45930885UL, 47222175UL, 48333285UL, 50075025UL, 51816765UL, 52777725UL, 
      92             :     55390335UL, 55547415UL, 59053995UL, 60063465UL, 61906845UL, 64579515UL, 
      93             :     66621555UL, 67492425UL, 70105035UL, 73258185UL, 74939865UL, 77224455UL, 
      94             :     79594515UL, 81876795UL, 84999915UL, 88062975UL, 91005915UL, 94189095UL, 
      95             :     98423325UL, 101846745UL, 111546435UL, 111035925UL, 115120005UL, 
      96             :     121246125UL, 124098975UL, 130945815UL, 140645505UL, 150345195UL, 
      97             :     150225075UL, 155450295UL, 158333175UL, 170255085UL, 179444265UL, 
      98             :     190285095UL, 198843645UL, 203408205UL, 206831625UL, 217222005UL, 
      99             :     229474245UL, 240705465UL, 252447195UL, 254999745UL, 269023755UL, 
     100             :     282146865UL, 287672385UL, 294076965UL, 306110805UL, 318302985UL, 
     101             :     334639305UL, 344338995UL, 354038685UL, 363738375UL, 373438065UL,
     102             :     387221835UL, 400254855UL, 421936515UL, 431636205UL, 451035585UL,
     103             :     453888435UL, 470434965UL, 480134655UL, 510765255UL, 522506985UL,
     104             :     557732175UL, 570855285UL, 596530935UL, 610224615UL, 627912285UL,
     105             :     654729075UL, 703227525UL, 722116395UL, 751725975UL, 780825045UL,
     106             :     790524735UL, 821665845UL, 851275425UL, 863017155UL, 909984075UL,
     107             :     936020085UL, 984518535UL, 1017041025UL, 1052416365UL
     108             : #if (ULONG_MAX > 4294967295)
     109             :    ,1086110025UL, 1110614505UL, 1147371225UL, 1191785595UL, 1213887675UL,
     110             :     1265809545UL, 1282356075UL, 1331995665UL, 1391905515UL, 1450103655UL,
     111             :     1479202725UL, 1547100555UL, 1555088535UL, 1673196525UL, 1712565855UL,
     112             :     1767130365UL, 1830673845UL, 1883166285UL, 1954487535UL, 2001964965UL,
     113             :     2119382265UL, 2187280095UL, 2255177925UL, 2342475135UL, 2390973585UL,
     114             :     2421213795UL, 2555868315UL, 2672264595UL, 2788660875UL, 2856558705UL,
     115             :     2953555605UL, 3050552505UL, 3234846615UL, 3457939485UL, 3516137625UL,
     116             :     3681032355UL, 3758629875UL, 3904125225UL, 4127218095UL, 4360010655UL,
     117             :     4573403835UL, 4796496705UL, 4844995155UL, 5019589575UL, 5203883685UL,
     118             :     5262081825UL, 5465775315UL, 5766465705UL, 5898837945UL, 6164152995UL,
     119             :     6358146795UL, 6411780375UL, 6804332535UL, 6980458485UL, 7172920755UL,
     120             :     7473611145UL, 7716103395UL, 7968295335UL, 8182259085UL, 8342499165UL,
     121             :     8812168365UL, 9023519505UL, 9704539845UL, 9927632715UL, 10373818455UL,
     122             :     10439434005UL, 10820004195UL, 11043097065UL, 11489282805UL,
     123             :     11877270405UL, 12381654285UL, 12604747155UL, 13080031965UL,
     124             :     13274025765UL, 13642613985UL, 14389490115UL, 14583483915UL,
     125             :     15058768725UL, 15611651055UL, 16174233075UL, 16397325945UL,
     126             :     17289697425UL, 17735883165UL, 18143270145UL, 18381678315UL,
     127             :     19074440385UL, 19559424885UL, 20636090475UL, 20941375455UL,
     128             :     21800053275UL, 22643926305UL, 23148310185UL, 24205576395UL,
     129             :     24546777255UL, 25544133615UL, 26389538175UL, 26863291455UL,
     130             :     27813861075UL, 29113619535UL, 29494189725UL, 30520074585UL,
     131             :     30684969315UL, 31790733975UL, 33575476935UL, 34467848415UL,
     132             :     35202742575UL, 36427185795UL, 38037334335UL, 39240095895UL,
     133             :     40365259935UL, 42053005995UL, 43168470345UL, 44953213305UL,
     134             :     45845584785UL, 48522699225UL, 50307442185UL, 51869092275UL,
     135             :     53653835235UL, 54546206715UL, 56680138515UL, 58784971245UL,
     136             :     59386352025UL, 61908271425UL, 63431122755UL, 65700850215UL,
     137             :     67931778915UL, 70162707615UL, 72616729185UL, 74120181135UL,
     138             :     75740029365UL, 78417143805UL, 80871165375UL, 82840202445UL,
     139             :     86448487125UL, 88466022645UL, 91133437395UL, 92918180355UL,
     140             :     100280245065UL, 100726430805UL, 102811864155UL, 106749938295UL,
     141             :     109000266375UL, 113219631525UL, 119689324755UL, 121027881975UL,
     142             :     127943760945UL, 132628711215UL, 134859639915UL, 141775518885UL,
     143             :     148691397855UL, 150922326555UL, 155607276825UL, 161320394235UL,
     144             :     164977177365UL, 171446870595UL, 177470378085UL, 183270792705UL
     145             : #endif
     146             : };
     147             : 
     148             : /* All the prime factors that can appear in eulerphi(P) */
     149             : const unsigned long phiPfactors[] = {2UL, 3UL, 5UL, 7UL, 11UL, 13UL, 
     150             :                                      17UL, 19UL};
     151             : 
     152             : 
     153             : /* Some useful PARI functions:
     154             :    sumset(a,b) = {local(i, j, l); l = listcreate (length(a) * length(b)); for (i = 1, length(a), for (j = 1, length(b), listput(l, a[i] + b[j]))); listsort (l, 1); l}
     155             : 
     156             :    V(i,X) = { if (i==0, return(2)); if (i==1, return(X)); if(i%2 == 0, return (V (i/2, X)^2-2)); return (V ((i+1)/2, X) * V ((i-1)/2, X) - X)}
     157             : 
     158             :    U(i,X) = { if (i==0, return(0)); if (i==1, return(1)); if(i%2 == 0, return (U (i/2, X) * V(i/2,X))); return (V ((i+1)/2, X)  *U( (i-1)/2, X) + 1)}
     159             : */
     160             : 
     161             : #ifndef _OPENMP
     162        5628 : static int omp_get_num_threads () {return 1;}
     163        5628 : static int omp_get_thread_num () {return 0;}
     164             : #endif
     165             : 
     166             : static void 
     167             : ntt_sqr_reciprocal (mpzv_t, const mpzv_t, mpzspv_t, const spv_size_t, 
     168             :                     const mpzspm_t);
     169             : 
     170             : static void
     171        4406 : print_elapsed_time (int verbosity, long cpu_start, 
     172             :                     ATTRIBUTE_UNUSED long real_start)
     173             : {
     174             : #ifdef _OPENMP
     175             :   if (real_start != 0L)
     176             :     {
     177             :       outputf (verbosity, " took %lums (%lums real)\n", 
     178             :                elltime (cpu_start, cputime()), 
     179             :                elltime (real_start, realtime()));
     180             :       return;
     181             :     }
     182             : #endif
     183        4406 :   outputf (verbosity, " took %lums\n", elltime (cpu_start, cputime()));
     184        4406 : }
     185             : 
     186             : 
     187             : static void
     188         748 : print_CRT_primes (const int verbosity, const char *prefix, 
     189             :                    const mpzspm_t ntt_context)
     190             : {
     191         748 :   double modbits = 0.;
     192             :   unsigned int i;
     193             :   
     194         748 :   if (test_verbose (verbosity))
     195             :     {
     196          42 :       outputf (verbosity, "%s%lu", prefix, ntt_context->spm[0]->sp);
     197          42 :       modbits += log ((double) ntt_context->spm[0]->sp);
     198        1150 :       for (i = 1; i < ntt_context->sp_num; i++)
     199             :         {
     200        1108 :           outputf (verbosity, " * %lu", ntt_context->spm[i]->sp);
     201        1108 :           modbits += log ((double) ntt_context->spm[i]->sp);
     202             :         }
     203          42 :       outputf (verbosity, ", has %d primes, %f bits\n", 
     204             :                ntt_context->sp_num, modbits / log (2.));
     205             :     }
     206         748 : }
     207             : 
     208             : /* Approximate amount of memory in bytes each coefficient in an NTT takes 
     209             :    so that NTT can do transforms up to length lmax with modulus, or
     210             :    with 2*modulus if twice != 0 */
     211             : static size_t
     212         243 : ntt_coeff_mem (const unsigned long lmax, const mpz_t modulus, const int twice)
     213             : {
     214             :   mpz_t t;
     215             :   size_t n;
     216             :   
     217         243 :   mpz_init (t);
     218         243 :   mpz_mul (t, modulus, modulus);
     219         243 :   mpz_mul_ui (t, t, lmax);
     220         243 :   if (twice)
     221          21 :     mpz_mul_2exp (t, t, 1UL);
     222             :   /* +4: +1 for rounding up, +3 for extra words due to ECRT */
     223         243 :   n = (mpz_sizeinbase (t, 2) - 1) / SP_NUMB_BITS + 4;
     224         243 :   mpz_clear (t);
     225         243 :   return n * sizeof (sp_t);
     226             : }
     227             : 
     228             : size_t
     229         267 : pm1fs2_memory_use (const unsigned long lmax, const mpz_t modulus, 
     230             :                    const int use_ntt)
     231             : {
     232         267 :   if (use_ntt)
     233             :     {
     234             :       /* We store lmax / 2 + 1 coefficients for the DCT-I of F and lmax 
     235             :          coefficients for G in NTT ready format. Each coefficient in 
     236             :          NTT-ready format occupies approx. 
     237             :          ceil(log(lmax*modulus^2)/log(bits per sp_t)) + 3 words. */
     238             :       
     239             :       size_t n;
     240             :       
     241         205 :       n = ntt_coeff_mem (lmax, modulus, 0) * (size_t) (3 * lmax / 2 + 1);
     242         205 :       outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
     243             :                "with lmax = %lu NTT is %lu bytes\n", lmax, n);
     244         205 :       return n;
     245             :     }
     246             :   else
     247             :     {
     248             :       /* F stores s_1/2 residues,
     249             :          h stores s_1 mpz_t structs (residues get cloned from F)
     250             :          g stores lmax residues, 
     251             :          R stores lmax-s_1 residues, 
     252             :          and tmp stores 3*lmax+list_mul_mem (lmax / 2) residues.
     253             :          Assume s_1 is close to lmax/2.
     254             :          Then we have 
     255             :          lmax/4 + lmax/2 + lmax + lmax/2 + 3*lmax + list_mul_mem (lmax / 2)
     256             :          = (5+1/4)*lmax + list_mul_mem (lmax / 2) residues, plus s_1 mpz_t.
     257             :       */
     258             :       
     259             :       size_t n;
     260             :       
     261          62 :       n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
     262          62 :       n *= 5 * lmax + lmax / 4 + list_mul_mem (lmax / 2);
     263          62 :       n += lmax / 2 * sizeof (mpz_t);
     264             :       /* Memory use due to temp space allocation in TMulKS appears to 
     265             :          approximately triple the estimated memory use. This is hard to
     266             :          estimate precisely, so let's go with the fudge factor of 3 here */
     267          62 :       n *= 3;
     268          62 :       outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
     269             :                "with lmax = %lu is %lu bytes\n", lmax, n);
     270          62 :       return n;
     271             :     }
     272             : }
     273             : 
     274             : /* return the possible lmax for given memory use and modulus */
     275             : 
     276             : unsigned long
     277          15 : pm1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt)
     278             : {
     279          15 :   if (use_ntt)
     280             :     {
     281           5 :       size_t n, lmax = 1;
     282             :   
     283           5 :       n = ntt_coeff_mem (lmax, modulus, 0);
     284           5 :       lmax = 1UL << ceil_log2 (memory / n / 3);
     285           5 :       return lmax;
     286             :     }
     287             :   else
     288             :     {
     289             :       size_t lmax, n;
     290             :       
     291          10 :       n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
     292             : 
     293             :       /* Guess an initial value of lmax for list_mul_mem (lmax / 2) */
     294             :       /* memory = n * 25/4 * lmax + lmax / 2 * sizeof (mpz_t); */
     295             :       /* Fudge factor of 3 for TMulKS as above */
     296          10 :       lmax = memory / (3 * 25 * n / 4 + 3 * sizeof (mpz_t) / 2);
     297          10 :       return lmax;
     298             :     }
     299             : }
     300             : 
     301             : size_t 
     302          21 : pp1fs2_memory_use (const unsigned long lmax, const mpz_t modulus, 
     303             :                    const int use_ntt, const int twopass)
     304             : {
     305             :   size_t n, m;
     306             :   
     307          21 :   m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
     308          21 :   if (use_ntt)
     309             :     {
     310             :       /* In one pass mode, we store h_x_ntt and h_y_ntt, each of length 
     311             :          lmax/2(+1), and g_x_ntt and g_y_ntt, each of length lmax, all in 
     312             :          NTT ready format. In two pass mode, we store h_x_ntt, h_y_ntt and 
     313             :          g_x_ntt as before, plus R which is lmax - s_1 mpz_t. 
     314             :          We assume s_1 ~= lmax/2.
     315             :       */
     316             : 
     317          11 :       n = ntt_coeff_mem (lmax, modulus, !twopass);
     318          11 :       if (twopass)
     319           1 :         return lmax * (2 * n + m / 2);
     320             :       else
     321          10 :         return lmax * 3 * n;
     322             :     }
     323             :   else
     324             :     {
     325             :       /* We allocate:
     326             :          F: s_1/2 coefficients
     327             :          fh_x, fh_y: s_1/2 coefficients
     328             :          h_x, h_y: s_1 mpz_t's (cloned from fh_x and fh_y)
     329             :          g_x, g_y: lmax coefficients
     330             :          R_x, R_y: lmax - s_1 coefficients
     331             :          tmp: 3UL * lmax + list_mul_mem (lmax / 2)
     332             :          Assuming s_1 ~ lmax/2, that's
     333             :          lmax/2 + 2*lmax/4 + 2*lmax + 2*lmax/2 * 3*lmax + 
     334             :            list_mul_mem (lmax / 2) =
     335             :          7 + list_mul_mem (lmax / 2) coefficients and lmax mpz_t.
     336             :        */
     337             :       
     338          10 :       n = m * (7 * lmax + list_mul_mem (lmax / 2));
     339          10 :       n += lmax * sizeof (mpz_t);
     340          10 :       n = 5 * n / 2; /* A fudge factor again */
     341          10 :       return n;
     342             :     }
     343             : }
     344             : 
     345             : unsigned long 
     346          43 : pp1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt, 
     347             :                const int twopass)
     348             : {
     349             :   size_t n, m;
     350             :   
     351          43 :   m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
     352          43 :   if (use_ntt)
     353             :     {
     354          22 :       n = ntt_coeff_mem (1, modulus, !twopass);
     355          22 :       if (twopass)
     356          11 :         n = memory / (2 * n + m / 2);
     357             :       else
     358          11 :         n = memory / (3 * n);
     359          22 :       return 1UL << (ceil_log2 (n / 2)); /* Rounded down to power of 2 */
     360             :     }
     361             :   else
     362             :     {
     363          21 :       return memory / 5 / (m * 8 + sizeof (mpz_t)) * 2;
     364             :     }
     365             : }
     366             : 
     367             : 
     368             : /* Test if for given P, nr, B2min and B2 we can choose an m_1 so that the 
     369             :    stage 2 interval [B2min, B2] is covered. The effective B2min and B2
     370             :    are stored in effB2min and effB2 */
     371             : 
     372             : static int
     373     1050536 : test_P (const mpz_t B2min, const mpz_t B2, mpz_t m_1, const unsigned long P, 
     374             :         const unsigned long nr, mpz_t effB2min, mpz_t effB2)
     375             : {
     376             :   mpz_t m;
     377             :   /* We need B2min >= 2 * max(S_1 + S_2) + (2*m_1 - 1)*P + 1, or
     378             :      B2min - 2 * max(S_1 + S_2) - 1 >= (2*m_1)*P - P, or
     379             :      (B2min - 2*max(S_1 + S_2) + P - 1)/(2P) >= m_1
     380             :      Choose m_1 accordingly */
     381             :   
     382     1050536 :   mpz_init (m);
     383     1050536 :   sets_max (m, P);
     384     1050536 :   mpz_mul_2exp (m, m, 1UL); /* m = 2*max(S_1 + S_2) */
     385             : 
     386     1050536 :   mpz_sub (m_1, B2min, m);
     387     1050536 :   mpz_sub_ui (m_1, m_1, 1UL); /* m_1 = B2min - 2*max(S_1 + S_2) - 1 */
     388     1050536 :   mpz_add_ui (m_1, m_1, P);
     389     1050536 :   mpz_fdiv_q_2exp (m_1, m_1, 1UL);
     390     1050536 :   mpz_fdiv_q_ui (m_1, m_1, P);    /* 2UL*P may overflow */
     391             :   
     392             :   /* Compute effB2min = 2 * max(S_1 + S_2) + (2*(m_1 - 1) + 1)*P + 1 */
     393             :   
     394     1050536 :   mpz_mul_2exp (effB2min, m_1, 1UL);
     395     1050536 :   mpz_sub_ui (effB2min, effB2min, 1UL);
     396     1050536 :   mpz_mul_ui (effB2min, effB2min, P);
     397     1050536 :   mpz_add (effB2min, effB2min, m);
     398     1050536 :   mpz_add_ui (effB2min, effB2min, 1UL);
     399     1050536 :   ASSERT_ALWAYS (mpz_cmp (effB2min, B2min) <= 0);
     400             : 
     401             :   /* Compute the smallest value coprime to P at the high end of the stage 2
     402             :      interval that will not be covered: 
     403             :      2*(min(S_1 + S_2)) + (2*(m_1 + nr) + 1)*P. 
     404             :      We assume min(S_1 + S_2) = -max(S_1 + S_2) */
     405     1050536 :   mpz_add_ui (effB2, m_1, nr);
     406     1050536 :   mpz_mul_2exp (effB2, effB2, 1UL);
     407     1050536 :   mpz_add_ui (effB2, effB2, 1UL);
     408     1050536 :   mpz_mul_ui (effB2, effB2, P);
     409     1050536 :   mpz_sub (effB2, effB2, m);
     410             : 
     411             :   /* The effective B2 values is that value, minus 1 */
     412     1050536 :   mpz_sub_ui (effB2, effB2, 1UL);
     413             : 
     414     1050536 :   mpz_clear (m);
     415     1050536 :   return (mpz_cmp (B2, effB2) <= 0);
     416             : }
     417             : 
     418             : 
     419             : static void
     420     1052843 : factor_phiP (int *exponents, const unsigned long phiP)
     421             : {
     422     1052843 :     const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
     423     1052843 :     unsigned long cofactor = phiP;
     424             :     int i;
     425             :     
     426     1052843 :     ASSERT_ALWAYS (phiP > 0UL);
     427             : 
     428     9475587 :     for (i = 0; i < nrprimes; i++)
     429    15008152 :         for (exponents[i] = 0; cofactor % phiPfactors[i] == 0UL; exponents[i]++)
     430     6585408 :             cofactor /= phiPfactors[i];
     431             : 
     432     1052843 :     ASSERT_ALWAYS (cofactor == 1UL);
     433     1052843 : }
     434             : 
     435             : 
     436             : static unsigned long 
     437   214580208 : pow_ul (const unsigned long b, const unsigned int e)
     438             : {
     439   214580208 :     unsigned long r = 1UL;
     440             :     unsigned int i;
     441             : 
     442   327259471 :     for (i = 0; i < e; i++)
     443   112679263 :         r *= b;
     444             : 
     445   214580208 :     return r;
     446             : }
     447             : 
     448             : static unsigned long
     449    53604626 : absdiff_ul (unsigned long a, unsigned long b)
     450             : {
     451    53604626 :     return (a > b) ? a - b : b - a;
     452             : }
     453             : 
     454             : /* Choose s_1 so that s_1 * s_2 = phiP, s_1 is positive and even, 
     455             :    s_2 >= min_s2 and s_2 is minimal and abs(s_1 - l) is minimal 
     456             :    under those conditions. If use_ntt == 1, we require s_1 < l.
     457             :    Returns 0 if no such choice is possible */
     458             : 
     459             : static unsigned long 
     460     1052843 : choose_s_1 (const unsigned long phiP, const unsigned long min_s2,
     461             :             const unsigned long l, const int use_ntt)
     462             : {
     463     1052843 :   const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
     464             :   /* Using [nrprimes] here makes the compiler complain about variable-sized
     465             :      arrays */
     466             :   int phiPexponents[sizeof (phiPfactors) / sizeof (unsigned long)], 
     467             :     exponents[sizeof (phiPfactors) / sizeof (unsigned long)];
     468     1052843 :   unsigned long s_1 = 0UL, s_2 = 0UL, trys_1;
     469             :   int i;
     470             : 
     471     1052843 :   ASSERT_ALWAYS (phiP > 0 && phiP % 2 == 0);
     472             : 
     473             :   /* We want only even s_1. We divide one 2 out of phiP here... */
     474     1052843 :   factor_phiP (phiPexponents, phiP / 2);
     475     9475587 :   for (i = 0; i < nrprimes; i++)
     476     8422744 :       exponents[i] = 0;
     477             : 
     478             :   do {
     479    26822526 :       trys_1 = 2; /* ... and add a 2 here */
     480   241402734 :       for (i = 0; i < nrprimes; i++)
     481   214580208 :           trys_1 *= pow_ul (phiPfactors[i], exponents[i]);
     482             : #if 0
     483             :       printf ("choose_s_1: Trying trys_1 = %lu\n", trys_1);
     484             : #endif
     485             :       /* See if it satisfies all the required conditions and is an 
     486             :          improvement over the previous choice */
     487    26822526 :       if (phiP / trys_1 >= min_s2 && 
     488    26802313 :           (s_2 == 0UL || phiP / trys_1 < s_2) && 
     489    37656228 :           absdiff_ul (trys_1, l) < absdiff_ul (s_1, l) &&
     490     3502374 :           (use_ntt == 0 || trys_1 < l))
     491             :       {
     492             : #if 0
     493             :           printf ("choose_s_1: New best s_1 for phiP = %lu, min_s2 = %lu, "
     494             :                   "l = %lu : %lu\n", phiP, min_s2, l, trys_1);
     495             : #endif
     496    10828870 :           s_1 = trys_1;
     497             :       }
     498    40553699 :       for (i = 0; i < nrprimes; i++)
     499             :       {
     500    39500856 :           if (++(exponents[i]) <= phiPexponents[i])
     501    25769683 :               break;
     502    13731173 :           exponents[i] = 0;
     503             :       }
     504    26822526 :   } while (i < nrprimes);
     505             : 
     506     1052843 :   return s_1;
     507             : }
     508             : 
     509             : 
     510             : /* Approximate cost of stage 2. Cost with and without ntt are not 
     511             :    comparable. We have l > s_1 and s_1 * s_2 = eulerphi(P), hence
     512             :    s_2*l > eulerphi(P) and so cost (s_2, l) > eulerphi(P) for all P */
     513             : static unsigned long 
     514      873981 : est_cost (const unsigned long s_2, const unsigned long l, const int use_ntt,
     515             :           const int method)
     516             : {
     517      873981 :   if (method == ECM_PM1)
     518             :     {
     519             :       /* The time for building f, h and DCT-I of h seems to be about 
     520             :          7/6 of the time of computing g, h*g and gcd with NTT, and 
     521             :          3/2 of the time of computing g, h*g and gcd without NTT */
     522             : 
     523      691077 :       if (use_ntt)
     524      161788 :         return (7 * l) / 6 + s_2 * l;
     525             :       else
     526      529289 :         return (3 * l) / 2 + s_2 * l;
     527             :     }
     528      182904 :   else if (method == ECM_PP1)
     529             :     {
     530             :       /* Building f is the same, building h and its forward transform is
     531             :          twice about as expensive as for P-1. Each multi-point evaluation
     532             :          is twice as expensive as for P-1.
     533             :          FIXME: The estimate for NTT assumes the "one-pass" variant, in 
     534             :          "two-pass" the multipoint evaluations are slower, so the optimum 
     535             :          shifts towards smaller s_2 some more */
     536      182904 :       if (use_ntt)
     537      128002 :         return (4 * l) / 5 + s_2 * l;
     538             :       else
     539       54902 :         return (3 * l) / 4 + s_2 * l;
     540             :     }
     541             :   else
     542           0 :     abort (); /* Invalid value for method */
     543             : }
     544             : 
     545             : /* Choose P so that a stage 2 range from B2min to B2 can be covered with
     546             :    multipoint evaluations, each using a convolution of length at most lmax. 
     547             :    The parameters for stage 2 are stored in finalparams, the final effective
     548             :    B2min and B2 values in final_B2min and final_B2, respecively. Each of these
     549             :    may be NULL, in which case the value is not stored. It is permissible
     550             :    to let B2min and final_B2min, or B2 and final_B2 point at the same mpz_t. */
     551             : 
     552             : long
     553         736 : choose_P (const mpz_t B2min, const mpz_t B2, const unsigned long lmax,
     554             :           const unsigned long min_s2, faststage2_param_t *finalparams, 
     555             :           mpz_t final_B2min, mpz_t final_B2, const int use_ntt, 
     556             :           const int method)
     557             : {
     558             :   /* Let S_1 + S_2 == (Z/PZ)* (mod P).
     559             : 
     560             :      Let F(x) = \prod_{k_1 \in S_1} (x - b_1^{2 k_1}).
     561             : 
     562             :      If we evaluate F(b_1^{2 k_2 + (2m + 1)P}) for all k_2 \in S_2 with 
     563             :      m_1 <= m < m_1+nr, we test all exponents 2 k_2 + (2m + 1)P - 2 k_1.
     564             :      The largest value coprime to P at the low end of the stage 2 interval 
     565             :      *not* covered will be 
     566             :        2*max(S_2) + (2*(m_1-1) + 1)*P - 2*min(S_1).
     567             :      The smallest value at the high end not covered will be
     568             :        2*min(S_2) + (2*(m_1 + nr) + 1)*P - 2*max(S_1).
     569             :      Assume S_1 and S_2 are symmetric around 0, so that max(S_1) = -min(S_1).
     570             :      Then the largest ... is:
     571             :        2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P
     572             :      The smallest ... is:
     573             :        -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P
     574             :      The effective B2min = 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P + 1
     575             :      The effective B2max = -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P - 1
     576             : 
     577             :      Then the difference effB2max - effB2min =
     578             :        -4*(max(S_1) + max(S_2)) + 2P*(nr + 1) - 2
     579             : 
     580             :      We obviously require B2max - B2min <= 2*nr*P
     581             :      Since nr < lmax, B2max - B2min <= 2*lmax*P or
     582             :      P >= ceil((B2max - B2min)/(2*lmax))
     583             : 
     584             :      Hence we are looking for an odd P with s_1 * s_2 = eulerphi(P) so that
     585             :      s_1 ~= lmax / 2 and the whole stage 2 interval is covered. s_2 should 
     586             :      be small, as long as s_1 is small enough.
     587             :   */
     588             : 
     589             :   mpz_t B2l, m_1, effB2min, tryeffB2, effB2, lmin;
     590             :   /* The best parameters found so far, P == 0 means that no suitable P
     591             :      has been found yet: */
     592         736 :   unsigned long P = 0, s_1 = 0, s_2 = 0, l = 0, cost = 0;
     593             :   unsigned int i;
     594         736 :   const unsigned int Pvalues_len = sizeof (Pvalues) / sizeof (unsigned long);
     595             :   int r;
     596             : 
     597         736 :   outputf (OUTPUT_TRACE, 
     598             :            "choose_P(B2min = %Zd, B2 = %Zd, lmax = %lu, min_s2 = %ld, "
     599             :            "use_ntt = %d, method = %d\n", 
     600             :            B2min, B2, lmax, min_s2, use_ntt, method);
     601             : 
     602         736 :   if (mpz_cmp (B2, B2min) < 0)
     603          29 :     return 0L;
     604             : 
     605             :   /* If we use the NTT, we allow only power-of-two transform lengths.
     606             :      In that case, the code below assumes that lmax is a power of two.
     607             :      If that is not the case, print error and return. */
     608         707 :   if (use_ntt && (lmax & (lmax - 1UL)) != 0)
     609             :     {
     610           0 :       outputf (OUTPUT_ERROR, 
     611             :                "choose_P: Error, lmax = %lu is not a power of two\n", lmax);
     612           0 :       return ECM_ERROR;
     613             :     }
     614             :   
     615         707 :   mpz_init (effB2);
     616         707 :   mpz_init (tryeffB2);
     617         707 :   mpz_init (effB2min);
     618         707 :   mpz_init (B2l);
     619         707 :   mpz_init (m_1);
     620         707 :   mpz_init (lmin);
     621             :   
     622         707 :   mpz_sub (B2l, B2, B2min);
     623         707 :   mpz_add_ui (B2l, B2l, 1UL); /* +1 due to closed interval */
     624             :   
     625             :   /* For each candidate P, check if [B2min, B2] can be covered at all,
     626             :      and if so, what the best parameters (minimizing the cost, maximizing 
     627             :      effB2) are. If they are better than the best parameters for the best P 
     628             :      so far, remember them. */
     629             : 
     630      347137 :   for (i = 0 ; i < Pvalues_len; i++)
     631             :     {
     632             :       unsigned long tryP, tryphiP, trys_1, trys_2, tryl, trycost;
     633             :       
     634      346430 :       tryP = Pvalues[i];
     635      346430 :       tryphiP = eulerphi (tryP);
     636             :       
     637      346430 :       outputf (OUTPUT_TRACE, 
     638             :                "choose_P: trying P = %lu, eulerphi(P) = %lu\n", tryP, tryphiP);
     639             :       
     640             :       /* If we have a good P already and this tryphiP >= cost, then 
     641             :          there's no hope for this tryP, since cost(s_2, l) > eulerphi(P) */
     642      346430 :       if (P != 0 && tryphiP >= cost)
     643             :         {
     644      301707 :           outputf (OUTPUT_TRACE, 
     645             :                    "choose_P: tryphiP > cost = %lu, this P is too large\n",
     646             :                    cost);
     647      301707 :           continue;
     648             :         }
     649             :       
     650             :       /* We have nr < l and effB2-effB2min <= 2*nr*P. Hence we need 
     651             :          l >= B2l/P/2 */
     652       44723 :       mpz_cdiv_q_ui (lmin, B2l, tryP);
     653       44723 :       mpz_cdiv_q_2exp (lmin, lmin, 1UL);
     654       44723 :       outputf (OUTPUT_TRACE, "choose_P: lmin = %Zd for P = %lu\n", lmin, tryP);
     655       44723 :       if (mpz_cmp_ui (lmin, lmax) > 0)
     656             :         {
     657        5859 :           outputf (OUTPUT_TRACE, 
     658             :                    "choose_P: lmin > lmax, this P is too small\n");
     659        5859 :           continue;
     660             :         }
     661             :       
     662             :       /* Try all possible transform lengths and store parameters in 
     663             :          P, s_1, s_2, l if they are better than the previously best ones */
     664             :        
     665             :       /* Keep reducing tryl to find best parameters. For NTT, we only have 
     666             :          power of 2 lengths so far, so we can simply divide by 2. 
     667             :          For non-NTT, we have arbitrary transform lengths so we can decrease 
     668             :          in smaller steps... let's say by, umm, 25% each time? */
     669     1091707 :       for (tryl = lmax; mpz_cmp_ui (lmin, tryl) <= 0;
     670     1052843 :            tryl = (use_ntt) ? tryl / 2 : 3 * tryl / 4)
     671             :         {
     672     1052843 :           trys_1 = choose_s_1 (tryphiP, min_s2, tryl / 2, use_ntt);
     673     1052843 :           if (trys_1 == 0)
     674             :             {
     675        3004 :               outputf (OUTPUT_TRACE, 
     676             :                        "choose_P: could not choose s_1 for P = %lu, l = %lu\n",
     677             :                        tryP, tryl);
     678        3004 :               continue;
     679             :             }
     680             :           ASSERT (tryphiP % trys_1 == 0UL);
     681     1049839 :           trys_2 = tryphiP / trys_1;
     682     1049839 :           outputf (OUTPUT_TRACE, "choose_P: chose s_1 = %lu, k = s_2 = %lu "
     683             :                    "for P = %lu, l = %lu\n", trys_1, trys_2, tryP, tryl);
     684             :           
     685     1049839 :           if (test_P (B2min, B2, m_1, tryP, tryl - trys_1, effB2min, tryeffB2))
     686             :             {
     687     1021837 :               outputf (OUTPUT_TRACE, 
     688             :                        "choose_P: P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu "
     689             :                        "works, m_1 = %Zd, effB2min = %Zd, effB2 = %zZd\n",
     690             :                        tryP, tryl, trys_1, trys_2, m_1, effB2min, tryeffB2);
     691             : 
     692             :           /* do not allow B2min to be less than 0 */
     693     1021837 :           if (mpz_cmp_ui(effB2min, 0) < 0)
     694      147856 :             continue;
     695             : 
     696             :               /* We use these parameters if we 
     697             :                  1. didn't have any suitable ones yet, or 
     698             :                  2. these cover [B2min, B2] and are cheaper than the best 
     699             :                     ones so far, or 
     700             :                  3. they are as expensive but reach greater effB2. */
     701      873981 :               trycost = est_cost (trys_2, tryl, use_ntt, method);
     702             :               ASSERT (tryphiP < trycost);
     703      873981 :               if (P == 0 || trycost < cost ||
     704       24814 :                   (trycost == cost && mpz_cmp (tryeffB2, effB2) > 0))
     705             :                 {
     706       43179 :                   outputf (OUTPUT_TRACE, 
     707             :                            "choose_P: and is the new optimum (cost = %lu)\n",
     708             :                            trycost);
     709       43179 :                   P = tryP;
     710       43179 :                   s_1 = trys_1;
     711       43179 :                   s_2 = trys_2;
     712       43179 :                   l = tryl;
     713       43179 :                   cost = trycost;
     714       43179 :                   mpz_set (effB2, tryeffB2);
     715             :                 }
     716             :             }
     717             :         }
     718             :   }
     719             :   
     720         707 :   if (P != 0) /* If we found a suitable P */
     721             :     {
     722             :       /* Compute m_1, effB2min, effB2 again */
     723         697 :       r = test_P (B2min, B2, m_1, P, l - s_1, effB2min, effB2);
     724         697 :       ASSERT_ALWAYS(r != 0);
     725         697 :       if (finalparams != NULL)
     726             :         {
     727         697 :           finalparams->P = P;
     728         697 :           finalparams->s_1 = s_1;
     729         697 :           finalparams->s_2 = s_2;
     730         697 :           finalparams->l = l;
     731         697 :           mpz_set (finalparams->m_1, m_1);
     732             :         }
     733         697 :       if (final_B2min != NULL)
     734         697 :         mpz_set (final_B2min, effB2min);
     735         697 :       if (final_B2 != NULL)
     736         697 :         mpz_set (final_B2, effB2);
     737             :     }
     738             :   
     739         707 :   mpz_clear (effB2);
     740         707 :   mpz_clear (tryeffB2);
     741         707 :   mpz_clear (effB2min);
     742         707 :   mpz_clear (B2l);
     743         707 :   mpz_clear (m_1);
     744         707 :   mpz_clear (lmin);
     745             : 
     746         707 :   return (P != 0) ? (long) P : ECM_ERROR;
     747             : }
     748             : 
     749             : 
     750             : 
     751             : static void
     752       21255 : list_output_poly (listz_t l, unsigned long len, int monic, int symmetric,
     753             :                   char *prefix, char *suffix, int verbosity)
     754             : {
     755             :   unsigned long i;
     756             : 
     757       21255 :   if (prefix != NULL)
     758       21255 :     outputf (verbosity, prefix);
     759             : 
     760       21255 :   if (len == 0)
     761             :     {
     762           0 :       if (monic)
     763           0 :         outputf (verbosity, "1\n", len, len);
     764             :       else
     765           0 :         outputf (verbosity, "0\n", len);
     766           0 :       return;
     767             :     }
     768             : 
     769       21255 :   if (monic)
     770             :     {
     771           0 :       if (symmetric)
     772           0 :         outputf (verbosity, "(x^%lu + x^-%lu) + ", len, len);
     773             :       else
     774           0 :         outputf (verbosity, "x^%lu + ", len);
     775             :     }
     776    14177972 :   for (i = len - 1; i > 0; i--)
     777    14156717 :     if (symmetric)
     778    14156717 :       outputf (verbosity, "%Zd * (x^%lu + x^-%lu) + ", l[i], i, i);
     779             :     else
     780           0 :       outputf (verbosity, "%Zd * x^%lu + ", l[i], i);
     781       21255 :   outputf (verbosity, "%Zd", l[0]);
     782       21255 :   if (suffix != NULL)
     783       21255 :     outputf (verbosity, suffix);
     784             : }
     785             : 
     786             : #if 0
     787             : /* Multiply P[i] by r^{k(deg-i)}, for 0 <= i <= deg. Needs 3 entries in tmp. */
     788             : /* I.e., let P(x) = x^deg + \sum_{i=0}^{deg - 1} P[i] * x^i. The output is 
     789             :    R(x) = x^deg + \sum_{i=0}^{deg - 1} R[i] * x^i = r^(k deg) P(r^{-k} x). */
     790             : /* The input and output polynomials are monic and have the leading monomial
     791             :    implicit, i.e. not actually stored in the array of coefficients. */
     792             : /* Returns 0 if a modular inversion failed (in which case R is left 
     793             :    unchanged), 1 otherwise */
     794             : 
     795             : static int
     796             : list_scale_rev (listz_t R, listz_t S, mpz_t r, long k, unsigned long deg, 
     797             :                 mpz_t modulus, listz_t tmp, 
     798             :                 ATTRIBUTE_UNUSED const unsigned long tmplen)
     799             : {
     800             :   unsigned long i;
     801             : 
     802             :   ASSERT (tmplen >= 3);
     803             :   mpz_powm_ui (tmp[0], r, (unsigned long) labs (k), modulus);
     804             :   if (k < 0)
     805             :     {
     806             :       if (!mpz_invert (tmp[0], tmp[0], modulus)) /* FIXME: get rid of this! */
     807             :         return 0;
     808             :     }
     809             :   /* Here, tmp[0] = r^k */
     810             :   mpz_set (tmp[1], tmp[0]);
     811             :   /* mpz_set (R[deg], S[deg]); Leading monomial is not stored! */
     812             :   for (i = 1; i + 1 <= deg; i++)
     813             :     {
     814             :       /* Here, tmp[1] = r^(ki) */
     815             :       mpz_mul (tmp[2], S[deg-i], tmp[1]);
     816             :       mpz_mod (R[deg-i], tmp[2], modulus);
     817             :       mpz_mul (tmp[2], tmp[1], tmp[0]);  /* FIXME, avoid unnecessary mul */
     818             :       mpz_mod (tmp[1], tmp[2], modulus); /* at end of loop */
     819             :     }
     820             :   if (i <= deg)
     821             :     {
     822             :       mpz_mul (tmp[2], S[deg-i], tmp[1]);
     823             :       mpz_mod (R[deg-i], tmp[2], modulus);
     824             :     }
     825             : 
     826             :   return 1;
     827             : }
     828             : #endif
     829             : 
     830             : /* Same, but does squaring which makes things easier */
     831             : 
     832             : static void
     833        1278 : list_sqr_reciprocal (listz_t R, listz_t S, const unsigned long l, 
     834             :                      mpz_t modulus, listz_t tmp, 
     835             :                      ATTRIBUTE_UNUSED const unsigned long tmplen)
     836             : {
     837             :   unsigned long i;
     838        1278 :   listz_t Srev, r1 = tmp, r2 = tmp + 2 * l - 1, t = tmp + 4 * l - 2;
     839             : 
     840        1278 :   if (l == 0UL)
     841           0 :     return;
     842             : 
     843             :   /* FIXME: This modifies the input arguments. */
     844             :   /* We have to divide S[0] by 2 */
     845             : 
     846             :   ASSERT (tmplen >= 4 * l - 2 + list_mul_mem (l));
     847             : 
     848             : #if 0
     849             :   gmp_printf ("/* list_sqr_reciprocal */ S(x) = %Zd", S[0]);
     850             :   for (i = 1; i < l1; i++)
     851             :     gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S[i], i, i);
     852             :   gmp_printf ("\n");
     853             : #endif
     854             : 
     855        1278 :   if (mpz_odd_p (S[0]))
     856             :     {
     857         616 :       ASSERT_ALWAYS (mpz_odd_p (modulus));
     858         616 :       mpz_add (S[0], S[0], modulus);
     859             :     }
     860        1278 :   mpz_tdiv_q_2exp (S[0], S[0], 1UL);
     861             :   
     862        1278 :   list_mul (r1, S, l, S, l, 0, t);
     863             :   /* r1 = f0*g0/4 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
     864             : #if 0
     865             :   for (i = 0; i < 2UL * l - 1UL; i++)
     866             :     gmp_printf ("list_sqr_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
     867             : #endif
     868             : 
     869        1278 :   Srev = (listz_t) malloc (l * sizeof (mpz_t));
     870        1278 :   ASSERT_ALWAYS (Srev != NULL);
     871      114827 :   for (i = 0UL; i < l; i++)
     872      113549 :       (*Srev)[i] = (*S)[l - 1UL - i];
     873        1278 :   list_mul (r2, S, l, Srev, l, 0, t);
     874             :   /* r2 is symmetric, r2[i] = r2[2*l - 2 - i]. Check this */
     875             : #if 0
     876             :   for (i = 0; 0 && i < 2UL * l - 1UL; i++)
     877             :     gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
     878             : #endif
     879             : #ifdef WANT_ASSERT
     880             :   for (i = 0UL; i < l; i++)
     881             :     ASSERT (mpz_cmp (r2[i], r2[2UL * l - 2UL - i]) == 0);
     882             : #endif
     883        1278 :   free (Srev);
     884             :   /* r2 = g1*f0/2 + (g0*f0/4 + g1*f1) * x + g0*f1/2 * x^2 */
     885             : #if 0
     886             :   for (i = 0; i < 2UL * l - 1UL; i++)
     887             :     gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
     888             : #endif
     889             : 
     890        1278 :   mpz_mul_2exp (r1[0], r1[0], 1UL);
     891             :   /* r1 = f0*g0/2 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
     892      114827 :   for (i = 0UL; i < l; i++)
     893             :     {
     894      113549 :       mpz_mul_2exp (r2[l - i - 1UL], r2[l - i - 1UL], 1UL);
     895      113549 :       mpz_add (R[i], r1[i], r2[l - i - 1UL]);
     896             :     }
     897             :   /* r1 = 3/4*f0*g0 + g1*f1 + (f0*g1 + 2*f1*g0)/2 * x + f1*g1 * x^2 */
     898             :   /* r1 = f0*g0 + 2*g1*f1 + (f0*g1 + f1*g0) * x + f1*g1 * x^2 */
     899      113549 :   for (i = l; i < 2UL * l - 1UL; i++)
     900      112271 :       mpz_set (R[i], r1[i]);
     901             : 
     902        1278 :   if (R != S)
     903           0 :     mpz_mul_2exp (S[0], S[0], 1UL);
     904             :         
     905             : #if 0
     906             :   for (i = 0; i < 2UL * l; i++)
     907             :     gmp_printf ("list_sqr_reciprocal: R[%lu] = %Zd\n", i, R[i]);
     908             : #endif
     909             : }
     910             : 
     911             : #ifdef WANT_ASSERT
     912             : static void
     913             : list_recip_eval1 (mpz_t R, const listz_t S, const unsigned long l)
     914             : {
     915             :   unsigned long i;
     916             : 
     917             :   mpz_set_ui (R, 0UL);
     918             :   for (i = 1; i < l; i++)
     919             :     mpz_add (R, R, S[i]);
     920             :   mpz_mul_2exp (R, R, 1UL);
     921             :   if (l > 0UL)
     922             :     mpz_add (R, R, S[0]);
     923             : }
     924             : #endif
     925             : 
     926             : /* Multiply two reciprocal polynomials of degree 2*l1-2 and 2*l2-2, resp., 
     927             :    with coefficients in standard basis
     928             : 
     929             :    S_1(x) = S1[0] + sum_{1 \leq i \leq l1 - 1} S1[i] (x^i + x^{-i})
     930             :    S_2(x) = S2[0] + sum_{1 \leq i \leq l2 - 1} S2[i] (x^i + x^{-i})
     931             : 
     932             :    to the reciprocal polynomial of degree 2*(l1 + l2) - 4
     933             : 
     934             :    R(x) = R[0] + sum_{1 \leq i \leq l1 + l2 - 2} R[i] (x^i + x^{-i}) 
     935             :         = S_1(x) * S_2(x)
     936             : 
     937             :    R == S1 == S2 is permissible, however if S1 == S2, l1 must be equal 
     938             :    to l2 (i.e. the multiplication must be a squaring)
     939             : */
     940             :   /* FIXME: This modifies the input arguments. */
     941             :   /* We have to divide S1[0] and S2[0] by 2 */
     942             : 
     943             : static void
     944        1457 : list_mul_reciprocal (listz_t R, listz_t S1, unsigned long l1, 
     945             :                      listz_t S2, unsigned long l2,
     946             :                      mpz_t modulus, listz_t tmp, 
     947             :                      ATTRIBUTE_UNUSED const unsigned long tmplen)
     948             : {
     949             :   unsigned long i;
     950        1457 :   const unsigned long lmax = MAX(l1, l2);
     951        1457 :   listz_t r1 = tmp, r2 = tmp + 2*lmax - 1, rev = tmp + 4*lmax - 2,
     952        1457 :     t = tmp + 6*lmax - 3;
     953             : #ifdef WANT_ASSERT
     954             :   mpz_t sum1, sum2, prod;
     955             : #endif
     956             : 
     957             :   ASSERT (S1 < tmp || S1 >= tmp + tmplen);
     958             :   ASSERT (S2 < tmp || S2 >= tmp + tmplen);
     959             :   ASSERT (R < tmp || R >= tmp + tmplen);
     960             : 
     961        1457 :   if (l1 == 0UL || l2 == 0UL)
     962           0 :     return;
     963             : 
     964        1457 :   if (S1 == S2)
     965             :     {
     966           0 :       ASSERT_ALWAYS (l1 == l2);
     967           0 :       list_sqr_reciprocal (R, S1, l1, modulus, tmp, tmplen);
     968           0 :       return;
     969             :     }
     970             : 
     971             :   ASSERT (tmplen >= 6*lmax - 3 + list_mul_mem (lmax));
     972             : #ifdef WANT_ASSERT
     973             :   mpz_init (sum1);
     974             :   mpz_init (sum2);
     975             :   mpz_init (prod);
     976             :   list_recip_eval1 (sum1, S1, l1);
     977             :   list_recip_eval1 (sum2, S2, l2);
     978             :   mpz_mul (prod, sum1, sum2);
     979             :   mpz_mod (prod, prod, modulus);
     980             : #endif
     981             : 
     982             : 
     983             :   /* Make S1 the longer of the two, i.e. l1 >= l2 */
     984        1457 :   if (l2 > l1)
     985             :     {
     986         972 :       listz_t St = S1;
     987         972 :       unsigned long lt = l1;
     988         972 :       S1 = S2;
     989         972 :       S2 = St;
     990         972 :       l1 = l2;
     991         972 :       l2 = lt;
     992             :     }
     993             :   
     994             : #if 0
     995             :   gmp_printf ("/* list_mul_reciprocal */ S1(x) = %Zd", S1[0]);
     996             :   for (i = 1; i < l1; i++)
     997             :     gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S1[i], i, i);
     998             :   gmp_printf ("\n");
     999             :   gmp_printf ("/* list_mul_reciprocal */ S2(x) = %Zd", S2[0]);
    1000             :   for (i = 1; i < l1; i++)
    1001             :     gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S2[i], i, i);
    1002             :   gmp_printf ("\n");
    1003             : #endif
    1004             :   
    1005             :   /* Divide S1[0] and S2[0] by 2 */
    1006        1457 :   if (mpz_odd_p (S1[0]))
    1007             :     {
    1008         727 :       ASSERT_ALWAYS (mpz_odd_p (modulus));
    1009         727 :       mpz_add (S1[0], S1[0], modulus);
    1010             :     }
    1011        1457 :   mpz_tdiv_q_2exp (S1[0], S1[0], 1UL);
    1012             :   
    1013        1457 :   if (mpz_odd_p (S2[0]))
    1014             :     {
    1015         718 :       ASSERT_ALWAYS (mpz_odd_p (modulus));
    1016         718 :       mpz_add (S2[0], S2[0], modulus);
    1017             :     }
    1018        1457 :   mpz_tdiv_q_2exp (S2[0], S2[0], 1UL);
    1019             : 
    1020             :   /* Pad rev with zeros */
    1021       12456 :   for (i = l2; i < lmax; i++)
    1022       10999 :     mpz_set_ui (rev[i], 0UL);
    1023             :   
    1024       14278 :   for (i = 0UL; i < l2; i++)
    1025       12821 :     mpz_set (rev[i], S2[l2 - 1UL - i]);
    1026        1457 :   list_mul (r1, S1, lmax, rev, lmax, 0, t);
    1027             :   /* r1 = \tilde{f}(x) \rev(\tilde{g}(x)) and has degree l1 + l2 - 2,
    1028             :      i.e. l1 + l2 - 1 entries. */
    1029             : #if 0
    1030             :   for (i = 0; i < 2 * lmax - 1; i++)
    1031             :     gmp_printf ("list_mul_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
    1032             : #endif
    1033             :   
    1034       14278 :   for (i = 0UL; i < l2; i++)
    1035       12821 :     mpz_set(rev[i], S2[i]);
    1036        1457 :   list_mul (r2, S1, lmax, rev, lmax, 0, t);
    1037             :   /* \tilde{f}(x) \tilde{g}(x) */
    1038             :   
    1039             : #if 0
    1040             :   for (i = 0; i < 2 * lmax - 1; i++)
    1041             :     gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
    1042             : #endif
    1043             :   
    1044             :   /* Add f_0*g_0 by doubling the f_0*g_0 term in r2 */
    1045        1457 :   mpz_mul_2exp (r2[0], r2[0], 1UL);
    1046             :   
    1047             :   /* Add \flloor x^{-d_g} \tilde{f}(x) \rev(\tilde{g}(x)) \rfloor.
    1048             :      d_g = l2 - 1. */
    1049       25277 :   for (i = 0; i < l1; i++)
    1050       23820 :     mpz_add (r2[i], r2[i], r1[i + l2 - 1]);
    1051             :   
    1052             :   /* Add \floor x^{-d_f} rev(\tilde{f}(x) \rev(\tilde{g}(x))) \rfloor.
    1053             :      d_f = l1 - 1. rev(r2)[i] = r2[l1 + l2 - 2 - i]. We want
    1054             :      rev(r2)[l1 - 1 ... l1 + l2 - 2], hence 
    1055             :      r2[l2 - 1 ... 0] */
    1056       14278 :   for (i = 0; i < l2; i++)
    1057       12821 :     mpz_add (r2[i], r2[i], r1[l2 - 1 - i]);
    1058             :   
    1059             : #if 0
    1060             :   for (i = 0; i < l1 + l2 - 1; i++)
    1061             :     gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
    1062             : #endif
    1063             :   
    1064        1457 :   mpz_mul_2exp (S1[0], S1[0], 1UL);
    1065        1457 :   mpz_mul_2exp (S2[0], S2[0], 1UL);
    1066             :   
    1067       36641 :   for (i = 0; i < l1 + l2 - 1; i++)
    1068       35184 :     mpz_set (R[i], r2[i]);
    1069             :   
    1070             : #if 0
    1071             :   for (i = 0; i < l1 + l2 - 1; i++)
    1072             :     gmp_printf ("list_mul_reciprocal: R[%lu] = %Zd\n", i, R[i]);
    1073             : #endif
    1074             : #ifdef WANT_ASSERT
    1075             :   list_recip_eval1 (sum1, R, l1 + l2 - 1);
    1076             :   mpz_mod (sum1, sum1, modulus);
    1077             :   ASSERT (mpz_cmp (prod, sum1) == 0);
    1078             :   mpz_clear (sum1);
    1079             :   mpz_clear (sum2);
    1080             :   mpz_clear (prod);
    1081             : #endif
    1082             : }
    1083             : 
    1084             : /* 
    1085             :   Computes V_k(S), where the Chebyshev polynomial V_k(X) is defined by 
    1086             :   V_k(X + 1/X) = X^k + 1/X^k
    1087             : */
    1088             : 
    1089             : static void
    1090       12684 : V (mpres_t R, const mpres_t S, const long k, mpmod_t modulus)
    1091             : {
    1092             :   mpres_t V0, Vi, Vi1;
    1093             :   unsigned long j, uk;
    1094             :   int po2;
    1095             : 
    1096       12684 :   if (k == 0L)
    1097             :     {
    1098        2814 :       mpres_set_ui (R, 2UL, modulus);
    1099        9207 :       return;
    1100             :     }
    1101             : 
    1102        9870 :   uk = labs (k);
    1103             : 
    1104        9870 :   if (uk == 1UL)
    1105             :     {
    1106        3098 :       mpres_set (R, S, modulus);
    1107        3098 :       return;
    1108             :     }
    1109             : 
    1110       11951 :   for (po2 = 0; uk % 2UL == 0UL; uk >>= 1, po2++);
    1111             : 
    1112        6772 :   mpres_init (V0, modulus);
    1113        6772 :   mpres_set_ui (V0, 2UL, modulus); /* V0 = V_0(S) = 2 */
    1114             : 
    1115        6772 :   if (uk == 1UL)
    1116             :     {
    1117        3295 :       mpres_set (R, S, modulus);
    1118        6590 :       while (po2-- > 0)
    1119             :         {
    1120        3295 :           mpres_sqr (R, R, modulus);
    1121        3295 :           mpres_sub (R, R, V0, modulus);
    1122             :         }
    1123        3295 :       mpres_clear (V0, modulus);
    1124        3295 :       return;
    1125             :     }
    1126             : 
    1127             :   if (0)
    1128             :     {
    1129             :       mpz_t tz;
    1130             :       mpz_init (tz);
    1131             :       mpres_get_z (tz, S, modulus);
    1132             :       gmp_printf ("Chebyshev_V(%ld, Mod(%Zd,N)) == ", k, tz);
    1133             :       mpz_clear (tz);
    1134             :     }
    1135             : 
    1136       30071 :   for (j = 1UL; j <= uk / 2UL; j <<= 1);
    1137             : 
    1138        3477 :   mpres_init (Vi, modulus);
    1139        3477 :   mpres_init (Vi1, modulus);
    1140             : 
    1141             :   /* i = 1. Vi = V_i(S), Vi1 = V_{i+1}(S) */
    1142        3477 :   mpres_set (Vi, S, modulus);
    1143        3477 :   mpres_sqr (Vi1, S, modulus);
    1144        3477 :   mpres_sub (Vi1, Vi1, V0, modulus);
    1145        3477 :   j >>= 1;
    1146             : 
    1147       26594 :   while (j > 1)
    1148             :     {
    1149       23117 :       if ((uk & j) != 0UL)
    1150             :         {
    1151             :           /* i' = 2i + 1.
    1152             :              V_{i'} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1
    1153             :              V_{i'+1} = V_{2i + 2} = {V_{i+1}}^2 - V_0. */
    1154       11789 :           mpres_mul (Vi, Vi, Vi1, modulus);
    1155       11789 :           mpres_sub (Vi, Vi, S, modulus);
    1156       11789 :           mpres_sqr (Vi1, Vi1, modulus);
    1157       11789 :           mpres_sub (Vi1, Vi1, V0, modulus);
    1158             :         }
    1159             :       else
    1160             :         {
    1161             :           /* i' = 2i. 
    1162             :              V_{i'} = V_{2i} = {V_i}^2 - V0.
    1163             :              V_{i'+1} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1 */
    1164       11328 :           mpres_mul (Vi1, Vi, Vi1, modulus);
    1165       11328 :           mpres_sub (Vi1, Vi1, S, modulus);
    1166             : 
    1167       11328 :           mpres_sqr (Vi, Vi, modulus);
    1168       11328 :           mpres_sub (Vi, Vi, V0, modulus);
    1169             :         }
    1170       23117 :       j >>= 1;
    1171             :     }
    1172             : 
    1173             :   /* Least significant bit of uk is always 1 */
    1174        3477 :   mpres_mul (Vi, Vi, Vi1, modulus);
    1175        3477 :   mpres_sub (Vi, Vi, S, modulus);
    1176             : 
    1177        5361 :   while (po2-- > 0)
    1178             :     {
    1179        1884 :       mpres_sqr (Vi, Vi, modulus);
    1180        1884 :       mpres_sub (Vi, Vi, V0, modulus);
    1181             :     }
    1182             : 
    1183        3477 :   mpres_set (R, Vi, modulus);
    1184             : 
    1185        3477 :   mpres_clear (Vi, modulus);
    1186        3477 :   mpres_clear (Vi1, modulus);
    1187        3477 :   mpres_clear (V0, modulus);
    1188             : 
    1189             :   if (0)
    1190             :     {
    1191             :       mpz_t tz;
    1192             :       mpz_init (tz);
    1193             :       mpres_get_z (tz, R, modulus);
    1194             :       gmp_printf ("%Zd\n", tz);
    1195             :       mpz_clear (tz);
    1196             :     }
    1197             : }
    1198             : 
    1199             : /* 
    1200             :   Computes U_k(S), where the Chebyshev polynomial U_k(X) is defined by 
    1201             :   U_k(X + 1/X) = (X^k - 1/X^k) / (X - 1/X)
    1202             :   If R1 != NULL, stores U_{k+1}(S) there
    1203             : */
    1204             : 
    1205             : static void
    1206        2814 : U (mpres_t R, mpres_t R1, const mpres_t S, const long k, mpmod_t modulus)
    1207             : {
    1208             :   mpres_t V0, Vi, Vi1, Ui, Ui1, t;
    1209             :   unsigned long j, uk;
    1210             : 
    1211        2814 :   if (k == 0L)
    1212             :     {
    1213        2814 :       mpres_set_ui (R, 0UL, modulus); /* U_0 = 0 */
    1214        2814 :       if (R1 != NULL)
    1215        2814 :         mpres_set_ui (R1, 1UL, modulus); /* U_1 = 1 */
    1216        2814 :       return;
    1217             :     }
    1218             : 
    1219           0 :   uk = labs (k);
    1220             : 
    1221           0 :   if (uk == 1UL)
    1222             :     {
    1223           0 :       mpres_set_ui (R, 1UL, modulus);
    1224           0 :       if (k == -1)
    1225           0 :         mpres_neg (R, R, modulus);
    1226             :       
    1227           0 :       if (R1 != NULL)
    1228             :         {
    1229           0 :           if (k == -1)
    1230           0 :             mpres_set_ui (R1, 0UL, modulus);
    1231             :           else
    1232           0 :             mpres_set (R1, S, modulus); /* U_2(S) = S */
    1233             :         }
    1234             : 
    1235           0 :       return;
    1236             :     }
    1237             : 
    1238             :   if (0)
    1239             :     {
    1240             :       mpz_t tz;
    1241             :       mpz_init (tz);
    1242             :       mpres_get_z (tz, S, modulus);
    1243             :       gmp_printf ("Chebyshev_U(%ld, Mod(%Zd,N)) == ", k, tz);
    1244             :       mpz_clear (tz);
    1245             :     }
    1246             : 
    1247           0 :   mpres_init (V0, modulus);
    1248           0 :   mpres_init (Vi, modulus);
    1249           0 :   mpres_init (Vi1, modulus);
    1250           0 :   mpres_init (Ui, modulus);
    1251           0 :   mpres_init (Ui1, modulus);
    1252           0 :   mpres_init (t, modulus);
    1253             : 
    1254           0 :   for (j = 1UL; j <= uk / 2UL; j <<= 1);
    1255             : 
    1256           0 :   mpres_set_ui (Ui, 1UL, modulus);   /* Ui = U_1(S) = 1 */
    1257           0 :   mpres_set (Ui1, S, modulus);       /* Ui1 = U_2(S) = S */
    1258           0 :   mpres_add (V0, Ui, Ui, modulus);   /* V0 = V_0(S) = 2 */
    1259           0 :   mpres_set (Vi, S, modulus);        /* Vi = V_1(S) = S */
    1260           0 :   mpres_sqr (Vi1, Vi, modulus);
    1261           0 :   mpres_sub (Vi1, Vi1, V0, modulus); /* Vi1 = V_2(S) = S^2 - 2 */
    1262           0 :   j >>= 1; /* i = 1 */
    1263             : 
    1264           0 :   while (j != 0)
    1265             :     {
    1266           0 :       if ((uk & j) == 0UL)
    1267             :         {
    1268           0 :           mpres_mul (Vi1, Vi1, Vi, modulus);
    1269           0 :           mpres_sub (Vi1, Vi1, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
    1270             :           /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
    1271           0 :           mpres_sub (t, Ui1, Ui, modulus);
    1272           0 :           mpres_add (Ui1, Ui1, Ui, modulus);
    1273           0 :           mpres_mul (Ui1, Ui1, t, modulus); 
    1274           0 :           mpres_mul (Ui, Ui, Vi, modulus); /* U_{2n} = U_n V_n */
    1275           0 :           mpres_sqr (Vi, Vi, modulus);
    1276           0 :           mpres_sub (Vi, Vi, V0, modulus); /* V_{2n} = V_n^2 - 2 */
    1277             :         }
    1278             :       else
    1279             :         {
    1280             :           /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
    1281           0 :           mpres_sub (t, Ui1, Ui, modulus);
    1282           0 :           mpres_add (Ui, Ui, Ui1, modulus);
    1283           0 :           mpres_mul (Ui, Ui, t, modulus);
    1284           0 :           mpres_mul (Ui1, Ui1, Vi1, modulus); /* U_{2n+2} = U_{n+1} V_{n+1} */
    1285           0 :           mpres_mul (Vi, Vi, Vi1, modulus);
    1286           0 :           mpres_sub (Vi, Vi, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
    1287           0 :           mpres_sqr (Vi1, Vi1, modulus);
    1288           0 :           mpres_sub (Vi1, Vi1, V0, modulus); /* V_{2n+2} = V_{n+1}^2 - 2 */
    1289             :         }
    1290           0 :       j >>= 1;
    1291             :     }
    1292             : 
    1293           0 :   if (k > 0)
    1294           0 :     mpres_set (R, Ui, modulus);
    1295             :   else
    1296           0 :     mpres_neg (R, Ui, modulus);
    1297             : 
    1298           0 :   if (R1 != NULL)
    1299             :     {
    1300             :       /* Here k != -1,0,1, so k+1 is negative iff k is */
    1301           0 :       if (k > 0)
    1302           0 :         mpres_set (R1, Ui1, modulus);
    1303             :       else
    1304           0 :         mpres_neg (R1, Ui1, modulus);
    1305             :     }
    1306             : 
    1307           0 :   mpres_clear (V0, modulus);
    1308           0 :   mpres_clear (Vi, modulus);
    1309           0 :   mpres_clear (Vi1, modulus);
    1310           0 :   mpres_clear (Ui, modulus);
    1311           0 :   mpres_clear (Ui1, modulus);
    1312           0 :   mpres_clear (t, modulus);
    1313             : 
    1314             :   if (0)
    1315             :     {
    1316             :       mpz_t tz;
    1317             :       mpz_init (tz);
    1318             :       mpres_get_z (tz, R, modulus);
    1319             :       gmp_printf ("%Zd\n", tz);
    1320             :       mpz_clear (tz);
    1321             :     }
    1322             : }
    1323             : 
    1324             : 
    1325             : /* Set R[i] = V_{i+k}(Q) * F[i] or U_{i+k}(Q) * F[i], for 0 <= i < len
    1326             :    We compute V_{i+k+1}(Q) by V_{i+k}(Q)*V_1(Q) - V_{i+k-1}(Q).
    1327             :    For U, we compute U_{i+k+1}(Q) by U_{i+k}(Q)*V_1(Q) - U_{i+k-1}(Q).
    1328             :    The values of V_1(Q), V_{k-1}(Q) and V_k(Q) and V_k(Q) are in 
    1329             :    V1, Vk_1 and Vk, resp. 
    1330             :    The values of Vk_1 and Vk are clobbered. */
    1331             : static void
    1332        5628 : scale_by_chebyshev (listz_t R, const listz_t F, const unsigned long len,
    1333             :                     mpmod_t modulus, const mpres_t V1, mpres_t Vk_1, 
    1334             :                     mpres_t Vk)
    1335             : {
    1336             :   mpres_t Vt;
    1337             :   unsigned long i;
    1338             : 
    1339        5628 :   mpres_init (Vt, modulus);
    1340             : 
    1341     3139702 :   for (i = 0; i < len; i++)
    1342             :     {
    1343     3134074 :       mpres_mul_z_to_z (R[i], Vk, F[i], modulus);
    1344     3134074 :       mpres_mul (Vt, Vk, V1, modulus);
    1345     3134074 :       mpres_sub (Vt, Vt, Vk_1, modulus);
    1346     3134074 :       mpres_set (Vk_1, Vk, modulus); /* Could be a swap */
    1347     3134074 :       mpres_set (Vk, Vt, modulus); /* Could be a swap */
    1348             :     }
    1349             : 
    1350        5628 :   mpres_clear (Vt, modulus);
    1351        5628 : }
    1352             : 
    1353             : 
    1354             : /* For a given reciprocal polynomial 
    1355             :    F(x) = f_0 + sum_{i=1}^{deg} f_i V_i(x+1/x),
    1356             :    compute F(\gamma x)F(\gamma^{-1} x), with Q = \gamma + 1 / \gamma
    1357             : 
    1358             :    If NTT is used, needs 4 * deg + 3 entries in tmp.
    1359             :    If no NTT is used, needs 4 * deg + 2 + (memory use of list_sqr_reciprocal)
    1360             : */
    1361             : 
    1362             : static void
    1363        2814 : list_scale_V (listz_t R, const listz_t F, const mpres_t Q, 
    1364             :               const unsigned long deg, mpmod_t modulus, listz_t tmp, 
    1365             :               const unsigned long tmplen, 
    1366             :               mpzspv_t dct, const mpzspm_t ntt_context)
    1367             : {
    1368             :   mpres_t Vt;
    1369             :   unsigned long i;
    1370        2814 :   const listz_t G = tmp, H = tmp + 2 * deg + 1, newtmp = tmp + 4 * deg + 2;
    1371        2814 :   const unsigned long newtmplen = tmplen - 4 * deg - 2;
    1372             : #ifdef WANT_ASSERT
    1373             :   mpz_t leading;
    1374             : #endif
    1375             :   
    1376        2814 :   if (deg == 0)
    1377             :     {
    1378             :       ASSERT(tmplen >= 1);
    1379           0 :       mpz_mul (tmp[0], F[0], F[0]);
    1380           0 :       mpz_mod (R[0], tmp[0], modulus->orig_modulus);
    1381           0 :       return;
    1382             :     }
    1383             :   
    1384             :   /* Make sure newtmplen does not underflow */
    1385        2814 :   ASSERT_ALWAYS (tmplen >= 4 * deg + 2);
    1386             : #ifdef WANT_ASSERT
    1387             :   mpz_init (leading);
    1388             :   mpz_mul (leading, F[deg], F[deg]);
    1389             :   mpz_mod (leading, leading, modulus->orig_modulus);
    1390             : #endif
    1391             : 
    1392             :   /* Generate V_1(Q)/2 ... V_{deg}(Q)/2, multiply by f_i to form coefficients 
    1393             :      of G(x). Square the symmetric G(x) polynomial. */
    1394             : 
    1395        2814 :   outputf (OUTPUT_TRACE, "list_scale_V: Q=%Zd, deg = %lu\n", Q, deg);
    1396        2814 :   list_output_poly (F, deg + 1, 0, 1, "/* list_scale_V */ F(x) = ", "\n", 
    1397             :                     OUTPUT_TRACE);
    1398             : 
    1399             :   /* Compute G[i] = V_i(Q)/2 * F[i] for i = 0, ..., deg.
    1400             :      For i=0, V_0(Q) = 2, so G[0] = F[0], 
    1401             :      which leaves deg entries to process */
    1402             : 
    1403        2814 :   mpz_set (G[0], F[0]);
    1404             : 
    1405             : #if defined(_OPENMP)
    1406             : #pragma omp parallel if (deg > 1000)
    1407             : #endif
    1408             :   {
    1409        2814 :     const int nr_chunks = omp_get_num_threads();
    1410        2814 :     const int thread_nr = omp_get_thread_num();
    1411             :     mpmod_t modulus_local;
    1412             :     unsigned long l, start_i;
    1413             :     mpres_t Vi, Vi_1;
    1414             :     
    1415        2814 :     l = (deg - 1) / nr_chunks + 1; /* l = ceil (deg / nr_chunks) */
    1416        2814 :     start_i = thread_nr * l + 1;
    1417        2814 :     if (start_i <= deg + 1)
    1418        2814 :       l = MIN(l, deg + 1 - start_i);
    1419             :     else
    1420           0 :       l = 0;
    1421             : 
    1422        2814 :     mpmod_init_set (modulus_local, modulus);
    1423        2814 :     mpres_init (Vi_1, modulus_local);
    1424        2814 :     mpres_init (Vi, modulus_local);
    1425             :     
    1426        2814 :     V (Vi, Q, start_i, modulus_local);
    1427        2814 :     mpres_div_2exp (Vi, Vi, 1, modulus_local);
    1428        2814 :     V (Vi_1, Q, start_i - 1UL, modulus_local);
    1429        2814 :     mpres_div_2exp (Vi_1, Vi_1, 1, modulus_local);
    1430        2814 :     scale_by_chebyshev (G + start_i, F + start_i, l, modulus_local, 
    1431             :                         Q, Vi_1, Vi);
    1432             :     
    1433        2814 :     mpres_clear (Vi_1, modulus_local);
    1434        2814 :     mpres_clear (Vi, modulus_local);
    1435        2814 :     mpmod_clear (modulus_local);
    1436             :   }
    1437             : 
    1438             : 
    1439        2814 :   list_output_poly (G, deg + 1, 0, 1, "/* list_scale_V */ G(x) = ", "\n", 
    1440             :                     OUTPUT_TRACE);
    1441             : 
    1442             :   /* Now square the G polynomial in G[0 .. deg], put result in
    1443             :      G[0 .. 2*deg] */
    1444             : 
    1445             :   /* Bugfix: ks_multiply() does not like negative coefficients. FIXME */
    1446             : 
    1447     1572665 :   for (i = 0; i <= deg; i++)
    1448     1569851 :     if (mpz_sgn (G[i]) < 0)
    1449             :       {
    1450           0 :         mpz_add (G[i], G[i], modulus->orig_modulus);
    1451             :         /* FIXME: make sure the absolute size does not "run away" */
    1452           0 :         if (mpz_sgn (G[i]) < 0)
    1453             :           {
    1454           0 :             outputf (OUTPUT_ERROR, "list_scale_V: G[%lu] still negative\n", i);
    1455           0 :             mpz_mod (G[i], G[i], modulus->orig_modulus);
    1456             :           }
    1457             :       }
    1458             : 
    1459        2814 :   if (dct != NULL && ntt_context != NULL)
    1460        2175 :     ntt_sqr_reciprocal (G, G, dct, deg + 1, ntt_context);
    1461             :   else
    1462         639 :     list_sqr_reciprocal (G, G, deg + 1, modulus->orig_modulus, 
    1463             :                          newtmp, newtmplen);
    1464             : 
    1465        2814 :   list_output_poly (G, 2 * deg + 1, 0, 1, "/* list_scale_V */ G(x)^2 == ", 
    1466             :                     "\n", OUTPUT_TRACE);
    1467             : 
    1468             :   /* Compute H[i-1] = U_i(Q)/2 * F[i] for i = 1, ..., deg */
    1469             : 
    1470             : #if defined(_OPENMP)
    1471             : #pragma omp parallel if (deg > 1000)
    1472             : #endif
    1473             :   {
    1474        2814 :     const int nr_chunks = omp_get_num_threads();
    1475        2814 :     const int thread_nr = omp_get_thread_num();
    1476             :     mpmod_t modulus_local;
    1477             :     unsigned long l, start_i;
    1478             :     mpres_t Ui, Ui_1;
    1479             :     
    1480        2814 :     l = (deg - 1) / nr_chunks + 1; /* l = ceil(deg / nr_chunks) */
    1481        2814 :     start_i = thread_nr * l + 1UL;
    1482        2814 :     if (start_i <= deg + 1)
    1483        2814 :       l = MIN(l, deg + 1 - start_i);
    1484             :     else
    1485           0 :       l = 0;
    1486             :     
    1487        2814 :     mpmod_init_set (modulus_local, modulus);
    1488        2814 :     mpres_init (Ui_1, modulus_local);
    1489        2814 :     mpres_init (Ui, modulus_local);
    1490             :     
    1491        2814 :     U (Ui_1, Ui, Q, start_i - 1, modulus_local);
    1492        2814 :     mpres_div_2exp (Ui, Ui, 1, modulus_local);
    1493        2814 :     mpres_div_2exp (Ui_1, Ui_1, 1, modulus_local);
    1494             :     
    1495        2814 :     scale_by_chebyshev (H - 1 + start_i, F + start_i, l, modulus_local, 
    1496             :                         Q, Ui_1, Ui);
    1497             :     
    1498        2814 :     mpres_clear (Ui_1, modulus_local);
    1499        2814 :     mpres_clear (Ui, modulus_local);
    1500        2814 :     mpmod_clear (modulus_local);
    1501             :   }
    1502             : 
    1503             :   
    1504             :   /* Convert H to standard basis */
    1505             :   /* We can do it in-place with H - 1 = H_U. */
    1506             : 
    1507     1565126 :   for (i = deg; i >= 3; i--)
    1508             :     {
    1509     1562312 :       mpz_add (H[i - 3], H[i - 3], H[i - 1]);
    1510     1562312 :       if (mpz_cmp (H[i - 3], modulus->orig_modulus) >= 0)
    1511      780677 :         mpz_sub (H[i - 3], H[i - 3], modulus->orig_modulus);
    1512             :     }
    1513             :   
    1514             :   /* U_2(X+1/X) = (X^2 - 1/X^2)/(X-1/X) = X+1/X = V_1(X+1/X),
    1515             :      so no addition occures here */
    1516             :   /* if (deg >= 2)
    1517             :      mpz_set (H[1], H[1]); Again, a no-op. */
    1518             :   
    1519             :   /* U_1(X+1/X) = 1, so this goes to coefficient of index 0 in std. basis */
    1520             :   /* mpz_set (H[0], H[0]); Another no-op. */
    1521             :   
    1522             :   /* Now H[0 ... deg-1] contains the deg coefficients in standard basis
    1523             :      of symmetric H(X) of degree 2*deg-2. */
    1524             :   
    1525        2814 :   list_output_poly (H, deg, 0, 1, "/* list_scale_V */ H(x) = ", "\n",
    1526             :                     OUTPUT_TRACE);
    1527             : 
    1528             :   /* Square the symmetric H polynomial of degree 2*deg-2 (i.e. with deg 
    1529             :      coefficents in standard basis in H[0 ... deg-1]) */
    1530             : 
    1531             :   /* Bugfix: ks_multiply() does not like negative coefficients. */
    1532             : 
    1533     1572665 :   for (i = 0; i <= deg; i++)
    1534     1569851 :     if (mpz_sgn (H[i]) < 0)
    1535             :       {
    1536           0 :         mpz_add (H[i], H[i], modulus->orig_modulus);
    1537           0 :         if (mpz_sgn (H[i]) < 0)
    1538             :           {
    1539           0 :             outputf (OUTPUT_ERROR, "list_scale_V: H[%lu] still negative\n", i);
    1540           0 :             mpz_mod (H[i], H[i], modulus->orig_modulus);
    1541             :           }
    1542             :       }
    1543             : 
    1544        2814 :   if (dct != NULL && ntt_context != NULL)
    1545        2175 :     ntt_sqr_reciprocal (H, H, dct, deg, ntt_context);
    1546             :   else
    1547         639 :     list_sqr_reciprocal (H, H, deg, modulus->orig_modulus, 
    1548             :                          newtmp, newtmplen);
    1549             : 
    1550             :   /* Now there are the 2*deg-1 coefficients in standard basis of a 
    1551             :      symmetric polynomial of degree 4*deg - 4 in H[0 ... 2*deg-2] */
    1552             : 
    1553        2814 :   list_output_poly (H, 2*deg - 1, 0, 1, "/* list_scale_V */ H(x)^2 == ", "\n",
    1554             :                     OUTPUT_TRACE);
    1555             : 
    1556             :   /* Multiply by Q^2-4 */
    1557        2814 :   mpres_init (Vt, modulus);
    1558        2814 :   mpres_sqr (Vt, Q, modulus);
    1559        2814 :   mpres_sub_ui (Vt, Vt, 4, modulus);
    1560             : 
    1561             : #if defined(_OPENMP)
    1562             : #pragma omp parallel if (deg > 1000)
    1563             :   {
    1564             :     mpmod_t modulus_local;
    1565             : #ifdef _MSC_VER
    1566             :     long i; /* Microsoft C/C++ stuck on OpenMP 2.0 :( */
    1567             : #endif
    1568             :     mpmod_init_set (modulus_local, modulus);
    1569             :     
    1570             : #pragma omp for
    1571             :     for (i = 0; i <= 2 * deg - 2; i++)
    1572             :       mpres_mul_z_to_z (H[i], Vt, H[i], modulus_local);
    1573             :     mpmod_clear (modulus_local);
    1574             :   }
    1575             : #else
    1576     3134074 :   for (i = 0; i <= 2 * deg - 2; i++)
    1577     3131260 :     mpres_mul_z_to_z (H[i], Vt, H[i], modulus);
    1578             : #endif
    1579             : 
    1580        2814 :   list_output_poly (H, 2 * deg - 1, 0, 1, "/* list_scale_V */ "
    1581             :                     "H(x)^2*(Q^2-4) == ", "\n", OUTPUT_TRACE);
    1582             : 
    1583             : 
    1584             :   /* Multiply by (X - 1/X)^2 = X^2 - 2 + 1/X^2 and subtract from G */
    1585             :   ASSERT (newtmplen > 0UL);
    1586        2814 :   if (deg == 1)
    1587             :     {
    1588             :       /* H(X) has degree 2*deg-2 = 0, so H(X) = h_0
    1589             :          H(X) * (X - 1/X)^2 = -2 h_0 + h_0 V_2(Y)  */
    1590         903 :       mpz_mul_2exp (newtmp[0], H[0], 1UL);
    1591         903 :       mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*H[0] */
    1592         903 :       mpz_sub (G[2], G[2], H[0]);
    1593             :     }
    1594        1911 :   else if (deg == 2)
    1595             :     {
    1596             :       /* H(X) has degree 2*deg-2 = 2, , so 
    1597             :          H(X) = h_0 + h_1 (X+1/X) + h_2 (X^2+1/X^2)
    1598             : 
    1599             :          H(X) * (X - 1/X)^2 =
    1600             :          -2*(h_0 - h_2) - h_1 * V_1(Y) + (h_0 - 2*h_2) * V_2(Y) + 
    1601             :          h_1 * V_3(Y) + h_2 * V_4(Y)
    1602             :       */
    1603           4 :       mpz_sub (newtmp[0], H[0], H[2]);          /* h_0 - h_2 */
    1604           4 :       mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* 2*(h_0 - h_2) */
    1605           4 :       mpz_add (G[0], G[0], newtmp[0]);          /* G[0] -= -2*(h_0 - h_2) */
    1606             : 
    1607           4 :       mpz_add (G[1], G[1], H[1]);               /* G[1] -= -h_1 */
    1608           4 :       mpz_sub (newtmp[0], newtmp[0], H[0]);     /* h_0 - 2*h_2 */
    1609           4 :       mpz_sub (G[2], G[2], newtmp[0]);          /* G[2] -= h_0 - 2*h_2 */
    1610           4 :       mpz_sub (G[3], G[3], H[1]);               /* G[3] -= h_1 */
    1611           4 :       mpz_sub (G[4], G[4], H[2]);               /* G[3] -= h_2 */
    1612             :     }
    1613             :   else
    1614             :     {
    1615             :       /* Let H(X) = h_0 + \sum_{i=1}^{n} h_i V_i(Y), Y = X+1/X. Then
    1616             :          (x - 1/x)^2 H(X) = 
    1617             :          -2(h_0 - h_2) +
    1618             :          (- h_1 + h_3) V_1(Y) +
    1619             :          \sum_{i=2}^{n-2} (h_{i-2} - 2h_i + h_{i+2}) V_i(Y) +
    1620             :          (h_{n-3} - 2h_{n-1}) V_{n-1}(Y) +
    1621             :          (h_{n-2} - 2h_n) V_n(Y) +
    1622             :          h_{n-1} V_{n+1}(Y) +
    1623             :          h_n V_{n+2}(Y)
    1624             :          
    1625             :          In our case, n = 2 * deg - 2
    1626             :       */
    1627        1907 :       mpz_sub (newtmp[0], H[0], H[2]);
    1628        1907 :       mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* t[0] = 2*(h_0 - h_2) */
    1629        1907 :       mpz_add (G[0], G[0], newtmp[0]);          /* G[0] -= -2*(h_0 - h_2) */
    1630             :       
    1631        1907 :       mpz_add (G[1], G[1], H[1]);
    1632        1907 :       mpz_sub (G[1], G[1], H[3]); /* G[1] -= -h_1 + h_3 */
    1633             :       
    1634     3124624 :       for (i = 2; i <= 2 * deg - 4; i++)
    1635             :         {
    1636     3122717 :           mpz_mul_2exp (newtmp[0], H[i], 1);
    1637     3122717 :           mpz_sub (newtmp[0], newtmp[0], H[i - 2]);
    1638     3122717 :           mpz_sub (newtmp[0], newtmp[0], H[i + 2]); /* 2h_i-h_{i-2}-h_{i+2} */
    1639     3122717 :           mpz_add (G[i], G[i], newtmp[0]); /* G[i] -= -2h_i+h_{i-2}+h_{i+2} */
    1640             :         }
    1641             :       
    1642        5721 :       for ( ; i <= 2 * deg - 2; i++)
    1643             :         {
    1644        3814 :           mpz_mul_2exp (newtmp[0], H[i], 1UL);
    1645        3814 :           mpz_sub (newtmp[0], H[i - 2], newtmp[0]); /* h_{n-3} - 2h_{n-1} */
    1646        3814 :           mpz_sub (G[i], G[i], newtmp[0]);
    1647             :         }
    1648             :       
    1649        1907 :       mpz_sub (G[i], G[i], H[i - 2]);
    1650        1907 :       mpz_sub (G[i + 1], G[i + 1], H[i - 1]);
    1651             :     }
    1652             : 
    1653     3139702 :   for (i = 0; i <= 2 * deg; i++)
    1654     3136888 :     mpz_mod (R[i], G[i], modulus->orig_modulus);
    1655             : 
    1656        2814 :   if (test_verbose (OUTPUT_TRACE))
    1657       18526 :     for (i = 0; i <= 2 * deg; i++)
    1658       18435 :       outputf (OUTPUT_TRACE, "list_scale_V: R[%lu] = %Zd\n", i, R[i]);
    1659             : 
    1660             : #ifdef WANT_ASSERT
    1661             :   mpz_mod (R[2 * deg], R[2 * deg], modulus->orig_modulus);
    1662             :   ASSERT (mpz_cmp (leading, R[2 * deg]) == 0);
    1663             :   mpz_clear (leading);
    1664             : #endif
    1665             : 
    1666        2814 :   mpres_clear (Vt, modulus);
    1667             : }
    1668             : 
    1669             : 
    1670             : #ifdef WANT_ASSERT
    1671             : /* Check if l is an (anti-)symmetric, possibly monic, polynomial. 
    1672             :    Returns -1 if it is (anti-)symmetric, or the smallest index i where 
    1673             :    l[i] != l[len - 1 + monic - i])
    1674             :    If anti == 1, the list is checked for symmetry, if it is -1, for
    1675             :    antisymmetry.
    1676             :    This function is used only if assertions are enabled.
    1677             : */
    1678             : 
    1679             : static long int ATTRIBUTE_UNUSED
    1680             : list_is_symmetric (listz_t l, unsigned long len, int monic, int anti, 
    1681             :                    mpz_t modulus, mpz_t tmp)
    1682             : {
    1683             :     unsigned long i;
    1684             : 
    1685             :     ASSERT (monic == 0 || monic == 1);
    1686             :     ASSERT (anti == 1 || anti == -1);
    1687             : 
    1688             :     if (monic && anti == 1 && mpz_cmp_ui (l[0], 1) != 0)
    1689             :         return 0L;
    1690             : 
    1691             :     if (monic && anti == -1)
    1692             :       {
    1693             :         mpz_sub_ui (tmp, modulus, 1);
    1694             :         if (mpz_cmp (tmp, l[0]) != 0)
    1695             :           return 0L;
    1696             :       }
    1697             : 
    1698             :     for (i = monic; i < len / 2; i++)
    1699             :       {
    1700             :         if (anti == -1)
    1701             :           {
    1702             :             /* Negate (mod modulus) */
    1703             :             if (mpz_sgn (l[i]) == 0)
    1704             :               {
    1705             :                 if (mpz_sgn (l[len - 1 + monic - i]) != 0)
    1706             :                   return (long) i;
    1707             :               }
    1708             :             else
    1709             :               {
    1710             :                 mpz_sub (tmp, modulus, l[i]);
    1711             :                 if (mpz_cmp (tmp, l[len - 1 + monic - i]) != 0)
    1712             :                   return (long) i;
    1713             :               }
    1714             :           }
    1715             :         else if (mpz_cmp (l[i], l[len - 1 + monic - i]) != 0)
    1716             :             return (long) i;
    1717             :       }
    1718             : 
    1719             :     return -1L;
    1720             : }
    1721             : #endif
    1722             : 
    1723             : #if 0 && defined(WANT_ASSERT)
    1724             : /* Evaluate a polynomial of degree n-1 with all coefficients given in F[],
    1725             :    or of degree n with an implicit leading 1 monomial not stored in F[],
    1726             :    at x modulo modulus. Result goes in r. tmp needs 2 entries. */
    1727             : 
    1728             : static void 
    1729             : list_eval_poly (mpz_t r, const listz_t F, const mpz_t x, 
    1730             :                 const unsigned long n, const int monic, const mpz_t modulus, 
    1731             :                 listz_t tmp)
    1732             : {
    1733             :   unsigned long i;
    1734             : 
    1735             :   mpz_set_ui (tmp[0], 1UL);
    1736             :   mpz_set_ui (r, 0UL);
    1737             : 
    1738             :   for (i = 0UL; i < n; i++)
    1739             :     {
    1740             :       /* tmp[0] = x^i */
    1741             :       mpz_mul (tmp[1], F[i], tmp[0]);
    1742             :       mpz_mod (tmp[1], tmp[1], modulus);
    1743             :       mpz_add (r, r, tmp[1]);
    1744             : 
    1745             :       mpz_mul (tmp[1], tmp[0], x);
    1746             :       mpz_mod (tmp[0], tmp[1], modulus);
    1747             :     }
    1748             : 
    1749             :   if (monic)
    1750             :     mpz_add (r, r, tmp[0]);
    1751             : 
    1752             :   mpz_mod (r, r, modulus);
    1753             : }
    1754             : #endif
    1755             : 
    1756             : /* return the memory needed in the F[] array for poly_from_sets_V */
    1757             : static unsigned long
    1758         374 : mem_poly_from_sets_V (sets_long_t *sets)
    1759             : {
    1760             :   unsigned long c, deg, i, nr;
    1761         374 :   set_long_t *set = sets->sets;
    1762         374 :   unsigned long mem, maxmem = 0;
    1763             :   
    1764         374 :   deg = 1UL;
    1765        2193 :   for (nr = sets->nr - 1UL; nr > 0UL; nr--)
    1766             :     {
    1767        1819 :       set = sets_nextset (sets->sets); /* Skip first set */
    1768        7180 :       for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
    1769        5361 :         set = sets_nextset (set);
    1770        1819 :       c = set->card;
    1771        1819 :       if (c == 2UL)
    1772        1061 :         deg *= 2UL;
    1773             :       else
    1774             :         {
    1775         758 :           i = (c - 1UL) / 2UL - 1; /* maximal value of i */
    1776             :           /* list_scale_V needs 2*deg+1 entries starting at
    1777             :              F + (2UL * i + 1UL) * (deg + 1UL) */
    1778         758 :           mem = (2UL * i + 1UL) * (deg + 1UL) + (2 * deg + 1UL);
    1779         758 :           if (mem > maxmem)
    1780         758 :             maxmem = mem;
    1781         758 :           deg *= c;
    1782             :         }
    1783             :     }
    1784             : 
    1785         374 :   return maxmem;
    1786             : }
    1787             : 
    1788             : /* Build a polynomial with roots r^2i, i in the sumset of the sets in "sets".
    1789             :    The parameter Q = r + 1/r. This code uses the fact that the polynomials 
    1790             :    are symmetric. Requires that the first set in "sets" has cardinality 2,
    1791             :    all sets must be symmetric around 0. The resulting polynomial of degree 
    1792             :    2*d is F(x) = f_0 + \sum_{1 <= i <= d} f_i (x^i + 1/x^i). The coefficient
    1793             :    f_i is stored in F[i], which therefore needs d+1 elements. */
    1794             : 
    1795             : static unsigned long
    1796         480 : poly_from_sets_V (listz_t F, const mpres_t Q, sets_long_t *sets, 
    1797             :                   listz_t tmp, const unsigned long tmplen, mpmod_t modulus,
    1798             :                   mpzspv_t dct, const mpzspm_t ntt_context)
    1799             : {
    1800             :   unsigned long c, deg, i, nr;
    1801         480 :   set_long_t *set = sets->sets;
    1802             :   mpres_t Qt;
    1803             :   
    1804         480 :   ASSERT_ALWAYS (sets->nr > 0UL);
    1805         480 :   ASSERT_ALWAYS (set->card == 2UL); /* Check that the cardinality of 
    1806             :                                        first set is 2 */
    1807             :   /* Check that first set is symmetric around 0 (we write card-1
    1808             :      instead of 1 to avoid a compiler warning with clang 2.9) */
    1809         480 :   ASSERT_ALWAYS (set->elem[0] == -set->elem[set->card - 1]);
    1810             : 
    1811         480 :   if (test_verbose (OUTPUT_TRACE))
    1812             :     {
    1813             :       mpz_t t;
    1814          10 :       mpz_init (t);
    1815          10 :       mpres_get_z (t, Q, modulus);
    1816          10 :       outputf (OUTPUT_TRACE, "poly_from_sets_V (F, Q = %Zd, sets)\n", t);
    1817          10 :       mpz_clear (t);
    1818             :     }
    1819             : 
    1820         480 :   mpres_init (Qt, modulus);
    1821             :   
    1822         480 :   outputf (OUTPUT_DEVVERBOSE, " (processing set of size 2");
    1823             : 
    1824         480 :   V (Qt, Q, set->elem[0], modulus); /* First set in sets is {-k, k} */ 
    1825         480 :   V (Qt, Qt, 2UL, modulus);         /* Qt = V_2k(Q) */
    1826             :   
    1827         480 :   mpres_neg (Qt, Qt, modulus);
    1828         480 :   mpres_get_z (F[0], Qt, modulus);
    1829         480 :   mpz_set_ui (F[1], 1UL);
    1830         480 :   deg = 1UL;
    1831             :   /* Here, F(x) = (x - r^{2k_1})(x - r^{-2k_1}) / x = 
    1832             :                   (x^2 - x (r^{2k_1} + r^{-2k_1}) + 1) / x =
    1833             :                   (x + 1/x) - V_{2k_1}(r + 1/r) */
    1834             : 
    1835        2809 :   for (nr = sets->nr - 1UL; nr > 0UL; nr--)
    1836             :     {
    1837             :       /* Assuming the sets are sorted in order of ascending cardinality, 
    1838             :          we process them back-to-front so the sets of cardinality 2 are 
    1839             :          processed last, but skipping the first set which we processed 
    1840             :          already. */
    1841             :       
    1842        2329 :       set = sets_nextset (sets->sets); /* Skip first set */
    1843        8944 :       for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
    1844        6615 :         set = sets_nextset (set);
    1845             :         
    1846             :       /* Process this set. We assume it is either of cardinality 2, or of 
    1847             :          odd cardinality */
    1848        2329 :       c = set->card;
    1849        2329 :       outputf (OUTPUT_DEVVERBOSE, " %lu", c);
    1850             : 
    1851        2329 :       if (c == 2UL)
    1852             :         {
    1853             :           /* Check it's symmetric (we write c-1 instead of 1 to avoid a
    1854             :            compiler warning with clang 2.9) */
    1855        1357 :           ASSERT_ALWAYS (set->elem[0] == -set->elem[c - 1]);
    1856        1357 :           V (Qt, Q, set->elem[0], modulus);
    1857        1357 :           V (Qt, Qt, 2UL, modulus);
    1858        1357 :           list_scale_V (F, F, Qt, deg, modulus, tmp, tmplen, dct, 
    1859             :                         ntt_context);
    1860        1357 :           deg *= 2UL;
    1861        1357 :           ASSERT_ALWAYS (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
    1862             :         }
    1863             :       else
    1864             :         {
    1865         972 :           ASSERT_ALWAYS (c % 2UL == 1UL);
    1866         972 :           ASSERT_ALWAYS (set->elem[(c - 1UL) / 2UL] == 0UL);
    1867             :           /* Generate the F(Q^{2k_i} * X)*F(Q^{-2k_i} * X) polynomials.
    1868             :              Each is symmetric of degree 2*deg, so each has deg+1 coefficients
    1869             :              in standard basis. */
    1870        2429 :           for (i = 0UL; i < (c - 1UL) / 2UL; i++)
    1871             :             {
    1872             :               /* Check it's symmetric */
    1873        1457 :               ASSERT_ALWAYS (set->elem[i] == -set->elem[c - 1L - i]);
    1874        1457 :               V (Qt, Q, set->elem[i], modulus);
    1875        1457 :               V (Qt, Qt, 2UL, modulus);
    1876             :               ASSERT (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
    1877        1457 :               list_scale_V (F + (2UL * i + 1UL) * (deg + 1UL), F, Qt, deg, 
    1878             :                             modulus, tmp, tmplen, dct, ntt_context);
    1879             :               ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL * deg], 
    1880             :                       1UL) == 0); /* Check it's monic */
    1881             :             }
    1882             :           /* Multiply the polynomials */
    1883        2429 :           for (i = 0UL; i < (c - 1UL) / 2UL; i++)
    1884             :             {
    1885             :               /* So far, we have the product 
    1886             :                  F(X) * F(Q^{2k_j} * X) * F(Q^{-2k_j} * X), 1 <= j <= i,
    1887             :                  at F. This product has degree 2 * deg + i * 4 * deg, that is
    1888             :                  (2 * i + 1) * 2 * deg, which means (2 * i + 1) * deg + 1
    1889             :                  coefficients in F[0 ... (i * 2 + 1) * deg]. */
    1890             :               ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * deg], 1UL) == 0);
    1891             :               ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL*deg], 
    1892             :                                   1UL) == 0);
    1893        1457 :               list_output_poly (F, (2UL * i + 1UL) * deg + 1, 0, 1, 
    1894             :                                 "poly_from_sets_V: Multiplying ", "\n",
    1895             :                                 OUTPUT_TRACE);
    1896        1457 :               list_output_poly (F + (2UL * i + 1UL) * (deg + 1UL), 
    1897        1457 :                                 2UL * deg + 1UL, 0, 1, " and ", "\n", 
    1898             :                                 OUTPUT_TRACE);
    1899        1457 :               list_mul_reciprocal (F, 
    1900        1457 :                                    F, (2UL * i + 1UL) * deg + 1UL, 
    1901        1457 :                                    F + (2UL * i + 1UL) * (deg + 1UL), 
    1902        1457 :                                    2UL * deg + 1UL, modulus->orig_modulus,
    1903             :                                    tmp, tmplen);
    1904        1457 :               list_mod (F, F, (2UL * i + 3UL) * deg + 1UL, 
    1905        1457 :                         modulus->orig_modulus);
    1906        1457 :               list_output_poly (F, (2UL * i + 3UL) * deg + 1UL, 0, 1, 
    1907             :                                 " = ", "\n", OUTPUT_TRACE);
    1908             :               ASSERT (mpz_cmp_ui (F[(2UL * i + 3UL) * deg], 1UL) == 0);
    1909             :             }
    1910         972 :           deg *= c;
    1911             :         }
    1912             :     }
    1913             : 
    1914         480 :   mpres_clear (Qt, modulus);
    1915         480 :   outputf (OUTPUT_DEVVERBOSE, ")");
    1916             : 
    1917         480 :   return deg;
    1918             : }
    1919             : 
    1920             : static int
    1921         374 : build_F_ntt (listz_t F, const mpres_t P_1, sets_long_t *S_1,
    1922             :              const faststage2_param_t *params, mpmod_t modulus)
    1923             : {
    1924             :   mpzspm_t F_ntt_context;
    1925             :   mpzspv_t F_ntt;
    1926             :   unsigned long tmplen;
    1927             :   listz_t tmp;
    1928             :   long timestart, realstart;
    1929             :   unsigned long i;
    1930             : 
    1931         374 :   timestart = cputime ();
    1932         374 :   realstart = realtime ();
    1933             :   
    1934             :   /* Precompute the small primes, primitive roots and inverses etc. for 
    1935             :      the NTT. The code to multiply wants a 3*k-th root of unity, where 
    1936             :      k is the smallest power of 2 with k > s_1/2 */
    1937             :   
    1938         374 :   F_ntt_context = mpzspm_init (3UL << ceil_log2 (params->s_1 / 2 + 1), 
    1939         374 :                                modulus->orig_modulus);
    1940         374 :   if (F_ntt_context == NULL)
    1941             :     {
    1942           0 :       outputf (OUTPUT_ERROR, "Could not initialise F_ntt_context, "
    1943             :                "presumably out of memory\n");
    1944           0 :       return ECM_ERROR;
    1945             :     }
    1946             :   
    1947         374 :   print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for building F = ",
    1948             :                     F_ntt_context);
    1949             :   
    1950         374 :   outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
    1951             :   
    1952         374 :   tmplen = params->s_1 + 100;
    1953         374 :   tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
    1954         374 :   F_ntt = mpzspv_init (1UL << ceil_log2 (params->s_1 / 2 + 1), F_ntt_context);
    1955             : 
    1956         374 :   i = poly_from_sets_V (F, P_1, S_1, tmp, tmplen, modulus, F_ntt,
    1957             :                         F_ntt_context);
    1958         374 :   ASSERT_ALWAYS(2 * i == params->s_1);
    1959         374 :   ASSERT_ALWAYS(mpz_cmp_ui (F[i], 1UL) == 0);
    1960             :   
    1961         374 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    1962         374 :   if (test_verbose (OUTPUT_TRACE))
    1963             :     {
    1964        8080 :       for (i = 0; i < params->s_1 / 2 + 1; i++)
    1965        8072 :         outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
    1966           8 :       outputf (OUTPUT_TRACE, "f(x) = f_0");
    1967        8072 :       for (i = 1; i < params->s_1 / 2 + 1; i++)
    1968        8064 :         outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
    1969           8 :       outputf (OUTPUT_TRACE, "/* PARI */ \n");
    1970             :     }
    1971             :   
    1972         374 :   clear_list (tmp, tmplen);
    1973         374 :   tmp = NULL;
    1974         374 :   mpzspv_clear (F_ntt, F_ntt_context);
    1975         374 :   F_ntt = NULL;
    1976         374 :   mpzspm_clear (F_ntt_context);
    1977         374 :   F_ntt_context = NULL;
    1978             : 
    1979         374 :   return 0;
    1980             : }
    1981             : 
    1982             : /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l. 
    1983             :    x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P. 
    1984             :    Stores the result in g[0 ... l] and/or in g_ntt[offset ... offset + l] */
    1985             : 
    1986             : static void
    1987         452 : pm1_sequence_g (listz_t g_mpz, mpzspv_t g_ntt, const mpres_t b_1, 
    1988             :                 const unsigned long P, const long M_param, 
    1989             :                 const unsigned long l_param, const mpz_t m_1, const long k_2, 
    1990             :                 mpmod_t modulus_param, const mpzspm_t ntt_context)
    1991             : {
    1992             :   mpres_t r[3], x_0, x_Mi;
    1993             :   mpz_t t;
    1994             :   unsigned long i;
    1995             :   long timestart, realstart;
    1996         452 :   long M = M_param;
    1997         452 :   unsigned long l = l_param, offset = 0UL;
    1998             :   mpmod_t modulus;
    1999         452 :   int want_output = 1;
    2000             : 
    2001         452 :   outputf (OUTPUT_VERBOSE, "Computing g_i");
    2002         452 :   outputf (OUTPUT_DEVVERBOSE, "\npm1_sequence_g: P = %lu, M_param = %lu, "
    2003             :            "l_param = %lu, m_1 = %Zd, k_2 = %lu\n", 
    2004             :            P, M_param, l_param, m_1, k_2);
    2005         452 :   timestart = cputime ();
    2006         452 :   realstart = realtime ();
    2007             : 
    2008             : #ifdef _OPENMP
    2009             : #pragma omp parallel if (l > 100) private(r, x_0, x_Mi, t, i, M, l, offset, modulus, want_output)
    2010             :   {
    2011             :     /* When multi-threading, we adjust the parameters for each thread */
    2012             : 
    2013             :     const int nr_chunks = omp_get_num_threads();
    2014             :     const int thread_nr = omp_get_thread_num();
    2015             :     
    2016             :     l = (l_param - 1) / nr_chunks + 1; /* = ceil(l_param / nr_chunks) */
    2017             :     offset = thread_nr * l;
    2018             :     if (offset <= l_param)
    2019             :       l = MIN(l, l_param - offset);
    2020             :     else
    2021             :       l = 0;
    2022             :     outputf (OUTPUT_DEVVERBOSE, 
    2023             :              "pm1_sequence_g: thread %d has l = %lu, offset = %lu.\n", 
    2024             :              thread_nr, l, offset);
    2025             :     M = M_param - (long) offset;
    2026             :     
    2027             :     /* Let only the master thread print stuff */
    2028             :     want_output = (thread_nr == 0);
    2029             : 
    2030             :     if (want_output)
    2031             :       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    2032             : #endif
    2033             : 
    2034             :   /* Make a private copy of the mpmod_t struct */
    2035         452 :   mpmod_init_set (modulus, modulus_param);
    2036             : 
    2037         452 :   mpz_init (t);
    2038         452 :   mpres_init (r[0], modulus);
    2039         452 :   mpres_init (r[1], modulus);
    2040         452 :   mpres_init (r[2], modulus);
    2041         452 :   mpres_init (x_0, modulus);
    2042         452 :   mpres_init (x_Mi, modulus);
    2043             : 
    2044         452 :   if (want_output)
    2045             :     {
    2046         452 :       if (test_verbose (OUTPUT_TRACE))
    2047             :         { 
    2048          12 :           mpres_get_z (t, b_1, modulus);
    2049          12 :           outputf (OUTPUT_TRACE, "\n/* pm1_sequence_g */ N = %Zd; "
    2050             :                    "b_1 = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, t);
    2051          12 :           outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ P = %lu; M = %ld; "
    2052             :                    "m_1 = %Zd; /* PARI */\n", P, M, m_1);
    2053          12 :           outputf (OUTPUT_TRACE, 
    2054             :                    "/* pm1_sequence_g */ r = b_1^P; /* PARI */\n");
    2055          12 :           outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 = "
    2056             :                    "b_1^(2*%ld + (2*m_1 + 1)*P); /* PARI */\n", k_2);
    2057             :         }
    2058             :     }
    2059             : 
    2060             :   /* We use (M-(i+1))^2 = (M-i)^2 + 2*(-M+i) + 1 */
    2061         452 :   mpz_set_ui (t, P);
    2062         452 :   mpres_pow (r[0], b_1, t, modulus);     /* r[0] = b_1^P = r */
    2063         452 :   if (test_verbose (OUTPUT_TRACE))
    2064             :     {
    2065          12 :       mpres_get_z (t, r[0], modulus);
    2066          12 :       outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ r == %Zd /* PARI C */\n", t);
    2067             :     }
    2068             :   
    2069             :   /* FIXME: This is a huge mess, clean up some time */
    2070             : 
    2071         452 :   mpz_set_si (t, M);
    2072         452 :   mpz_neg (t, t);
    2073         452 :   mpz_mul_2exp (t, t, 1UL);
    2074         452 :   mpz_add_ui (t, t, 1UL);
    2075             :   /* Warning: t might be negative here. */
    2076         452 :   mpres_pow (r[1], r[0], t, modulus);    /* r[1] = r^{2(-M+i)+1}, i = 0 */
    2077         452 :   mpz_set_si (t, M);
    2078         452 :   mpz_mul (t, t, t);                     /* t = M^2 */
    2079         452 :   mpres_pow (r[2], r[0], t, modulus);    /* r[2] = r^{(M-i)^2}, i = 0 */
    2080         452 :   mpres_sqr (r[0], r[0], modulus); /* r[0] = r^2 */
    2081             : 
    2082         452 :   mpz_mul_2exp (t, m_1, 1UL);
    2083         452 :   mpz_add_ui (t, t, 1UL);
    2084         452 :   mpz_mul_ui (t, t, P);
    2085         452 :   mpz_add_si (t, t, k_2);
    2086         452 :   mpz_add_si (t, t, k_2);
    2087         452 :   if (want_output)
    2088         452 :     outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ 2*%ld + (2*%Zd + 1)*P == "
    2089             :              "%Zd /* PARI C */\n", k_2, m_1, t);
    2090             : 
    2091             :   /* Warning: t might be negative here. */
    2092         452 :   mpres_pow (x_0, b_1, t, modulus);  /* x_0 = b_1^{2*k_2 + (2*m_1 + 1)*P} */
    2093         452 :   if (want_output && test_verbose (OUTPUT_TRACE))
    2094             :     {
    2095          12 :       mpres_get_z (t, x_0, modulus);
    2096          12 :       outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 == %Zd /* PARI C */\n", 
    2097             :                t);
    2098             :     }
    2099             :   
    2100         452 :   mpz_set_si (t, M);
    2101         452 :   mpres_pow (x_Mi, x_0, t, modulus); /* x_Mi = x_0^{M-i}, i = 0 */
    2102             : 
    2103         452 :   mpres_invert (x_0, x_0, modulus);  /* x_0 := x_0^{-1} now */
    2104         452 :   mpres_mul (r[1], r[1], x_0, modulus); /* r[1] = x_0^{-1} * r^{-2M+1} */
    2105             :   
    2106         452 :   mpres_mul (r[2], r[2], x_Mi, modulus); /* r[2] = x_0^M * r^{M^2} */
    2107         452 :   mpres_get_z (t, r[2], modulus);
    2108         452 :   outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd; /* PARI */\n", 
    2109             :            offset, t);
    2110         452 :   if (l > 0)
    2111             :     {
    2112         452 :       if (g_mpz != NULL)
    2113         127 :         mpz_set (g_mpz[offset], t);
    2114         452 :       if (g_ntt != NULL)
    2115         325 :         mpzspv_from_mpzv (g_ntt, offset, &t, 1UL, ntt_context);
    2116             :     }
    2117             : 
    2118             :   /* So here we have for i = 0
    2119             :      r[2] = x_0^(M-i) * r^{(M-i)^2}
    2120             :      r[1] = x_0^{-1} * r^{2(-M+i)+1}
    2121             :      r[0] = r^2
    2122             :      t = r[2]
    2123             :   */
    2124             : 
    2125     3656905 :   for (i = 1; i < l; i++)
    2126             :     {
    2127     3656453 :       if (g_mpz != NULL)
    2128             :         {
    2129      646538 :           mpres_mul_z_to_z (g_mpz[offset + i], r[1], g_mpz[offset + i - 1], 
    2130             :                             modulus);
    2131      646538 :           outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
    2132      646538 :                    " /* PARI */\n", offset + i, g_mpz[offset + i]);
    2133             :         }
    2134     3656453 :       if (g_ntt != NULL)
    2135             :       {
    2136     3009915 :           mpres_mul_z_to_z (t, r[1], t, modulus);
    2137     3009915 :           if (g_mpz == NULL) /* Only one should be non-NULL... */
    2138     3009915 :               outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
    2139             :                        " /* PARI */\n", offset + i, t);
    2140     3009915 :           mpzspv_from_mpzv (g_ntt, offset + i, &t, 1UL, ntt_context);
    2141             :       }
    2142     3656453 :       mpres_mul (r[1], r[1], r[0], modulus);
    2143             :     }
    2144             : 
    2145         452 :   mpres_clear (r[0], modulus);
    2146         452 :   mpres_clear (r[1], modulus);
    2147         452 :   mpres_clear (r[2], modulus);
    2148         452 :   mpres_clear (x_0, modulus);
    2149         452 :   mpres_clear (x_Mi, modulus);
    2150         452 :   mpz_clear (t);
    2151         452 :   mpmod_clear (modulus); /* Clear our private copy of modulus */
    2152             : 
    2153             : #ifdef _OPENMP
    2154             :   }
    2155             : #endif
    2156             : 
    2157         452 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    2158             :   
    2159         452 :   if (test_verbose (OUTPUT_TRACE))
    2160             :     {
    2161       43024 :       for (i = 0; i < l_param; i++)
    2162             :         {
    2163       43012 :           outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu == x_0^"
    2164             :                    "(M - %lu) * r^((M - %lu)^2) /* PARI C */\n", i, i, i);
    2165             :         }
    2166          12 :       outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g(x) = g_0");
    2167       43012 :       for (i = 1; i < l; i++)
    2168       43000 :         outputf (OUTPUT_TRACE, " + g_%lu * x^%lu", i, i);
    2169          12 :       outputf (OUTPUT_TRACE, " /* PARI */\n");
    2170             :     }
    2171         452 : }
    2172             : 
    2173             : 
    2174             : /* Compute h_j = r^(-j^2) * f_j for 0 <= j < d as described in section 9 
    2175             :    of the paper. h == f is ok. */
    2176             : 
    2177             : static void 
    2178         246 : pm1_sequence_h (listz_t h, mpzspv_t h_ntt, mpz_t *f, const mpres_t r, 
    2179             :                 const unsigned long d, mpmod_t modulus_parm, 
    2180             :                 const mpzspm_t ntt_context)
    2181             : {
    2182             :   mpres_t invr;  /* r^{-1}. Can be shared between threads */
    2183             :   long timestart, realstart;
    2184             : 
    2185         246 :   mpres_init (invr, modulus_parm);
    2186         246 :   mpres_invert (invr, r, modulus_parm); /* invr = r^{-1}. FIXME: test for 
    2187             :                                            failure, even if theoretically 
    2188             :                                            impossible */
    2189             : 
    2190         246 :   if (test_verbose (OUTPUT_TRACE))
    2191             :     {
    2192             :       mpz_t t;
    2193           5 :       mpz_init (t);
    2194           5 :       mpres_get_z (t, r, modulus_parm);
    2195           5 :       outputf (OUTPUT_TRACE, "\n/* pm1_sequence_h */ N = %Zd; "
    2196             :                "r = Mod(%Zd, N); /* PARI */\n", 
    2197           5 :                modulus_parm->orig_modulus, t);
    2198           5 :       mpz_clear (t);
    2199             :     }
    2200             : 
    2201         246 :   outputf (OUTPUT_VERBOSE, "Computing h");
    2202         246 :   timestart = cputime ();
    2203         246 :   realstart = realtime ();
    2204             : 
    2205             : #ifdef _OPENMP
    2206             : #pragma omp parallel if (d > 100)
    2207             : #endif
    2208             :   {
    2209             :     mpres_t fd[3]; /* finite differences table for r^{-i^2}*/
    2210             :     mpz_t t;       /* the h_j value as an mpz_t */
    2211             :     unsigned long j;
    2212         246 :     unsigned long offset = 0UL, len = d;
    2213             :     mpmod_t modulus;
    2214             : 
    2215             :     /* Adjust offset and length for this thread */
    2216             : #ifdef _OPENMP
    2217             :     {
    2218             :       const int nr_chunks = omp_get_num_threads();
    2219             :       const int thread_nr = omp_get_thread_num();
    2220             :       unsigned long chunklen;
    2221             :       
    2222             :       if (thread_nr == 0)
    2223             :         outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    2224             : 
    2225             :       /* chunklen = ceil (len / nr_chunks) */
    2226             :       chunklen = (len - 1UL) / (unsigned long) nr_chunks + 1UL;
    2227             :       offset = chunklen * (unsigned long) thread_nr;
    2228             :       if (offset <= len)
    2229             :         len = MIN(chunklen, len - offset);
    2230             :       else
    2231             :         len = 0;
    2232             :     }
    2233             : #endif
    2234             :     
    2235         246 :     mpmod_init_set (modulus, modulus_parm);
    2236         246 :     mpres_init (fd[0], modulus);
    2237         246 :     mpres_init (fd[1], modulus);
    2238         246 :     mpres_init (fd[2], modulus);
    2239         246 :     mpz_init (t);
    2240             :     
    2241             :     /* We have (n + 1)^2 = n^2 + 2n + 1. For the finite differences we'll 
    2242             :        need r^{-2}, r^{-(2n+1)}, r^{-n^2}. Init for n = 0. */
    2243             :     
    2244             :     /* r^{-2} in fd[0] is constant and could be shared. Computing it 
    2245             :        separately in each thread has the advantage of putting it in
    2246             :        local memory. May not make much difference overall */
    2247             : 
    2248         246 :     mpres_sqr (fd[0], invr, modulus);       /* fd[0] = r^{-2} */
    2249         246 :     mpz_set_ui (t, offset);
    2250         246 :     mpz_mul_2exp (t, t, 1UL);
    2251         246 :     mpz_add_ui (t, t, 1UL);                 /* t = 2 * offset + 1 */
    2252         246 :     mpres_pow (fd[1], invr, t, modulus);    /* fd[1] = r^{-(2*offset+1)} */
    2253         246 :     mpz_set_ui (t, offset);
    2254         246 :     mpz_mul (t, t, t);                      /* t = offset^2 */
    2255         246 :     mpres_pow (fd[2], invr, t, modulus);    /* fd[2] = r^{-offset^2} */
    2256             :     
    2257             :     /* Generate the sequence */
    2258      182533 :     for (j = offset; j < offset + len; j++)
    2259             :       {
    2260      182287 :         mpres_mul_z_to_z (t, fd[2], f[j], modulus);
    2261      182287 :         outputf (OUTPUT_TRACE, 
    2262             :                  "/* pm1_sequence_h */ h_%lu = %Zd; /* PARI */\n", j, t);
    2263             :         
    2264      182287 :         if (h != NULL)
    2265       41010 :           mpz_set (h[j], t);
    2266      182287 :         if (h_ntt != NULL)
    2267      141277 :           mpzspv_from_mpzv (h_ntt, j, &t, 1UL, ntt_context);
    2268             :         
    2269      182287 :         mpres_mul (fd[2], fd[2], fd[1], modulus); /* fd[2] = r^{-j^2} */
    2270      182287 :         mpres_mul (fd[1], fd[1], fd[0], modulus); /* fd[1] = r^{-2*j-1} */
    2271             :       }
    2272             :     
    2273         246 :     mpres_clear (fd[2], modulus);
    2274         246 :     mpres_clear (fd[1], modulus);
    2275         246 :     mpres_clear (fd[0], modulus);
    2276         246 :     mpz_clear (t);
    2277         246 :     mpmod_clear (modulus);
    2278             :   }
    2279             : 
    2280         246 :   mpres_clear (invr, modulus_parm);
    2281             : 
    2282         246 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    2283             : 
    2284         246 :   if (test_verbose (OUTPUT_TRACE))
    2285             :     {
    2286             :       unsigned long j;
    2287        4702 :       for (j = 0; j < d; j++)
    2288        4697 :         outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h_%lu == "
    2289             :                    "f_%lu * r^(-%lu^2) /* PARI C */\n", j, j, j);
    2290           5 :       outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h(x) = h_0");
    2291        4697 :       for (j = 1; j < d; j++)
    2292        4692 :         outputf (OUTPUT_TRACE, " + h_%lu * (x^%lu + x^(-%lu))", j, j, j);
    2293           5 :       outputf (OUTPUT_TRACE, " /* PARI */\n");
    2294             :     }
    2295         246 : }
    2296             : 
    2297             : 
    2298             : static int 
    2299         480 : make_S_1_S_2 (sets_long_t **S_1, set_long_t **S_2, 
    2300             :               const faststage2_param_t *params)
    2301             : {
    2302             :   unsigned long i;
    2303             :   sets_long_t *facS_2;
    2304             :   size_t facS_2_size;
    2305             : 
    2306         480 :   *S_1 = sets_get_factored_sorted (params->P);
    2307         480 :   if (*S_1 == NULL)
    2308           0 :     return ECM_ERROR;
    2309             : 
    2310             :   {
    2311             :     mpz_t t1, t2;
    2312             :     
    2313         480 :     mpz_init (t1);
    2314         480 :     mpz_init (t2);
    2315         480 :     sets_sumset_minmax (t1, *S_1, 1);
    2316         480 :     sets_max (t2, params->P);
    2317         480 :     ASSERT_ALWAYS (mpz_cmp (t1, t2) == 0);
    2318         480 :     mpz_clear (t1);
    2319         480 :     mpz_clear (t2);
    2320             :   }
    2321             : 
    2322         480 :   *S_2 = malloc (set_sizeof(params->s_2));
    2323         480 :   if (*S_2 == NULL)
    2324             :     {
    2325           0 :       free (*S_1);
    2326           0 :       return ECM_ERROR;
    2327             :     }
    2328             : 
    2329             :   /* Extract sets for S_2 and compute the set of sums */
    2330             :   
    2331         480 :   sets_extract (NULL, &facS_2_size, *S_1, params->s_2);
    2332         480 :   facS_2 = malloc (facS_2_size);
    2333         480 :   if (facS_2 == NULL)
    2334             :     {
    2335           0 :       free (*S_1);
    2336           0 :       free (*S_2);
    2337           0 :       return ECM_ERROR;
    2338             :     }
    2339         480 :   sets_extract (facS_2, NULL, *S_1, params->s_2);
    2340         480 :   sets_sumset (*S_2, facS_2);
    2341         480 :   ASSERT_ALWAYS ((*S_2)->card == params->s_2);
    2342         480 :   free (facS_2);
    2343         480 :   quicksort_long ((*S_2)->elem, (*S_2)->card);
    2344             :   
    2345             :   /* Print the sets in devverbose mode */
    2346         480 :   if (test_verbose (OUTPUT_DEVVERBOSE))
    2347             :     {
    2348          35 :       outputf (OUTPUT_DEVVERBOSE, "S_1 = ");
    2349          35 :       sets_print (OUTPUT_DEVVERBOSE, *S_1);
    2350             :       
    2351          35 :       outputf (OUTPUT_DEVVERBOSE, "S_2 = {");
    2352          89 :       for (i = 0UL; i + 1UL < params->s_2; i++)
    2353          54 :         outputf (OUTPUT_DEVVERBOSE, "%ld, ", (*S_2)->elem[i]);
    2354          35 :       if (i < params->s_2)
    2355          35 :         outputf (OUTPUT_DEVVERBOSE, "%ld", (*S_2)->elem[i]); 
    2356          35 :       outputf (OUTPUT_DEVVERBOSE, "}\n");
    2357             :     }
    2358             : 
    2359         480 :   return 0;
    2360             : }
    2361             : 
    2362             : #if 0
    2363             : static mpzspv_t *
    2364             : mpzspv_init_mt (spv_size_t len, mpzspm_t mpzspm)
    2365             : {
    2366             :   int i; /* OpenMP wants the iteration variable a signed type */
    2367             :   mpzspv_t *x = (mpzspv_t *) malloc (mpzspm->sp_num * sizeof (spv_t *));
    2368             :   
    2369             :   if (x == NULL)
    2370             :     return NULL;
    2371             : 
    2372             :   for (i = 0; i < (int) mpzspm->sp_num; i++)
    2373             :     x[i] = NULL;
    2374             :   
    2375             : #ifdef _OPENMP
    2376             : #pragma omp parallel private(i) shared(x)
    2377             :   {
    2378             : #pragma omp for
    2379             : #endif
    2380             :     for (i = 0; i < (int) mpzspm->sp_num; i++)
    2381             :       x[i] = (spv_t *) sp_aligned_malloc (len * sizeof (sp_t));
    2382             :         
    2383             : #ifdef _OPENMP
    2384             :   }
    2385             : #endif
    2386             : 
    2387             :   for (i = 0; i < (int) mpzspm->sp_num; i++)
    2388             :     if (x[i] == NULL)
    2389             :       break;
    2390             : 
    2391             :   if (i != (int) mpzspm->sp_num) /* There is a NULL pointer */
    2392             :     {
    2393             :       for (i = 0; i < (int) mpzspm->sp_num; i++)
    2394             :         if (x[i] != NULL)
    2395             :           sp_aligned_free(x[i]);
    2396             :       return NULL;
    2397             :     }
    2398             : 
    2399             : #if 0
    2400             :   if (test_verbose (OUTPUT_DEVVERBOSE))
    2401             :     {
    2402             :       spv_t * last = x[0];
    2403             :       printf ("mpzspv_init_mt: x[0] = %p\n", x[0]);
    2404             :       for (i = 1; i < (int) mpzspm->sp_num; i++)
    2405             :         printf ("mpzspv_init_mt: x[%d] = %p, distance = %ld\n", 
    2406             :                 i, x[i], (long) (x[i] - x[i-1]));
    2407             :     }
    2408             : #endif
    2409             : 
    2410             :   return x;
    2411             : }
    2412             : #endif
    2413             : 
    2414             : /* Square the reciprocal Laurent polynomial S(x) of degree 2*n-2.
    2415             :    S(x) = s_0 + \sum_{i=1}^{n-1} s_i (x^i + x^{-1}).
    2416             :    S[i] contains the n coefficients s_i, 0 <= i <= n-1.
    2417             :    R[i] will contain the 2n-1 coefficients r_i, 0 <= i <= 2*n-2, where 
    2418             :    R(x) = S(x)^2 = r_0 + \sum_{i=1}^{2n-2} r_i (x^i + x^{-1}).
    2419             :    dft must have power of 2 length len >= 2n.
    2420             :    The NTT primes must be == 1 (mod 3*len).
    2421             : */
    2422             : 
    2423             : #undef TRACE_ntt_sqr_reciprocal
    2424             : static void
    2425        4350 : ntt_sqr_reciprocal (mpzv_t R, const mpzv_t S, mpzspv_t dft, 
    2426             :                     const spv_size_t n, const mpzspm_t ntt_context)
    2427             : {
    2428             : #ifdef WANT_ASSERT
    2429             :   mpz_t S_eval_1, R_eval_1;
    2430             : #endif
    2431             :   
    2432        4350 :   if (n == 0)
    2433           0 :     return;
    2434             : 
    2435        4350 :   if (n == 1)
    2436             :     {
    2437         691 :       mpz_mul (R[0], S[0], S[0]);
    2438         691 :       mpz_mod (R[0], R[0], ntt_context->modulus);
    2439         691 :       return;
    2440             :     }
    2441             : 
    2442             : #ifdef WANT_ASSERT
    2443             :   mpz_init (S_eval_1);
    2444             :   list_recip_eval1 (S_eval_1, S, n);
    2445             :   /* Compute (S(1))^2 */
    2446             :   mpz_mul (S_eval_1, S_eval_1, S_eval_1);
    2447             :   mpz_mod (S_eval_1, S_eval_1, ntt_context->modulus);
    2448             : #endif
    2449             : 
    2450             : #ifdef TRACE_ntt_sqr_reciprocal
    2451             :   printf ("ntt_sqr_reciprocal: n %lu, length %lu\n", n, len);
    2452             :   gmp_printf ("Input polynomial is %Zd", S[0]);
    2453             :   { 
    2454             :     int j;
    2455             :     for (j = 1; (spv_size_t) j < n; j++)
    2456             :       gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", S[j], j, j);
    2457             :   }
    2458             :   printf ("\n");
    2459             : #endif
    2460             : 
    2461             :   /* Fill NTT elements [0 .. n-1] with coefficients */
    2462        3659 :   mpzspv_from_mpzv (dft, (spv_size_t) 0, S, n, ntt_context);
    2463        3659 :   mpzspv_sqr_reciprocal (dft, n, ntt_context);
    2464             :   
    2465             : #if defined(_OPENMP)
    2466             : #pragma omp parallel if (n > 50)
    2467             : #endif
    2468             :   {
    2469        3659 :     spv_size_t i, offset = 0, chunklen = 2*n - 1;
    2470             : 
    2471             : #if defined(_OPENMP)
    2472             :     {
    2473             :       const int nr_chunks = omp_get_num_threads();
    2474             :       const int thread_nr = omp_get_thread_num();
    2475             :       
    2476             :       chunklen = (chunklen - 1) / (spv_size_t) nr_chunks + 1;
    2477             :       offset = (spv_size_t) thread_nr * chunklen;
    2478             :       if (offset <= 2*n - 1)
    2479             :         chunklen = MIN(chunklen, (2*n - 1) - offset);
    2480             :       else
    2481             :         chunklen = 0UL;
    2482             :     }
    2483             : #endif
    2484             :     
    2485        3659 :     mpzspv_to_mpzv (dft, offset, R + offset, chunklen, ntt_context);
    2486     6045296 :     for (i = offset; i < offset + chunklen; i++)
    2487     6041637 :       mpz_mod (R[i], R[i], ntt_context->modulus);
    2488             :   }
    2489             : 
    2490             : #ifdef TRACE_ntt_sqr_reciprocal
    2491             :   gmp_printf ("ntt_sqr_reciprocal: Output polynomial is %Zd", R[0]);
    2492             :   for (j = 1; (spv_size_t) j < 2*n - 1; j++)
    2493             :     gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
    2494             :   printf ("\n");
    2495             : #endif
    2496             : 
    2497             : #ifdef WANT_ASSERT
    2498             :   mpz_init (R_eval_1);
    2499             :   /* Compute (S^2)(1) and compare to (S(1))^2 */
    2500             :   list_recip_eval1 (R_eval_1, R, 2 * n - 1);
    2501             :   mpz_mod (R_eval_1, R_eval_1, ntt_context->modulus);
    2502             :   if (mpz_cmp (R_eval_1, S_eval_1) != 0)
    2503             :     {
    2504             :       gmp_fprintf (stderr, "ntt_sqr_reciprocal: (S(1))^2 = %Zd but "
    2505             :                    "(S^2)(1) = %Zd\n", S_eval_1, R_eval_1);
    2506             : #if 0
    2507             :       gmp_printf ("Output polynomial is %Zd", R[0]);
    2508             :       for (j = 1; (spv_size_t) j < 2*n - 1; j++)
    2509             :         gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
    2510             :       printf ("\n");
    2511             : #endif
    2512             :       abort ();
    2513             :     }
    2514             :   mpz_clear (S_eval_1);
    2515             :   mpz_clear (R_eval_1);
    2516             : #endif
    2517             : }
    2518             : 
    2519             : 
    2520             : /* Computes gcd(\prod_{0 <= i < len} (ntt[i + offset] + add[i]), N), 
    2521             :    the NTT residues are converted to integer residues (mod N) first.
    2522             :    If add == NULL, add[i] is assumed to be 0. */
    2523             : 
    2524             : static void
    2525         596 : ntt_gcd (mpz_t f, mpz_t *product, mpzspv_t ntt, const unsigned long ntt_offset,
    2526             :          const listz_t add, const unsigned long len_param, 
    2527             :          const mpzspm_t ntt_context, mpmod_t modulus_param)
    2528             : {
    2529             :   unsigned long i, j;
    2530         596 :   const unsigned long Rlen = MPZSPV_NORMALISE_STRIDE;
    2531             :   listz_t R;
    2532         596 :   unsigned long len = len_param, thread_offset = 0;
    2533             :   mpres_t tmpres, tmpprod, totalprod;
    2534             :   mpmod_t modulus;
    2535             :   long timestart, realstart;
    2536             :   
    2537         596 :   outputf (OUTPUT_VERBOSE, "Computing gcd of coefficients and N");
    2538         596 :   timestart = cputime ();
    2539         596 :   realstart = realtime ();
    2540             : 
    2541             :   /* All the threads will multiply their partial products to this one. */
    2542         596 :   mpres_init (totalprod, modulus_param);
    2543         596 :   mpres_set_ui (totalprod, 1UL, modulus_param);
    2544             : 
    2545             : #ifdef _OPENMP
    2546             : #pragma omp parallel if (len > 100) private(i, j, R, len, thread_offset, tmpres, tmpprod, modulus) shared(totalprod)
    2547             :   {
    2548             :     const int nr_chunks = omp_get_num_threads();
    2549             :     const int thread_nr = omp_get_thread_num();
    2550             : 
    2551             :     len = (len_param - 1) / nr_chunks + 1;
    2552             :     thread_offset = thread_nr * len;
    2553             :     if (thread_offset <= len_param)
    2554             :       len = MIN(len, len_param - thread_offset);
    2555             :     else
    2556             :       len = 0;
    2557             : #pragma omp master
    2558             :     {
    2559             :       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    2560             :     }
    2561             : #endif
    2562             : 
    2563             :     /* Make a private copy of the mpmod_t struct */
    2564         596 :     mpmod_init_set (modulus, modulus_param);
    2565             : 
    2566         596 :     R = init_list2 (Rlen, (mpz_size (modulus->orig_modulus) + 2) * 
    2567             :                            GMP_NUMB_BITS);
    2568         596 :     mpres_init (tmpres, modulus);
    2569         596 :     mpres_init (tmpprod, modulus);
    2570         596 :     mpres_set_ui (tmpprod, 1UL, modulus);
    2571             :     
    2572       46665 :     for (i = 0; i < len; i += Rlen)
    2573             :       {
    2574       46069 :         const unsigned long blocklen = MIN(len - i, Rlen);
    2575             : 
    2576             :         /* Convert blocklen residues from NTT to integer representatives
    2577             :            and store them in R */
    2578       46069 :         mpzspv_to_mpzv (ntt, ntt_offset + thread_offset + i, R, blocklen, 
    2579             :                         ntt_context);
    2580             : 
    2581             :         /* Accumulate product in tmpprod */
    2582     5894909 :         for (j = 0; j < blocklen; j++)
    2583             :           {
    2584     5848840 :             outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[j]);
    2585     5848840 :             if (add != NULL)
    2586      183680 :               mpz_add (R[j], R[j], add[i + thread_offset + j]);
    2587     5848840 :             mpres_set_z_for_gcd (tmpres, R[j], modulus);
    2588             : #define TEST_ZERO_RESULT
    2589             : #ifdef TEST_ZERO_RESULT
    2590     5848840 :             if (mpres_is_zero (tmpres, modulus))
    2591         265 :               outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
    2592             : #endif
    2593     5848840 :             mpres_mul (tmpprod, tmpprod, tmpres, modulus); 
    2594             :           }
    2595             :       }
    2596             : #ifdef _OPENMP
    2597             : #pragma omp critical
    2598             :     {
    2599             :       mpres_mul (totalprod, totalprod, tmpprod, modulus);
    2600             :     }
    2601             : #else
    2602         596 :     mpres_set (totalprod, tmpprod, modulus);
    2603             : #endif
    2604         596 :     mpres_clear (tmpres, modulus);
    2605         596 :     mpres_clear (tmpprod, modulus);
    2606         596 :     mpmod_clear (modulus);
    2607         596 :     clear_list (R, Rlen);
    2608             : #ifdef _OPENMP
    2609             :   }
    2610             : #endif
    2611             : 
    2612             :   {
    2613             :     mpz_t n;
    2614         596 :     mpz_init (n);
    2615         596 :     mpz_set_ui (n, len_param);
    2616         596 :     mpres_set_z_for_gcd_fix (totalprod, totalprod, n, modulus_param);
    2617         596 :     mpz_clear (n);
    2618             :   }
    2619             : 
    2620         596 :   if (product != NULL)
    2621          53 :     mpres_get_z (*product, totalprod, modulus_param);
    2622             : 
    2623         596 :   mpres_gcd (f, totalprod, modulus_param);
    2624         596 :   mpres_clear (totalprod, modulus_param);
    2625             : 
    2626         596 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    2627         596 : }
    2628             : 
    2629             : 
    2630             : int 
    2631          59 : pm1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus, 
    2632             :         const faststage2_param_t *params)
    2633             : {
    2634             :   unsigned long phiP, nr;
    2635             :   unsigned long i, l, lenF, lenG, lenR, tmplen;
    2636             :   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic 
    2637             :                        progressions of prime length */
    2638             :   set_long_t *S_2; /* This is stored as a regular set */
    2639             :   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has 
    2640             :                   degree s_1. It is symmetric, so has only s_1 / 2 + 1 
    2641             :                   distinct coefficients. The sequence h_j will be stored in 
    2642             :                   the same memory and won't be a monic polynomial, so the 
    2643             :                   leading 1 monomial of F will be stored explicitly. Hence we 
    2644             :                   need s_1 / 2 + 1 entries. */
    2645             :   listz_t g, h, tmp, R;
    2646             :   mpz_t mt;   /* All-purpose temp mpz_t */
    2647             :   mpres_t mr; /* All-purpose temp mpres_t */
    2648          59 :   int youpi = ECM_NO_FACTOR_FOUND;
    2649             :   long timetotalstart, realtotalstart, timestart;
    2650             : 
    2651          59 :   timetotalstart = cputime ();
    2652          59 :   realtotalstart = realtime ();
    2653             : 
    2654          59 :   phiP = eulerphi (params->P);
    2655          59 :   ASSERT_ALWAYS (phiP == params->s_1 * params->s_2);
    2656          59 :   ASSERT_ALWAYS (params->s_1 < params->l);
    2657          59 :   nr = params->l - params->s_1; /* Number of points we evaluate */
    2658             : 
    2659          59 :   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
    2660           0 :       return ECM_ERROR;
    2661             : 
    2662             :   /* Allocate all the memory we'll need */
    2663             :   /* Allocate the correct amount of space for each mpz_t or the 
    2664             :      reallocations will up to double the time for stage 2! */
    2665          59 :   mpz_init (mt);
    2666          59 :   mpres_init (mr, modulus);
    2667          59 :   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
    2668             :                                      the leading 1 monomial for each factor */
    2669          59 :   F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
    2670          59 :   h = malloc ((params->s_1 + 1) * sizeof (mpz_t));
    2671          59 :   if (h == NULL)
    2672             :     {
    2673           0 :       fprintf (stderr, "Cannot allocate memory in pm1fs2\n");
    2674           0 :       exit (1);
    2675             :     }
    2676          59 :   lenG = params->l;
    2677          59 :   g = init_list2 (lenG, (unsigned int) abs (modulus->bits));
    2678          59 :   lenR = nr;
    2679          59 :   R = init_list2 (lenR, (unsigned int) abs (modulus->bits));    
    2680          59 :   tmplen = 3UL * params->l + list_mul_mem (params->l / 2);
    2681          59 :   outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
    2682          59 :   if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
    2683             :     {
    2684           2 :       tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
    2685             :       /* FIXME: It appears TMulGen_space() returns a too small value! */
    2686           2 :       outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n", 
    2687             :                tmplen);
    2688             :     }
    2689             : #ifdef SHOW_TMP_USAGE
    2690             :   tmp = init_list (tmplen);
    2691             : #else
    2692          59 :   tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
    2693             : #endif
    2694             :   
    2695          59 :   mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
    2696          59 :   outputf (OUTPUT_TRACE, 
    2697             :            "N = %Zd; X = Mod(%Zd, N); /* PARI */\n", 
    2698          59 :            modulus->orig_modulus, mt);
    2699             : 
    2700             : 
    2701             :   /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2k_1}) */
    2702          59 :   outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
    2703             :   
    2704          59 :   timestart = cputime ();
    2705             :   
    2706             :   /* First compute X + 1/X */
    2707          59 :   mpres_invert (mr, X, modulus);
    2708          59 :   mpres_add (mr, mr, X, modulus);
    2709             : 
    2710          59 :   i = poly_from_sets_V (F, mr, S_1, tmp, tmplen, modulus, NULL, NULL);
    2711          59 :   ASSERT_ALWAYS(2 * i == params->s_1);
    2712             :   ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
    2713          59 :   free (S_1);
    2714          59 :   S_1 = NULL;
    2715             :   
    2716          59 :   outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
    2717          59 :   if (test_verbose (OUTPUT_TRACE))
    2718             :     {
    2719         662 :       for (i = 0; i < params->s_1 / 2 + 1; i++)
    2720         661 :         outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
    2721           1 :       outputf (OUTPUT_TRACE, "f(x) = f_0");
    2722         661 :       for (i = 1; i < params->s_1 / 2 + 1; i++)
    2723         660 :         outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
    2724           1 :       outputf (OUTPUT_TRACE, "/* PARI */ \n");
    2725             :     }
    2726             :   
    2727          59 :   mpz_set_ui (mt, params->P);
    2728          59 :   mpres_pow (mr, X, mt, modulus); /* mr = X^P */
    2729          59 :   pm1_sequence_h (F, NULL, F, mr, params->s_1 / 2 + 1, modulus, NULL); 
    2730             : 
    2731             :   /* Make a symmetric copy of F in h. It will have length 
    2732             :      s_1 + 1 = 2*lenF - 1 */
    2733             :   /* I.e. with F = [3, 2, 1], s_1 = 4, we want h = [1, 2, 3, 2, 1] */
    2734       41069 :   for (i = 0; i < params->s_1 / 2 + 1; i++)
    2735       41010 :     *(h[i]) = *(F[params->s_1 / 2 - i]); /* Clone the mpz_t. */
    2736       41010 :   for (i = 0; i < params->s_1 / 2; i++)
    2737       40951 :     *(h[i + params->s_1 / 2 + 1]) = *(F[i + 1]);
    2738          59 :   if (test_verbose (OUTPUT_TRACE))
    2739             :     {
    2740        1322 :       for (i = 0; i < params->s_1 + 1; i++)
    2741        1321 :         outputf (OUTPUT_VERBOSE, "h_%lu = %Zd; /* PARI */\n", i, h[i]);
    2742           1 :       outputf (OUTPUT_VERBOSE, "h(x) = h_0");
    2743        1321 :       for (i = 1; i < params->s_1 + 1; i++)
    2744        1320 :         outputf (OUTPUT_VERBOSE, " + h_%lu * x^%lu", i, i);
    2745           1 :       outputf (OUTPUT_VERBOSE, " /* PARI */\n");
    2746             :     }
    2747             : 
    2748         146 :   for (l = 0; l < params->s_2; l++)
    2749             :     {
    2750         127 :       const unsigned long M = params->l - 1L - params->s_1 / 2L;
    2751         127 :       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n", 
    2752         127 :                l + 1, params->s_2);
    2753         127 :       pm1_sequence_g (g, NULL, X, params->P, M, params->l, 
    2754         127 :                       params->m_1, S_2->elem[l], modulus, NULL);
    2755             : 
    2756             :       /* Do the convolution */
    2757             :       /* Use the transposed "Middle Product" algorithm */
    2758             :       /* TMulGen reverses the first input sequence, but that doesn't matter
    2759             :          since h is symmetric. */
    2760             : 
    2761         127 :       outputf (OUTPUT_VERBOSE, "TMulGen of g and h");
    2762         127 :       timestart = cputime ();
    2763             :       ASSERT(tmplen >= TMulGen_space (nr - 1, params->l - 1, params->s_1));
    2764             : 
    2765             :       /* Computes rev(h)*g, stores coefficients of x^(s_1) to 
    2766             :          x^(s_1+nr-1) = x^(len-1) */
    2767         127 :       if (TMulGen (R, nr - 1, h, params->s_1, g, params->l - 1, tmp, 
    2768         127 :                    modulus->orig_modulus) < 0)
    2769             :         {
    2770           0 :           outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
    2771             :                    "of memory)\n");
    2772           0 :           youpi = ECM_ERROR;
    2773           0 :           break;
    2774             :         }
    2775         127 :       list_mod (R, R, nr, modulus->orig_modulus);
    2776             : 
    2777         127 :       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
    2778             : 
    2779             : #if 0 && defined(WANT_ASSERT)
    2780             : 
    2781             :       /* See if R[i] is correct, with a test that works even if i0 != 0 */
    2782             :       /* More expensive self-test */
    2783             :       /* alpha = beta*(i0 + l*nr) */
    2784             :       /* This code is old and probably does not work. */
    2785             : 
    2786             :       outputf (OUTPUT_VERBOSE, "Verifying all results (slow)");
    2787             :       for (i = 0; i < nr; i++)
    2788             :         {
    2789             :           mpz_set_ui (mt, nr * l);
    2790             :           mpz_add (mt, mt, root_params->i0);
    2791             :           mpz_add_ui (mt, mt, i);
    2792             :           mpz_mul_ui (mt, mt, beta);
    2793             :           mpres_get_z (tmp[0], X, modulus);
    2794             :           mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus);
    2795             :           /* Hence, tmp[0] = X^(alpha + i * beta) */
    2796             :           list_eval_poly (tmp[1], F, tmp[0], dF, 1, modulus->orig_modulus, 
    2797             :                           tmp + 2);
    2798             : 
    2799             :           mpz_set_ui (mt, i);
    2800             :           mpz_mul_ui (mt, mt, i);
    2801             :           mpz_mul_ui (mt, mt, beta / 2); /* h(i) = beta*i^2/2 */
    2802             :           mpres_get_z (tmp[0], X, modulus);
    2803             :           mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus); /* X^h(1) */
    2804             :           mpz_mul (tmp[0], tmp[0], R[i]);
    2805             :           mpz_mod (tmp[0], tmp[0], modulus->orig_modulus);
    2806             :           if (mpz_cmp (tmp[0], tmp[1]) != 0)
    2807             :             {
    2808             :               outputf (OUTPUT_ERROR, "Result in R[%ld] incorrect.\n", i);
    2809             :               outputf (OUTPUT_ERROR, "R[%ld] = %Zd\n", i, R[i]);
    2810             :               abort ();
    2811             :             }
    2812             :         }
    2813             :       outputf (OUTPUT_VERBOSE, " - everything's correct! :-D\n");
    2814             : #endif
    2815             : 
    2816         127 :       if (test_verbose (OUTPUT_TRACE))
    2817             :         {
    2818        4968 :           for (i = 0; i < nr; i++)
    2819        4964 :             outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[i]);
    2820             :         }
    2821             : 
    2822         127 :       outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)");
    2823         127 :       timestart = cputime ();
    2824             : 
    2825             :       {
    2826             :         mpres_t tmpres, tmpprod;
    2827         127 :         mpres_init (tmpres, modulus);
    2828         127 :         mpres_init (tmpprod, modulus);
    2829         127 :         mpres_set_z_for_gcd (tmpprod, R[0], modulus);
    2830      493155 :         for (i = 1; i < nr; i++)
    2831             :           {
    2832      493028 :             mpres_set_z_for_gcd (tmpres, R[i], modulus);
    2833      493028 :             mpres_mul (tmpprod, tmpprod, tmpres, modulus); 
    2834             :           }
    2835         127 :         mpres_get_z (tmp[1], tmpprod, modulus); /* For printing */
    2836         127 :         mpres_gcd (tmp[0], tmpprod, modulus);
    2837         127 :         mpres_clear (tmpprod, modulus);
    2838         127 :         mpres_clear (tmpres, modulus);
    2839             :       }
    2840             : 
    2841         127 :       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
    2842         127 :       outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", tmp[1]);
    2843             : 
    2844         127 :       if (mpz_cmp_ui (tmp[0], 1UL) > 0)
    2845             :         {
    2846          40 :           mpz_set (f, tmp[0]);
    2847          40 :           youpi = ECM_FACTOR_FOUND_STEP2;
    2848          40 :           break;
    2849             :         }
    2850             :     }
    2851             : 
    2852             : #ifdef SHOW_TMP_USAGE
    2853             :   for (i = tmplen - 1; i > 0; i--)
    2854             :     if (tmp[i]->_mp_alloc > 1)
    2855             :       break;
    2856             :   outputf (OUTPUT_DEVVERBOSE, "Highest used temp element is tmp[%lu]\n", i);
    2857             : #endif
    2858             :   
    2859          59 :   free (S_2);
    2860          59 :   free (h);
    2861          59 :   clear_list (F, lenF);
    2862          59 :   clear_list (g, lenG);
    2863          59 :   clear_list (R, lenR);    
    2864          59 :   clear_list (tmp, tmplen);
    2865             : 
    2866          59 :   mpz_clear (mt);
    2867          59 :   mpres_clear (mr, modulus);
    2868             : 
    2869          59 :   outputf (OUTPUT_NORMAL, "Step 2");
    2870             :   /* In normal output mode, print only cpu time as we always have.
    2871             :      In verbose mode, print real time as well if we used multi-threading */
    2872          59 :   if (test_verbose (OUTPUT_VERBOSE))
    2873          16 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
    2874             :   else
    2875          43 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
    2876             :   
    2877          59 :   return youpi;
    2878             : }
    2879             : 
    2880             : 
    2881             : int 
    2882         187 : pm1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus, 
    2883             :         const faststage2_param_t *params)
    2884             : {
    2885             :   unsigned long nr;
    2886             :   unsigned long l, lenF;
    2887             :   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic 
    2888             :                        progressions of prime length) */
    2889             :   set_long_t *S_2; /* This is stored as a regular set */
    2890             :   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has 
    2891             :                   degree s_1. It is symmetric, so has only s_1 / 2 + 1 
    2892             :                   distinct coefficients. The sequence h_j will be stored in 
    2893             :                   the same memory and won't be a monic polynomial, so the 
    2894             :                   leading 1 monomial of F will be stored explicitly. Hence we 
    2895             :                   need s_1 / 2 + 1 entries. */
    2896             :   mpzspm_t ntt_context;
    2897             :   mpzspv_t g_ntt, h_ntt;
    2898             :   mpz_t mt;   /* All-purpose temp mpz_t */
    2899             :   mpz_t product; /* Product of each multi-point evaluation */
    2900         187 :   mpz_t *product_ptr = NULL;
    2901             :   mpres_t tmpres; /* All-purpose temp mpres_t */
    2902         187 :   int youpi = ECM_NO_FACTOR_FOUND;
    2903             :   long timetotalstart, realtotalstart, timestart, realstart;
    2904             : 
    2905         187 :   timetotalstart = cputime ();
    2906         187 :   realtotalstart = realtime ();
    2907             : 
    2908         187 :   ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
    2909         187 :   ASSERT_ALWAYS (params->s_1 < params->l);
    2910         187 :   nr = params->l - params->s_1; /* Number of points we evaluate */
    2911             : 
    2912             :   /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution 
    2913             :      with g. We need NTT of transform length l. We do it here at the start
    2914             :      of stage 2 so that in case of a "not enough primes" condition, 
    2915             :      we don't have to wait until after F is built to get the error. */
    2916             : 
    2917         187 :   ntt_context = mpzspm_init (params->l, modulus->orig_modulus);
    2918         187 :   if (ntt_context == NULL)
    2919             :     {
    2920           0 :       outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
    2921             :                "presumably out of memory\n");
    2922           0 :       return ECM_ERROR;
    2923             :     }
    2924             : 
    2925         187 :   print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ", 
    2926             :                     ntt_context);
    2927             : 
    2928         187 :   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
    2929           0 :       return ECM_ERROR;
    2930             : 
    2931             :   /* Allocate all the memory we'll need for building f */
    2932         187 :   mpz_init (mt);
    2933         187 :   mpres_init (tmpres, modulus);
    2934         187 :   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
    2935             :                                      the leading 1 monomial for each factor */
    2936             :   {
    2937             :     /* in some cases the above value of lenF is not enough, for example with
    2938             :        s_1 = 10, which gives lenF = 7, but 9 entries are needed */
    2939         187 :     unsigned long mem = mem_poly_from_sets_V (S_1);
    2940         187 :     if (mem > lenF)
    2941          20 :       lenF = mem;
    2942             :   }
    2943             : 
    2944         187 :   F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
    2945             : 
    2946         187 :   mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
    2947         187 :   outputf (OUTPUT_TRACE, 
    2948             :            "N = %Zd; X = Mod(%Zd, N); /* PARI */\n", 
    2949         187 :            modulus->orig_modulus, mt);
    2950             : 
    2951             : #if 0 && defined (WANT_ASSERT)
    2952             :   /* For this self test run with a large enough B2 so that enough memory
    2953             :      is allocated for tmp and F_ntt, otherwise it segfaults. */
    2954             :   {
    2955             :     int testlen = 255;
    2956             :     int i, j;
    2957             :     /* A test of ntt_sqr_reciprocal() */
    2958             :     for (j = 1; j <= testlen; j++)
    2959             :       {
    2960             :         outputf (OUTPUT_VERBOSE, 
    2961             :                  "Testing ntt_sqr_reciprocal() for input degree %d\n", 
    2962             :                  j - 1);
    2963             :         for (i = 0; i < j; i++)
    2964             :           mpz_set_ui (tmp[i], 1UL);
    2965             :         ntt_sqr_reciprocal (tmp, tmp, F_ntt, (spv_size_t) j, ntt_context_F);
    2966             :         for (i = 0; i < 2 * j - 1; i++)
    2967             :           {
    2968             :             ASSERT (mpz_cmp_ui (tmp[i], 2 * j - 1 - i) == 0);
    2969             :           }
    2970             :       }
    2971             :     outputf (OUTPUT_VERBOSE, 
    2972             :              "Test of ntt_sqr_reciprocal() for input degree 2 ... %d passed\n", 
    2973             :              testlen - 1);
    2974             :   }
    2975             : #endif
    2976             : 
    2977             : 
    2978             :   /* First compute X + 1/X */
    2979         187 :   mpres_invert (tmpres, X, modulus);
    2980         187 :   mpres_add (tmpres, tmpres, X, modulus);
    2981             : 
    2982         187 :   if (build_F_ntt (F, tmpres, S_1, params, modulus) == ECM_ERROR)
    2983             :     {
    2984           0 :       free (S_1);
    2985           0 :       free (S_2);
    2986           0 :       mpz_clear (mt);
    2987           0 :       mpres_clear (tmpres, modulus);
    2988           0 :       mpzspm_clear (ntt_context);
    2989           0 :       clear_list (F, lenF);
    2990           0 :       return ECM_ERROR;
    2991             :     }
    2992             : 
    2993         187 :   free (S_1);
    2994         187 :   S_1 = NULL;
    2995             :   
    2996         187 :   h_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
    2997             : 
    2998         187 :   mpz_set_ui (mt, params->P);
    2999         187 :   mpres_pow (tmpres, X, mt, modulus); /* tmpres = X^P */
    3000         187 :   pm1_sequence_h (NULL, h_ntt, F, tmpres, params->s_1 / 2 + 1, modulus, 
    3001             :                   ntt_context);
    3002             : 
    3003         187 :   clear_list (F, lenF);
    3004         187 :   g_ntt = mpzspv_init (params->l, ntt_context);
    3005             : 
    3006             :   /* Compute the DCT-I of h */
    3007         187 :   outputf (OUTPUT_VERBOSE, "Computing DCT-I of h");
    3008             : #ifdef _OPENMP
    3009             :   outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    3010             : #endif
    3011         187 :   timestart = cputime ();
    3012         187 :   realstart = realtime ();
    3013             :   
    3014         187 :   mpzspv_to_dct1 (h_ntt, h_ntt, params->s_1 / 2 + 1, params->l / 2 + 1, 
    3015             :                   g_ntt, ntt_context);
    3016         187 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    3017             :   
    3018         187 :   if (test_verbose (OUTPUT_RESVERBOSE))
    3019             :     {
    3020          17 :       mpz_init (product);
    3021          17 :       product_ptr = &product;
    3022             :     }
    3023             : 
    3024         371 :   for (l = 0; l < params->s_2; l++)
    3025             :     {
    3026         325 :       const unsigned long M = params->l - 1L - params->s_1 / 2L;
    3027             : 
    3028         325 :       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n", 
    3029         325 :                l + 1, params->s_2);
    3030             :       /* Compute the coefficients of the polynomial g(x) */
    3031         325 :       pm1_sequence_g (NULL, g_ntt, X, params->P, M, params->l, 
    3032         325 :                       params->m_1, S_2->elem[l], modulus, ntt_context);
    3033             : 
    3034             :       /* Do the convolution */
    3035         325 :       outputf (OUTPUT_VERBOSE, "Computing g*h");
    3036             : #ifdef _OPENMP
    3037             :       outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    3038             : #endif
    3039         325 :       timestart = cputime ();
    3040         325 :       realstart = realtime ();
    3041         325 :       mpzspv_mul_by_dct (g_ntt, h_ntt, params->l, ntt_context, 
    3042             :         NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
    3043         325 :       print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    3044             :       
    3045             :       /* Compute GCD of N and coefficients of product polynomial */
    3046         325 :       ntt_gcd (mt, product_ptr, g_ntt, params->s_1 / 2, NULL, nr, ntt_context, 
    3047             :                modulus);
    3048             : 
    3049         325 :       outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
    3050             : 
    3051             :       /* If we found a factor, stop */
    3052         325 :       if (mpz_cmp_ui (mt, 1UL) > 0)
    3053             :         {
    3054         141 :           mpz_set (f, mt);
    3055         141 :           youpi = ECM_FACTOR_FOUND_STEP2;
    3056         141 :           break;
    3057             :         }
    3058             :     }
    3059             : 
    3060         187 :   if (test_verbose (OUTPUT_RESVERBOSE))
    3061             :     {
    3062          17 :       product_ptr = NULL;
    3063          17 :       mpz_clear (product);
    3064             :     }
    3065         187 :   mpzspv_clear (g_ntt, ntt_context);
    3066         187 :   mpzspv_clear (h_ntt, ntt_context);
    3067         187 :   mpzspm_clear (ntt_context);
    3068         187 :   mpres_clear (tmpres, modulus);
    3069         187 :   mpz_clear (mt);
    3070         187 :   free (S_2);
    3071             : 
    3072         187 :   outputf (OUTPUT_NORMAL, "Step 2");
    3073             :   /* In normal output mode, print only cpu time as we always have.
    3074             :      In verbose mode, print real time as well if we used multi-threading */
    3075         187 :   if (test_verbose (OUTPUT_VERBOSE))
    3076          29 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
    3077             :   else
    3078         158 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
    3079             :   
    3080         187 :   return youpi;
    3081             : }
    3082             : 
    3083             : 
    3084             : static void 
    3085          80 : gfp_ext_print (const mpres_t r_x, const mpres_t r_y, mpmod_t modulus, 
    3086             :                const int verbose)
    3087             : {
    3088             :   mpz_t t1, t2;
    3089             : 
    3090          80 :   if (!test_verbose (verbose))
    3091           0 :     return;
    3092             : 
    3093          80 :   mpz_init (t1);
    3094          80 :   mpz_init (t2);
    3095          80 :   mpres_get_z (t1, r_x, modulus);
    3096          80 :   mpres_get_z (t2, r_y, modulus);
    3097          80 :   outputf (verbose, "Mod(%Zd, N) + Mod(%Zd, N) * w", t1, t2);
    3098             :   
    3099          80 :   mpz_clear (t1);
    3100          80 :   mpz_clear (t2);
    3101             : }
    3102             : 
    3103             : 
    3104             : 
    3105             : /* Multiplies (a_0 + a_1*sqrt(Delta)) * (b_0 + b_1*sqrt(Delta))
    3106             :    using four multiplications. Result goes in (r_0 + r_1*sqrt(Delta)). 
    3107             :    a_0, b_0, r_0 as well as a_1, b_1, r_1 may overlap arbitrarily. t[0], t[1], 
    3108             :    t[2] and Delta must not overlap with anything. */
    3109             : /* FIXME: is there a faster multiplication routine if both inputs have 
    3110             :    norm 1? */
    3111             : 
    3112             : static void 
    3113       14396 : gfp_ext_mul (mpres_t r_0, mpres_t r_1, const mpres_t a_0, const mpres_t a_1,
    3114             :              const mpres_t b_0, const mpres_t b_1, const mpres_t Delta, 
    3115             :              mpmod_t modulus, ATTRIBUTE_UNUSED const unsigned long tmplen, 
    3116             :              mpres_t *tmp)
    3117             : {
    3118             :   ASSERT (tmplen >= 2);
    3119             :   if (0 && test_verbose (OUTPUT_TRACE))
    3120             :     {
    3121             :       mpz_t t;
    3122             :       mpz_init (t);
    3123             :       mpres_get_z (t, Delta, modulus);
    3124             :       outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ w = quadgen (4*%Zd); "
    3125             :                "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
    3126             :       mpz_clear (t);
    3127             :       outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ (");
    3128             :       gfp_ext_print (a_0, a_1, modulus, OUTPUT_TRACE);
    3129             :       outputf (OUTPUT_TRACE, ") * (");
    3130             :       gfp_ext_print (b_0, b_1, modulus, OUTPUT_TRACE);
    3131             :     }
    3132             :   
    3133       14396 :   mpres_add (tmp[0], a_0, a_1, modulus);
    3134       14396 :   mpres_add (tmp[1], b_0, b_1, modulus);
    3135       14396 :   mpres_mul (tmp[1], tmp[0], tmp[1], modulus); /* t[1] = (a_0+a_1)*(b_0+b_1) = 
    3136             :                                             a_0*b_0 + a_0*b_1 + a_1*b_0 + 
    3137             :                                             a_1*b_1 */
    3138             : 
    3139       14396 :   mpres_mul (r_0, a_0, b_0, modulus);    /* r_0 = a_0*b_0. We don't need a_0 
    3140             :                                             or b_0 any more now */
    3141       14396 :   mpres_sub (tmp[1], tmp[1], r_0, modulus);  /* t[1] = a_0*b_1 + a_1*b_0 + 
    3142             :                                                 a_1*b_1 */
    3143             :   
    3144       14396 :   mpres_mul (tmp[0], a_1, b_1, modulus);   /* t[0] = a_1*b_1. We don't need 
    3145             :                                               a_1 or b_1 any more now */
    3146       14396 :   mpres_sub (r_1, tmp[1], tmp[0], modulus);  /* r_1 == a_0*b_1 + a_1*b_0 */
    3147             :   
    3148       14396 :   mpres_mul (tmp[0], tmp[0], Delta, modulus); /* t[0] = a_1*b_1*Delta */
    3149       14396 :   mpres_add (r_0, r_0, tmp[0], modulus);   /* r_0 = a_0*b_0 + a_1*b_1*Delta */
    3150             : 
    3151             :   if (0 && test_verbose (OUTPUT_TRACE))
    3152             :     {
    3153             :       outputf (OUTPUT_TRACE, ") == ");
    3154             :       gfp_ext_print (r_0, r_1, modulus, OUTPUT_TRACE);
    3155             :       outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3156             :     }
    3157       14396 : }
    3158             : 
    3159             : 
    3160             : /* Computes (a_0 + a_1 * sqrt(Delta))^2, where the norm 
    3161             :    (a_0^2 - a_1^2*Delta) is assumed to be equal to 1. Hence 
    3162             :    (a_0 + a_1 * sqrt(Delta))^2 = a_0^2 + 2*a_0*a_1*sqrt(Delta) + a_1^2*Delta
    3163             :    and a_0^2 + a_1^2*Delta = a_0^2 + a_1^2*Delta + norm - 1 = 2*a_0^2 - 1.
    3164             :    a_0 and r_0, as well as a_1 and r_1 may overlap */
    3165             : 
    3166             : static void
    3167       23998 : gfp_ext_sqr_norm1 (mpres_t r_0, mpres_t r_1, const mpres_t a_0, 
    3168             :                    const mpres_t a_1, mpmod_t modulus)
    3169             : {
    3170             :   ASSERT (a_0 != r_1);  /* a_0 is read after r_1 is written */
    3171             :   
    3172       23998 :   if (pari)
    3173           0 :     gmp_printf ("/* gfp_ext_sqr_norm1 */ (%Zd + %Zd * w)^2 %% N == ", a_0, a_1);
    3174             :   
    3175       23998 :   mpres_mul (r_1, a_0, a_1, modulus);
    3176       23998 :   mpres_add (r_1, r_1, r_1, modulus);       /* r_1 = 2*a_0*a_1 */
    3177             :   
    3178       23998 :   mpres_sqr (r_0, a_0, modulus);
    3179       23998 :   mpres_add (r_0, r_0, r_0, modulus);
    3180       23998 :   mpres_sub_ui (r_0, r_0, 1UL, modulus);    /* r_0 = 2*a_0^2 - 1 */
    3181             : 
    3182       23998 :   if (pari)
    3183           0 :     gmp_printf ("(%Zd + %Zd * w) %% N /* PARI C */\n", r_0, r_1);
    3184       23998 : }
    3185             : 
    3186             : 
    3187             : /* Raise (a0 + a1*sqrt(Delta)) to the power e which is a signed long int. 
    3188             :    (a0 + a1*sqrt(Delta)) is assumed to have norm 1, i.e. 
    3189             :    a0^2 - a1^2*Delta == 1. The result is (r0 * r1*sqrt(Delta)). 
    3190             :    a0, a1, r0 and r1 must not overlap */
    3191             : 
    3192             : static void 
    3193        2682 : gfp_ext_pow_norm1_sl (mpres_t r0, mpres_t r1, const mpres_t a0, 
    3194             :                       const mpres_t a1, const long e, const mpres_t Delta, 
    3195             :                       mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
    3196             : {
    3197        2682 :   const unsigned long abs_e = labs (e);
    3198        2682 :   unsigned long mask = ~0UL - (~0UL >> 1);
    3199             : 
    3200             :   ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
    3201             : 
    3202        2682 :   if (e == 0)
    3203             :     {
    3204         468 :       mpres_set_ui (r0, 1UL, modulus);
    3205         468 :       mpres_set_ui (r1, 0UL, modulus);
    3206         468 :       return;
    3207             :     }
    3208             : 
    3209             :   /* If e < 0, we want 1/(a0 + a1*sqrt(Delta)). By extending with 
    3210             :      a0 - a1*sqrt(Delta), we get 
    3211             :      (a0 - a1*sqrt(Delta)) / (a0^2 - a1^2 * Delta), but that denomiator
    3212             :      is the norm which is known to be 1, so the result is 
    3213             :      a0 - a1*sqrt(Delta). */
    3214             : 
    3215      122902 :   while ((abs_e & mask) == 0UL)
    3216      120688 :     mask >>= 1;
    3217             : 
    3218        2214 :   mpres_set (r0, a0, modulus);
    3219        2214 :   mpres_set (r1, a1, modulus);
    3220             : 
    3221       21008 :   while (mask > 1UL)
    3222             :     {
    3223       18794 :       gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
    3224       18794 :       mask >>= 1;
    3225       18794 :       if (abs_e & mask)
    3226        9070 :         gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
    3227             :     }
    3228             : 
    3229        2214 :   if (e < 0)
    3230           0 :     mpres_neg (r1, r1, modulus);
    3231             : 
    3232             :   if (0 && test_verbose (OUTPUT_TRACE))
    3233             :     {
    3234             :       mpz_t t;
    3235             :       mpz_init (t);
    3236             :       mpres_get_z (t, Delta, modulus);
    3237             :       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ w = quadgen (4*%Zd); "
    3238             :                "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
    3239             :       mpz_clear (t);
    3240             :       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ (");
    3241             :       gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
    3242             :       outputf (OUTPUT_TRACE, ")^(%ld) == ", e);
    3243             :       gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
    3244             :       outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3245             :     }
    3246             : }
    3247             : 
    3248             : 
    3249             : /* Same, but taking an mpz_t argument for the exponent */
    3250             : 
    3251             : static void 
    3252         330 : gfp_ext_pow_norm1 (mpres_t r0, mpres_t r1, const mpres_t a0, 
    3253             :                    const mpres_t a1, mpz_t e, const mpres_t Delta, 
    3254             :                    mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
    3255             : {
    3256             :   mpz_t abs_e;
    3257             :   size_t idx;
    3258             : 
    3259             :   ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
    3260             : 
    3261         330 :   if (mpz_sgn (e) == 0)
    3262             :     {
    3263           0 :       mpres_set_ui (r0, 1UL, modulus);
    3264           0 :       mpres_set_ui (r1, 0UL, modulus);
    3265           0 :       return;
    3266             :     }
    3267             : 
    3268         330 :   mpz_init (abs_e);
    3269         330 :   mpz_abs (abs_e, e);
    3270         330 :   idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus ecm_tstbit (abs_e, idx) == 1 */
    3271             :   ASSERT (ecm_tstbit (abs_e, idx) == 1);
    3272             : 
    3273         330 :   mpres_set (r0, a0, modulus);
    3274         330 :   mpres_set (r1, a1, modulus);
    3275             : 
    3276        4736 :   while (idx > 0UL)
    3277             :     {
    3278        4406 :       gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
    3279        4406 :       idx--;
    3280        4406 :       if (ecm_tstbit (abs_e, idx))
    3281        2410 :         gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
    3282             :     }
    3283             : 
    3284         330 :   if (mpz_sgn (e) < 0)
    3285          21 :     mpres_neg (r1, r1, modulus);
    3286             : 
    3287         330 :   mpz_clear (abs_e);
    3288             : 
    3289         330 :   if (test_verbose (OUTPUT_TRACE))
    3290             :     {
    3291             :       mpz_t t;
    3292          10 :       mpz_init (t);
    3293          10 :       mpres_get_z (t, Delta, modulus);
    3294          10 :       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ w = quadgen (4*%Zd); "
    3295          10 :                "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
    3296          10 :       mpz_clear (t);
    3297          10 :       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ (");
    3298          10 :       gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
    3299          10 :       outputf (OUTPUT_TRACE, ")^(%Zd) == ", e);
    3300          10 :       gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
    3301          10 :       outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3302             :     }
    3303             : }
    3304             : 
    3305             : #if 0
    3306             : /* Compute r[i] = a^((k+i)^2) for i = 0, 1, ..., l-1, where "a" is an 
    3307             :    element of norm 1 in the quadratic extension ring */
    3308             : 
    3309             : static void
    3310             : gfp_ext_rn2 (mpres_t *r_x, mpres_t *r_y, const mpres_t a_x, const mpres_t a_y,
    3311             :              const long k, const unsigned long l, const mpres_t Delta, 
    3312             :              mpmod_t modulus, const unsigned long origtmplen, mpres_t *origtmp)
    3313             : {
    3314             :   mpres_t *r2_x = origtmp, *r2_y = origtmp + 2, *v = origtmp + 4, 
    3315             :     *V2 = origtmp + 6;
    3316             :   const unsigned long newtmplen = origtmplen - 7;
    3317             :   mpres_t *newtmp = origtmp + 7;
    3318             :   unsigned long i;
    3319             : 
    3320             :   if (l == 0UL)
    3321             :     return;
    3322             : 
    3323             :   ASSERT (origtmplen >= 8UL);
    3324             : 
    3325             :   if (pari)
    3326             :     gmp_printf ("/* In gfp_ext_rn2 */ ; a = %Zd + %Zd * w; /* PARI */\n",
    3327             :                 a_x, a_y, modulus->orig_modulus);
    3328             : 
    3329             :   /* Compute r[0] = a^(k^2). We do it by two exponentiations by k and use 
    3330             :      v[0] and v[1] as temp storage */
    3331             :   gfp_ext_pow_norm1_sl (v[0], v[1], a_x, a_y, k, Delta, modulus, newtmplen, 
    3332             :                      newtmp);
    3333             :   gfp_ext_pow_norm1_sl (r_x[0], r_y[0], v[0], v[1], k, Delta, modulus, 
    3334             :                      newtmplen, newtmp);
    3335             :   if (pari)
    3336             :     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
    3337             :                 "/* PARI C */\n", k, r_x[0], r_y[0]);
    3338             : 
    3339             :   /* Compute r[1] = a^((k+1)^2) = a^(k^2 + 2k + 1)*/
    3340             :   if (l > 1)
    3341             :     {
    3342             :       /* v[0] + v[1]*sqrt(Delta) still contains a^k */
    3343             :       gfp_ext_sqr_norm1 (r_x[1], r_y[1], v[0], v[1], modulus);
    3344             :       /* Now r[1] = a^(2k) */
    3345             :       gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], r_x[0], r_y[0], Delta, 
    3346             :                    modulus, newtmplen, newtmp);
    3347             :       /* Now r[1] = a^(k^2 + 2k) */
    3348             :       gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], a_x, a_y, Delta, modulus, 
    3349             :                    newtmplen, newtmp);
    3350             :       /* Now r[1] = a^(k^2 + 2k + 1) = a^((k+1)^2) */
    3351             :     }
    3352             :   if (pari)
    3353             :     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
    3354             :                 "/* PARI C */\n", k + 1, r_x[1], r_y[1]);
    3355             :   
    3356             :   /* Compute r2[0] = a^(k^2+2) = a^(k^2) * a^2 */
    3357             :   gfp_ext_sqr_norm1 (v[0], v[1], a_x, a_y, modulus);
    3358             :   gfp_ext_mul (r2_x[0], r2_y[0], r_x[0], r_y[0], v[0], v[1], Delta, modulus, 
    3359             :                newtmplen, newtmp);
    3360             :   if (pari)
    3361             :     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
    3362             :                 "/* PARI C */\n", k, r2_x[0], r2_y[0]);
    3363             :   /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
    3364             :   gfp_ext_mul (r2_x[1], r2_y[1], r_x[1], r_y[1], v[0], v[1], Delta, modulus, 
    3365             :                newtmplen, newtmp);
    3366             :   if (pari)
    3367             :     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
    3368             :                 "/* PARI C */\n", k + 1, r2_x[1], r2_y[1]);
    3369             :   
    3370             :   /* Compute V_2(a + 1/a). Since 1/a = a_x - a_y, we have a+1/a = 2*a_x.
    3371             :      V_2(x) = x^2 - 2, so we want 4*a_x^2 - 2. */
    3372             :   mpres_add (*V2, a_x, a_x, modulus); /* V2 = a + 1/a  = 2*a_x*/
    3373             :   V (v[0], *V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (a + 1/a) */
    3374             :   V (v[1], *V2, 2 * k + 3, modulus);  /* v[0] = V_{2k+3} (a + 1/a) */
    3375             :   mpres_sqr (*V2, *V2, modulus);      /* V2 = 4*a_x^2 */
    3376             :   mpres_sub_ui (*V2, *V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
    3377             :   if (pari)
    3378             :     {
    3379             :       gmp_printf ("/* In gfp_ext_rn2 */ ((a + 1/a)^2 - 2) %% N == "
    3380             :                   "%Zd %% N /* PARI C */\n", *V2);
    3381             :       gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
    3382             :                   "/* PARI C */\n", 2 * k + 1, v[0]);
    3383             :       gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
    3384             :                   "/* PARI C */\n", 2 * k + 3, v[1]);
    3385             :     }
    3386             :   
    3387             :   /* Compute the remaining a^((k+i)^2) values according to Peter's 
    3388             :      recurrence */
    3389             :   
    3390             :   for (i = 2; i < l; i++)
    3391             :     {
    3392             :       /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i taken
    3393             :          modulo 2 */
    3394             :       mpres_mul (r_x[i], r2_x[1 - i % 2], v[i % 2], modulus);
    3395             :       mpres_sub (r_x[i], r_x[i], r2_x[i % 2], modulus);
    3396             :       mpres_mul (r_y[i], r2_y[1 - i % 2], v[i % 2], modulus);
    3397             :       mpres_sub (r_y[i], r_y[i], r2_y[i % 2], modulus);
    3398             :       
    3399             :       /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
    3400             :       mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
    3401             :       mpres_sub (r2_x[i % 2], r2_x[i % 2], r_x[i - 2], modulus);
    3402             :       mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
    3403             :       mpres_sub (r2_y[i % 2], r2_y[i % 2], r_y[i - 2], modulus);
    3404             :       
    3405             :       /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
    3406             :       mpres_mul (newtmp[0], v[1 - i % 2], *V2, modulus);
    3407             :       mpres_sub (v[i % 2], newtmp[0], v[i % 2], modulus);
    3408             :       if (pari)
    3409             :         gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
    3410             :                     "/* PARI C */\n", 2 * (k + i) + 1, v[i % 2]);
    3411             :     }
    3412             : }
    3413             : #endif
    3414             : 
    3415             : /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l. 
    3416             :    x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P. */
    3417             : 
    3418             : static void
    3419         330 : pp1_sequence_g (listz_t g_x, listz_t g_y, mpzspv_t g_x_ntt, mpzspv_t g_y_ntt,
    3420             :                 const mpres_t b1_x, const mpres_t b1_y, const unsigned long P, 
    3421             :                 const mpres_t Delta, const long M_param, 
    3422             :                 const unsigned long l_param, const mpz_t m_1, const long k_2, 
    3423             :                 const mpmod_t modulus_param, const mpzspm_t ntt_context)
    3424             : {
    3425         330 :   const unsigned long tmplen = 3;
    3426         330 :   const int want_x = (g_x != NULL || g_x_ntt != NULL);
    3427         330 :   const int want_y = (g_y != NULL || g_y_ntt != NULL);
    3428             :   mpres_t r_x, r_y, x0_x, x0_y, v2,
    3429             :       r1_x[2], r1_y[2], r2_x[2], r2_y[2], 
    3430             :       v[2], tmp[3];
    3431             :   mpz_t mt;
    3432             :   mpmod_t modulus; /* Thread-local copy of modulus_param */
    3433         330 :   unsigned long i, l = l_param, offset = 0;
    3434         330 :   long M = M_param;
    3435             :   long timestart, realstart;
    3436         330 :   int want_output = 1;
    3437             : 
    3438         655 :   outputf (OUTPUT_VERBOSE, "Computing %s%s%s", 
    3439             :            (want_x) ? "g_x" : "", 
    3440         325 :            (want_x && want_y) ? " and " : "",
    3441             :            (want_y) ? "g_y" : "");
    3442         330 :   timestart = cputime ();
    3443         330 :   realstart = realtime ();
    3444             : 
    3445             : #ifdef _OPENMP
    3446             : #pragma omp parallel if (l > 100) private(r_x, r_y, x0_x, x0_y, v2, r1_x, r1_y, r2_x, r2_y, v, tmp, mt, modulus, i, l, offset, M, want_output)
    3447             :   {
    3448             :     /* When multi-threading, we adjust the parameters for each thread */
    3449             : 
    3450             :     const int nr_chunks = omp_get_num_threads();
    3451             :     const int thread_nr = omp_get_thread_num();
    3452             : 
    3453             :     l = (l_param - 1) / nr_chunks + 1;
    3454             :     offset = thread_nr * l;
    3455             :     if (offset <= l_param)
    3456             :       l = MIN(l, l_param - offset);
    3457             :     else
    3458             :       l = 0;
    3459             :     M = M_param - (long) offset;
    3460             : 
    3461             :     want_output = (omp_get_thread_num() == 0);
    3462             :     if (want_output)
    3463             :       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    3464             : #endif
    3465         330 :     mpmod_init_set (modulus, modulus_param);
    3466         330 :     mpres_init (r_x, modulus);
    3467         330 :     mpres_init (r_y, modulus);
    3468         330 :     mpres_init (x0_x, modulus);
    3469         330 :     mpres_init (x0_y, modulus);
    3470         330 :     mpres_init (v2, modulus);
    3471         990 :     for (i = 0; i < 2UL; i++)
    3472             :       {
    3473         660 :         mpres_init (r1_x[i], modulus);
    3474         660 :         mpres_init (r1_y[i], modulus);
    3475         660 :         mpres_init (r2_x[i], modulus);
    3476         660 :         mpres_init (r2_y[i], modulus);
    3477         660 :         mpres_init (v[i], modulus);
    3478             :       }
    3479        1320 :     for (i = 0; i < tmplen; i++)
    3480         990 :       mpres_init (tmp[i], modulus);
    3481         330 :     mpz_init (mt);
    3482             :     
    3483         330 :     if (want_output && test_verbose (OUTPUT_TRACE))
    3484             :       {
    3485          10 :         mpres_get_z (mt, Delta, modulus);
    3486          10 :         outputf (OUTPUT_TRACE, 
    3487             :                  "\n/* pp1_sequence_g */ w = quadgen (4*%Zd); P = %lu; "
    3488             :                  "M = %ld; k_2 = %ld; m_1 = %Zd; N = %Zd; /* PARI */\n", 
    3489             :                  mt, P, M, k_2, m_1, modulus->orig_modulus);
    3490             :         
    3491          10 :         outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ b_1 = ");
    3492          10 :         gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
    3493          10 :         outputf (OUTPUT_TRACE, "; /* PARI */\n");
    3494          10 :         outputf (OUTPUT_TRACE, 
    3495             :                  "/* pp1_sequence_g */ r = b_1^P; /* PARI */\n");
    3496          10 :         outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
    3497             :                  "x_0 = b_1^(2*k_2 + (2*m_1 + 1) * P); /* PARI */\n");
    3498          10 :         outputf (OUTPUT_TRACE, 
    3499             :                  "/* pp1_sequence_g */ addrec(x) = x + 1/x; /* PARI */\n");
    3500             :       }
    3501             :     
    3502             :     /* Compute r */
    3503         330 :     gfp_ext_pow_norm1_sl (r_x, r_y, b1_x, b1_y, P, Delta, modulus, 
    3504             :                           tmplen, tmp);
    3505         330 :     if (want_output && test_verbose (OUTPUT_TRACE))
    3506             :       {
    3507          10 :         outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ r == ");
    3508          10 :         gfp_ext_print (r_x, r_y, modulus, OUTPUT_TRACE);
    3509          10 :         outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3510             :       }
    3511             :     
    3512             :     /* Compute x0 = x_0 */
    3513         330 :     mpz_mul_2exp (mt, m_1, 1UL);
    3514         330 :     mpz_add_ui (mt, mt, 1UL);
    3515         330 :     mpz_mul_ui (mt, mt, P);
    3516         330 :     mpz_add_si (mt, mt, k_2);
    3517         330 :     mpz_add_si (mt, mt, k_2); /* mt = 2*k_2 + (2*m_1 + 1) * P */
    3518         330 :     gfp_ext_pow_norm1 (x0_x, x0_y, b1_x, b1_y, mt, Delta, modulus, 
    3519             :                        tmplen, tmp);
    3520         330 :     if (want_output && test_verbose (OUTPUT_TRACE))
    3521             :       {
    3522          10 :         outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ x_0 == ");
    3523          10 :         gfp_ext_print (x0_x, x0_y, modulus, OUTPUT_TRACE);
    3524          10 :         outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3525             :       }
    3526             :     
    3527             :     
    3528             :     /* Compute g[1] = r1[0] = x0^M * r^(M^2) = (x0 * r^M)^M.
    3529             :        We use v[0,1] as temporary storage */
    3530         330 :     gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M, Delta, modulus, 
    3531             :                           tmplen, tmp); /* v[0,1] = r^M */
    3532         330 :     gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus, 
    3533             :                  tmplen, tmp); /* v[0,1] = r^M * x_0 */
    3534         330 :     gfp_ext_pow_norm1_sl (r1_x[0], r1_y[0], v[0], v[1], M, Delta, modulus, 
    3535             :                           tmplen, tmp); /* r1[0] = (r^M * x_0)^M */
    3536         330 :     if (l > 0)
    3537             :       {
    3538         330 :         if (g_x != NULL)
    3539          54 :           mpres_get_z (g_x[offset], r1_x[0], modulus);
    3540         330 :         if (g_y != NULL)
    3541          54 :           mpres_get_z (g_y[offset], r1_y[0], modulus);
    3542         330 :         if (g_x_ntt != NULL)
    3543             :           {
    3544         271 :             mpres_get_z (mt, r1_x[0], modulus);
    3545         271 :             mpzspv_from_mpzv (g_x_ntt, offset, &mt, 1UL, ntt_context);
    3546             :           }
    3547         330 :         if (g_y_ntt != NULL)
    3548             :           {
    3549         271 :             mpres_get_z (mt, r1_y[0], modulus);
    3550         271 :             mpzspv_from_mpzv (g_y_ntt, offset, &mt, 1UL, ntt_context);
    3551             :           }
    3552             :       }
    3553             :     
    3554             :     
    3555             :     /* Compute g[1] = r1[1] = x0^(M-1) * r^((M-1)^2) = (x0 * r^(M-1))^(M-1). 
    3556             :        We use v[0,1] as temporary storage. FIXME: simplify, reusing g_0 */
    3557         330 :     gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M - 1, Delta, modulus, 
    3558             :                           tmplen, tmp);
    3559         330 :     gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus, 
    3560             :                  tmplen, tmp);
    3561         330 :     gfp_ext_pow_norm1_sl (r1_x[1], r1_y[1], v[0], v[1], M - 1, Delta, 
    3562             :                           modulus, tmplen, tmp);
    3563         330 :     if (l > 1)
    3564             :       {
    3565         330 :         if (g_x != NULL)
    3566          54 :           mpres_get_z (g_x[offset + 1], r1_x[1], modulus);
    3567         330 :         if (g_y != NULL)
    3568          54 :           mpres_get_z (g_y[offset + 1], r1_y[1], modulus);
    3569         330 :         if (g_x_ntt != NULL)
    3570             :           {
    3571         271 :             mpres_get_z (mt, r1_x[1], modulus);
    3572         271 :             mpzspv_from_mpzv (g_x_ntt, offset + 1, &mt, 1UL, ntt_context);
    3573             :           }
    3574         330 :         if (g_y_ntt != NULL)
    3575             :           {
    3576         271 :             mpres_get_z (mt, r1_y[1], modulus);
    3577         271 :             mpzspv_from_mpzv (g_y_ntt, offset + 1, &mt, 1UL, ntt_context);
    3578             :           }
    3579             :       }
    3580             :     
    3581             :     
    3582             :     /* x0 := $x_0 * r^{2M - 3}$ */
    3583             :     /* We don't need x0 after this so we overwrite it. We use v[0,1] as 
    3584             :        temp storage for $r^{2M - 3}$. */
    3585         330 :     gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, 2UL*M - 3UL, Delta, modulus,
    3586             :                           tmplen, tmp);
    3587         330 :     gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, v[0], v[1], Delta, modulus,
    3588             :                  tmplen, tmp);
    3589             :     
    3590             :     /* Compute r2[0] = r1[0] * r^2 and r2[1] = r1[1] * r^2. */
    3591             :     /* We only need $r^2$ from here on, so we set r = $r^2$ */
    3592         330 :     gfp_ext_sqr_norm1 (r_x, r_y, r_x, r_y, modulus);  
    3593         330 :     gfp_ext_mul (r2_x[0], r2_y[0], r1_x[0], r1_y[0], r_x, r_y, Delta, 
    3594             :                  modulus, tmplen, tmp);
    3595         330 :     gfp_ext_mul (r2_x[1], r2_y[1], r1_x[1], r1_y[1], r_x, r_y, Delta, 
    3596             :                  modulus, tmplen, tmp);
    3597             :     
    3598             :     /* v[1] := $x_0 * r^{2*M - 3} + 1/(x_0 * r^{2M - 3}) */
    3599         330 :     mpres_add (v[1], x0_x, x0_x, modulus);
    3600             :     /* x0 := x0 * r = $x_0 * r^{2M - 1}$ */
    3601         330 :     gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, r_x, r_y, Delta, modulus,
    3602             :                  tmplen, tmp);
    3603             :     /* v[0] := $x_0 * r^{2M - 1} + 1/(x_0 * r^{2M - 1}) */
    3604         330 :     mpres_add (v[0], x0_x, x0_x, modulus);
    3605             :     
    3606             :     /* v2 = V_2 (r + 1/r) = r^2 + 1/r^2 */
    3607         330 :     mpres_add (v2, r_x, r_x, modulus);
    3608             :     
    3609             :     /* We don't need the contents of r any more and use it as a temp var */
    3610             :     
    3611     6685430 :     for (i = 2; i < l; i++)
    3612             :       {
    3613     6685100 :         if (want_x)
    3614             :           {
    3615             :             /* r1[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i 
    3616             :                taken modulo 2. We store the new r1_x[i] in r_x for now */
    3617     6357430 :             mpres_mul (r_x, r2_x[1 - i % 2], v[i % 2], modulus);
    3618     6357430 :             mpres_sub (r_x, r_x,            r2_x[i % 2], modulus);
    3619             :             /* r2[i] = r2[i-1] * v[i-1] - r1[i-2] */
    3620     6357430 :             mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
    3621     6357430 :             mpres_sub (r2_x[i % 2], r2_x[i % 2],     r1_x[i % 2], modulus);
    3622     6357430 :             mpres_set (r1_x[i % 2], r_x, modulus); /* FIXME, avoid this copy */
    3623     6357430 :             if (g_x != NULL)
    3624       86388 :               mpres_get_z (g_x[offset + i], r_x, modulus); /* FIXME, avoid these REDC */
    3625     6357430 :             if (g_x_ntt != NULL)
    3626             :               {
    3627     6271042 :                 mpres_get_z (mt, r_x, modulus);
    3628     6271042 :                 mpzspv_from_mpzv (g_x_ntt, offset + i, &mt, 1UL, ntt_context);
    3629             :               }
    3630             :           }
    3631             :         
    3632     6685100 :         if (want_y)
    3633             :           {
    3634             :             /* Same for y coordinate */
    3635     6357430 :             mpres_mul (r_y, r2_y[1 - i % 2], v[i % 2], modulus);
    3636     6357430 :             mpres_sub (r_y, r_y,             r2_y[i % 2], modulus);
    3637     6357430 :             mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
    3638     6357430 :             mpres_sub (r2_y[i % 2], r2_y[i % 2],     r1_y[i % 2], modulus);
    3639     6357430 :             mpres_set (r1_y[i % 2], r_y, modulus);
    3640     6357430 :             if (g_y != NULL)
    3641       86388 :               mpres_get_z (g_y[offset + i], r_y, modulus); /* Keep r1, r2 in mpz_t ? */
    3642     6357430 :             if (g_y_ntt != NULL)
    3643             :               {
    3644     6271042 :                 mpres_get_z (mt, r_y, modulus);
    3645     6271042 :                 mpzspv_from_mpzv (g_y_ntt, offset + i, &mt, 1UL, ntt_context);
    3646             :               }
    3647             :           }
    3648             :         
    3649             :         /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
    3650     6685100 :         mpres_mul (r_x, v[1 - i % 2], v2, modulus);
    3651     6685100 :         mpres_sub (v[i % 2], r_x, v[i % 2], modulus);
    3652     6685100 :         if (want_output && test_verbose (OUTPUT_TRACE))
    3653             :           {
    3654             :             mpz_t t;
    3655       39944 :             mpz_init (t);
    3656       39944 :             mpres_get_z (t, v[i % 2], modulus);
    3657       39944 :             outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
    3658             :                      "addrec(x_0 * r^(2*(M-%lu) - 1)) == %Zd /* PARI C */\n", 
    3659             :                      i, t);
    3660       39944 :             mpz_clear (t);
    3661             :           }
    3662             :       }
    3663             :     
    3664         330 :     mpres_clear (r_x, modulus);
    3665         330 :     mpres_clear (r_y, modulus);
    3666         330 :     mpres_clear (x0_x, modulus);
    3667         330 :     mpres_clear (x0_y, modulus);
    3668         330 :     mpres_clear (v2, modulus);
    3669         990 :     for (i = 0; i < 2; i++)
    3670             :       {
    3671         660 :         mpres_clear (r1_x[i], modulus);
    3672         660 :         mpres_clear (r1_y[i], modulus);
    3673         660 :         mpres_clear (r2_x[i], modulus);
    3674         660 :         mpres_clear (r2_y[i], modulus);
    3675         660 :         mpres_clear (v[i], modulus);
    3676             :       }
    3677        1320 :     for (i = 0; i < tmplen; i++)
    3678         990 :       mpres_clear (tmp[i], modulus);
    3679         330 :     mpz_clear (mt);
    3680         330 :     mpmod_clear (modulus);
    3681             : #ifdef _OPENMP
    3682             :   }
    3683             : #endif
    3684             :   
    3685         330 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    3686             : 
    3687         330 :   if (g_x != NULL && g_y != NULL && test_verbose(OUTPUT_TRACE))
    3688             :     {
    3689        7198 :       for (i = 0; i < l_param; i++)
    3690             :         {
    3691        7196 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu = "
    3692             :                    "x_0^(M-%lu) * r^((M-%lu)^2); /* PARI */", i, i, i);
    3693        7196 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu == "
    3694        7196 :                    "%Zd + %Zd*w /* PARI C */\n", i, g_x[i], g_y[i]);
    3695             :         }
    3696             :     }
    3697         330 : }
    3698             : 
    3699             : 
    3700             : /* Compute r[i] = b1^(-P*(k+i)^2) * f_i for i = 0, 1, ..., l-1, where "b1" is 
    3701             :    an element of norm 1 in the quadratic extension ring */
    3702             : 
    3703             : static void
    3704         234 : pp1_sequence_h (listz_t h_x, listz_t h_y, mpzspv_t h_x_ntt, mpzspv_t h_y_ntt,
    3705             :                 const listz_t f, const mpres_t b1_x, const mpres_t b1_y, 
    3706             :                 const long k_param, const unsigned long l_param, 
    3707             :                 const unsigned long P, const mpres_t Delta, 
    3708             :                 mpmod_t modulus_param, const mpzspm_t ntt_context)
    3709             : {
    3710             :   unsigned long i;
    3711             :   long timestart, realstart;
    3712             : 
    3713         234 :   if (l_param == 0UL)
    3714           0 :     return;
    3715             : 
    3716             :   ASSERT (f != h_x);
    3717             :   ASSERT (f != h_y);
    3718             : 
    3719         234 :   outputf (OUTPUT_VERBOSE, "Computing h_x and h_y");
    3720         234 :   timestart = cputime ();
    3721         234 :   realstart = realtime ();
    3722             : 
    3723         234 :   if (test_verbose (OUTPUT_TRACE))
    3724             :     {
    3725             :       mpz_t t;
    3726           5 :       mpz_init (t);
    3727           5 :       mpres_get_z (t, Delta, modulus_param);
    3728           5 :       outputf (OUTPUT_TRACE, "\n/* pp1_sequence_h */ N = %Zd; "
    3729             :                "Delta = %Zd; w = quadgen (4*Delta); k = %ld; P = %lu; "
    3730           5 :                "/* PARI */\n", modulus_param->orig_modulus, t, k_param, P);
    3731           5 :       outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ b_1 = ");
    3732           5 :       gfp_ext_print (b1_x, b1_y, modulus_param, OUTPUT_TRACE);
    3733           5 :       outputf (OUTPUT_TRACE, "; r = b_1^P; rn = b_1^(-P); /* PARI */\n");
    3734        4882 :       for (i = 0; i < l_param; i++)
    3735        4877 :         outputf (OUTPUT_TRACE, 
    3736        4877 :                  "/* pp1_sequence_h */ f_%lu = %Zd; /* PARI */\n", i, f[i]);
    3737           5 :       mpz_clear (t);
    3738             :     }
    3739             : 
    3740             : #ifdef _OPENMP
    3741             : #pragma omp parallel if (l_param > 100) private(i)
    3742             : #endif
    3743             :   {
    3744         234 :     const size_t tmplen = 2;
    3745             :     mpres_t s_x[3], s_y[3], s2_x[2], s2_y[2], v[2], V2, rn_x, rn_y, 
    3746             :       tmp[2];
    3747             :     mpmod_t modulus; /* Thread-local copy of modulus_param */
    3748             :     mpz_t mt;
    3749         234 :     unsigned long l = l_param, offset = 0;
    3750         234 :     long k = k_param;
    3751             : 
    3752             : #ifdef _OPENMP
    3753             :     /* When multi-threading, we adjust the parameters for each thread */
    3754             : 
    3755             :     const int nr_chunks = omp_get_num_threads();
    3756             :     const int thread_nr = omp_get_thread_num();
    3757             : 
    3758             :     l = (l_param - 1) / nr_chunks + 1;
    3759             :     offset = thread_nr * l;
    3760             :     if (offset <= l_param)
    3761             :       l = MIN(l, l_param - offset);
    3762             :     else
    3763             :       l = 0;
    3764             : 
    3765             :     if (thread_nr == 0)
    3766             :       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    3767             :     outputf (OUTPUT_TRACE, "\n");
    3768             : #endif
    3769             : 
    3770             :     /* Each thread computes r[i + offset] = b1^(-P*(k+i+offset)^2) * f_i 
    3771             :        for i = 0, 1, ..., l-1, where l is the adjusted length of each thread */
    3772             : 
    3773             :     /* Test that k+offset does not overflow */
    3774         234 :     ASSERT_ALWAYS (offset <= (unsigned long) LONG_MAX && 
    3775             :                    k <= LONG_MAX - (long) offset);
    3776         234 :     k += (long) offset;
    3777             : 
    3778         234 :     mpz_init (mt);
    3779             :     /* Make thread-local copy of modulus */
    3780         234 :     mpmod_init_set (modulus, modulus_param);
    3781             : 
    3782             :     /* Init the local mpres_t variables */
    3783         702 :     for (i = 0; i < 2; i++)
    3784             :       {
    3785         468 :         mpres_init (s_x[i], modulus);
    3786         468 :         mpres_init (s_y[i], modulus);
    3787         468 :         mpres_init (s2_x[i], modulus);
    3788         468 :         mpres_init (s2_y[i], modulus);
    3789         468 :         mpres_init (v[i], modulus);
    3790             :       }
    3791         234 :     mpres_init (s_x[2], modulus);
    3792         234 :     mpres_init (s_y[2], modulus);
    3793         234 :     mpres_init (V2, modulus);
    3794         234 :     mpres_init (rn_x, modulus);
    3795         234 :     mpres_init (rn_y, modulus);
    3796         702 :     for (i = 0; i < (unsigned long) tmplen; i++)
    3797         468 :       mpres_init (tmp[i], modulus);
    3798             : 
    3799             :     /* Compute rn = b_1^{-P}. It has the same value for all threads,
    3800             :        but we make thread local copies anyway. */
    3801         234 :     gfp_ext_pow_norm1_sl (rn_x, rn_y, b1_x, b1_y, P, Delta, modulus, tmplen, 
    3802             :                           tmp);
    3803         234 :     mpres_neg (rn_y, rn_y, modulus);
    3804             :     
    3805             :     /* Compute s[0] = rn^(k^2) = r^(-k^2). We do it by two 
    3806             :        exponentiations by k and use v[0] and v[1] as temp storage */
    3807         234 :     gfp_ext_pow_norm1_sl (v[0], v[1], rn_x, rn_y, k, Delta, modulus, 
    3808             :                           tmplen, tmp);
    3809         234 :     gfp_ext_pow_norm1_sl (s_x[0], s_y[0], v[0], v[1], k, Delta, modulus, 
    3810             :                           tmplen, tmp);
    3811         234 :     if (test_verbose (OUTPUT_TRACE))
    3812             :       {
    3813             : #ifdef _OPENMP
    3814             : #pragma omp critical
    3815             : #endif
    3816             :         {
    3817           5 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ", k);
    3818           5 :           gfp_ext_print (s_x[0], s_y[0], modulus, OUTPUT_TRACE);
    3819           5 :           outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3820             :         }
    3821             :       }
    3822             :     
    3823             :     /* Compute s[1] = r^(-(k+1)^2) = r^(-(k^2 + 2k + 1))*/
    3824         234 :     if (l > 1)
    3825             :       {
    3826             :         /* v[0] + v[1]*sqrt(Delta) still contains rn^k */
    3827         234 :         gfp_ext_sqr_norm1 (s_x[1], s_y[1], v[0], v[1], modulus);
    3828             :         /* Now s[1] = r^(-2k) */
    3829         234 :         gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], s_x[0], s_y[0], Delta, 
    3830             :                      modulus, tmplen, tmp);
    3831             :         /* Now s[1] = r^(-(k^2 + 2k)) */
    3832         234 :         gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], rn_x, rn_y, Delta, 
    3833             :                      modulus, tmplen, tmp);
    3834             :         /* Now s[1] = r^(-(k^2 + 2k + 1)) = r^(-(k+1)^2) */
    3835         234 :         if (test_verbose (OUTPUT_TRACE))
    3836             :           {
    3837             : #ifdef _OPENMP
    3838             : #pragma omp critical
    3839             : #endif
    3840             :             {
    3841           5 :               outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ", 
    3842             :                        k + 1);
    3843           5 :               gfp_ext_print (s_x[1], s_y[1], modulus, OUTPUT_TRACE);
    3844           5 :               outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3845             :             }
    3846             :           }
    3847             :       }
    3848             :     
    3849             :     /* Compute s2[0] = r^(k^2+2) = r^(k^2) * r^2 */
    3850         234 :     gfp_ext_sqr_norm1 (v[0], v[1], rn_x, rn_y, modulus);
    3851         234 :     gfp_ext_mul (s2_x[0], s2_y[0], s_x[0], s_y[0], v[0], v[1], Delta, modulus, 
    3852             :                  tmplen, tmp);
    3853         234 :     if (test_verbose (OUTPUT_TRACE))
    3854             :       {
    3855             : #ifdef _OPENMP
    3856             : #pragma omp critical
    3857             : #endif
    3858             :         {
    3859           5 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ", k);
    3860           5 :           gfp_ext_print (s2_x[0], s2_y[0], modulus, OUTPUT_TRACE);
    3861           5 :           outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3862             :         }
    3863             :       }
    3864             :     
    3865             :     /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
    3866         234 :     gfp_ext_mul (s2_x[1], s2_y[1], s_x[1], s_y[1], v[0], v[1], Delta, modulus, 
    3867             :                  tmplen, tmp);
    3868         234 :     if (test_verbose (OUTPUT_TRACE))
    3869             :       {
    3870             : #ifdef _OPENMP
    3871             : #pragma omp critical
    3872             : #endif
    3873             :         {
    3874           5 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ", 
    3875             :                    k + 1);
    3876           5 :           gfp_ext_print (s2_x[1], s2_y[1], modulus, OUTPUT_TRACE);
    3877           5 :           outputf (OUTPUT_TRACE, " /* PARI C */\n");
    3878             :         }
    3879             :       }
    3880             :     
    3881             :     /* Compute V_2(r + 1/r). Since 1/r = rn_x - rn_y, we have r+1/r = 2*rn_x.
    3882             :        V_2(x) = x^2 - 2, so we want 4*rn_x^2 - 2. */
    3883         234 :     mpres_add (V2, rn_x, rn_x, modulus); /* V2 = r + 1/r  = 2*rn_x */
    3884         234 :     V (v[0], V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (r + 1/r) */
    3885         234 :     V (v[1], V2, 2 * k + 3, modulus);  /* v[1] = V_{2k+3} (r + 1/r) */
    3886         234 :     mpres_sqr (V2, V2, modulus);       /* V2 = 4*a_x^2 */
    3887         234 :     mpres_sub_ui (V2, V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
    3888         234 :     if (test_verbose (OUTPUT_TRACE))
    3889             :       {
    3890             : #ifdef _OPENMP
    3891             : #pragma omp critical
    3892             : #endif
    3893             :         {
    3894           5 :           mpres_get_z (mt, V2, modulus);
    3895           5 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^2 + 1/r^2 == %Zd "
    3896             :                    "/* PARI C */\n", mt);
    3897           5 :           mpres_get_z (mt, v[0], modulus);
    3898           5 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+1) + "
    3899             :                    "1/r^(2*%ld+1) == %Zd /* PARI C */\n", k, k, mt);
    3900           5 :           mpres_get_z (mt, v[1], modulus);
    3901           5 :           outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+3) + "
    3902             :                    "1/r^(2*%ld+3) == %Zd /* PARI C */\n", k, k, mt);
    3903             :         }
    3904             :       }
    3905             :     
    3906         702 :     for (i = 0; i < 2UL && i < l; i++)
    3907             :       {
    3908             :         /* Multiply the 2nd coordinate by Delta, so that after the polynomial
    3909             :            multipoint evaluation we get x1 + Delta*x2 */
    3910         468 :         mpres_mul (s_y[i], s_y[i], Delta, modulus);
    3911         468 :         mpres_mul (s2_y[i], s2_y[i], Delta, modulus);
    3912             :         
    3913         468 :         if (h_x != NULL)
    3914          94 :           mpres_mul_z_to_z (h_x[i + offset], s_x[i], f[i + offset], modulus);
    3915         468 :         if (h_y != NULL)
    3916          94 :           mpres_mul_z_to_z (h_y[i + offset], s_y[i], f[i + offset], modulus);
    3917         468 :         if (h_x_ntt != NULL)
    3918             :           {
    3919         374 :             mpres_mul_z_to_z (mt, s_x[i], f[i + offset], modulus);
    3920         374 :             mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
    3921             :           }
    3922         468 :         if (h_y_ntt != NULL)
    3923             :           {
    3924         374 :             mpres_mul_z_to_z (mt, s_y[i], f[i + offset], modulus);
    3925         374 :             mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
    3926             :           }
    3927             :       }
    3928             :     
    3929             :     /* Compute the remaining r^((k+i)^2) values according to Peter's 
    3930             :        recurrence */
    3931             :     
    3932     1396085 :     for (i = 2; i < l; i++)
    3933             :       {
    3934     1395851 :         if (h_x != NULL || h_x_ntt != NULL)
    3935             :           {
    3936             :             /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i 
    3937             :                taken modulo 2 */
    3938     1395851 :             mpres_mul (s_x[i % 3], s2_x[1 - i % 2], v[i % 2], modulus);
    3939     1395851 :             mpres_sub (s_x[i % 3], s_x[i % 3], s2_x[i % 2], modulus);
    3940             :             
    3941             :             /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
    3942     1395851 :             mpres_mul (s2_x[i % 2], s2_x[1 - i % 2], v[1 - i % 2], modulus);
    3943     1395851 :             mpres_sub (s2_x[i % 2], s2_x[i % 2], s_x[(i - 2) % 3], modulus);
    3944     1395851 :             if (h_x != NULL)
    3945       17850 :               mpres_mul_z_to_z (h_x[i + offset], s_x[i % 3], f[i + offset], 
    3946             :                                 modulus);
    3947     1395851 :             if (h_x_ntt != NULL)
    3948             :               {
    3949     1378001 :                 mpres_mul_z_to_z (mt, s_x[i % 3], f[i + offset], modulus);
    3950     1378001 :                 mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
    3951             :               }
    3952             :           }
    3953             :         
    3954     1395851 :         if (h_y != NULL || h_y_ntt != NULL)
    3955             :           {
    3956             :             /* Same for y coordinate */
    3957     1395851 :             mpres_mul (s_y[i % 3], s2_y[1 - i % 2], v[i % 2], modulus);
    3958     1395851 :             mpres_sub (s_y[i % 3], s_y[i % 3], s2_y[i % 2], modulus);
    3959     1395851 :             mpres_mul (s2_y[i % 2], s2_y[1 - i % 2], v[1 - i % 2], modulus);
    3960     1395851 :             mpres_sub (s2_y[i % 2], s2_y[i % 2], s_y[(i - 2) % 3], modulus);
    3961     1395851 :             if (h_y != NULL)
    3962       17850 :               mpres_mul_z_to_z (h_y[i + offset], s_y[i % 3], f[i + offset], 
    3963             :                                 modulus);
    3964     1395851 :             if (h_y_ntt != NULL)
    3965             :               {
    3966     1378001 :                 mpres_mul_z_to_z (mt, s_y[i % 3], f[i + offset], modulus);
    3967     1378001 :                 mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
    3968             :               }
    3969             :           }
    3970             :         
    3971             :         /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
    3972     1395851 :         mpres_mul (tmp[0], v[1 - i % 2], V2, modulus);
    3973     1395851 :         mpres_sub (v[i % 2], tmp[0], v[i % 2], modulus);
    3974             :       }
    3975             :     
    3976             :     /* Clear the local mpres_t variables */
    3977         702 :     for (i = 0; i < 2; i++)
    3978             :       {
    3979         468 :         mpres_clear (s_x[i], modulus);
    3980         468 :         mpres_clear (s_y[i], modulus);
    3981         468 :         mpres_clear (s2_x[i], modulus);
    3982         468 :         mpres_clear (s2_y[i], modulus);
    3983         468 :         mpres_clear (v[i], modulus);
    3984             :       }
    3985         234 :     mpres_clear (s_x[2], modulus);
    3986         234 :     mpres_clear (s_y[2], modulus);
    3987         234 :     mpres_clear (V2, modulus);
    3988         234 :     mpres_clear (rn_x, modulus);
    3989         234 :     mpres_clear (rn_y, modulus);
    3990         702 :     for (i = 0; i < tmplen; i++)
    3991         468 :       mpres_clear (tmp[i], modulus);
    3992             : 
    3993             :     /* Clear the thread-local copy of modulus */
    3994         234 :     mpmod_clear (modulus);
    3995             : 
    3996         234 :     mpz_clear (mt);
    3997             :   }
    3998             : 
    3999         234 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4000             : 
    4001         234 :   if (h_x != NULL && h_y != NULL && test_verbose (OUTPUT_TRACE))
    4002             :     {
    4003         842 :       for (i = 0; i < l_param; i++)
    4004         841 :         gmp_printf ("/* pp1_sequence_h */ (rn^((k+%lu)^2) * f_%lu) == "
    4005             :                     "(%Zd + Mod(%Zd / Delta, N) * w) /* PARI C */\n", 
    4006         841 :                     i, i, h_x[i], h_y[i]);
    4007             :     }
    4008             : }
    4009             : 
    4010             : 
    4011             : int 
    4012          47 : pp1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus, 
    4013             :         const faststage2_param_t *params)
    4014             : {
    4015             :   unsigned long nr;
    4016             :   unsigned long i, l, lenF, lenH, lenG, lenR, tmplen;
    4017             :   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic 
    4018             :                        progressions of prime length */
    4019             :   set_long_t *S_2; /* This is stored as a regular set */
    4020             :   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has 
    4021             :                   degree s_1. It is symmetric, so has only s_1 / 2 + 1 
    4022             :                   distinct coefficients. The sequence h_j will be stored in 
    4023             :                   the same memory and won't be a monic polynomial, so the 
    4024             :                   leading 1 monomial of F will be stored explicitly. Hence we 
    4025             :                   need s_1 / 2 + 1 entries. */
    4026             : 
    4027             :   listz_t g_x, g_y, fh_x, fh_y, h_x, h_y, tmp, R_x, R_y; 
    4028          47 :   const unsigned long tmpreslen = 2UL;
    4029             :   mpres_t b1_x, b1_y, Delta, tmpres[2];
    4030             :   mpz_t mt;   /* All-purpose temp mpz_t */
    4031          47 :   int youpi = ECM_NO_FACTOR_FOUND;
    4032             :   long timetotalstart, realtotalstart, timestart;
    4033             : 
    4034          47 :   timetotalstart = cputime ();
    4035          47 :   realtotalstart = realtime ();
    4036             : 
    4037          47 :   ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
    4038          47 :   ASSERT_ALWAYS (params->s_1 < params->l);
    4039          47 :   nr = params->l - params->s_1; /* Number of points we evaluate */
    4040             : 
    4041          47 :   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
    4042           0 :       return ECM_ERROR;
    4043             : 
    4044             :   /* Allocate all the memory we'll need */
    4045             :   /* Allocate the correct amount of space for each mpz_t or the 
    4046             :      reallocations will up to double the time for stage 2! */
    4047          47 :   mpz_init (mt);
    4048          47 :   mpres_init (b1_x, modulus);
    4049          47 :   mpres_init (b1_y, modulus);
    4050          47 :   mpres_init (Delta, modulus);
    4051         141 :   for (i = 0; i < tmpreslen; i++)
    4052          94 :       mpres_init (tmpres[i], modulus);
    4053          47 :   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
    4054             :                                      the leading 1 monomial for each factor */
    4055          47 :   lenH = params->s_1 + 1;
    4056          47 :   lenG = params->l;
    4057          47 :   lenR = nr;
    4058          47 :   F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
    4059          47 :   fh_x = init_list2 (lenF, (unsigned int) abs (modulus->bits));
    4060          47 :   fh_y = init_list2 (lenF, (unsigned int) abs (modulus->bits));
    4061          47 :   h_x = malloc (lenH * sizeof (mpz_t));
    4062          47 :   h_y = malloc (lenH * sizeof (mpz_t));
    4063          47 :   if (h_x == NULL || h_y == NULL)
    4064             :     {
    4065           0 :       fprintf (stderr, "Cannot allocate memory in pp1fs2\n");
    4066           0 :       exit (1);
    4067             :     }
    4068          47 :   g_x = init_list2 (lenG, (unsigned int) abs (modulus->bits));
    4069          47 :   g_y = init_list2 (lenG, (unsigned int) abs (modulus->bits));
    4070          47 :   R_x = init_list2 (lenR, (unsigned int) abs (modulus->bits));
    4071          47 :   R_y = init_list2 (lenR, (unsigned int) abs (modulus->bits));
    4072          47 :   tmplen = 3UL * params->l + list_mul_mem (params->l / 2) + 20;
    4073          47 :   outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
    4074          47 :   if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
    4075             :     {
    4076           0 :       tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
    4077             :       /* FIXME: It appears TMulGen_space() returns a too small value! */
    4078           0 :       outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n", 
    4079             :                tmplen);
    4080             :     }
    4081             : 
    4082          47 :   tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
    4083             : 
    4084          47 :   if (test_verbose (OUTPUT_TRACE))
    4085             :     {
    4086           1 :       mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
    4087           1 :       outputf (OUTPUT_TRACE, 
    4088             :                "N = %Zd; X = Mod(%Zd, N); /* PARI */\n", 
    4089           1 :                modulus->orig_modulus, mt);
    4090             :     }
    4091             : 
    4092             :   /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2 k_1}) */
    4093          47 :   outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
    4094             :   
    4095          47 :   timestart = cputime ();
    4096          47 :   i = poly_from_sets_V (F, X, S_1, tmp, tmplen, modulus, NULL, NULL);
    4097          47 :   ASSERT_ALWAYS(2 * i == params->s_1);
    4098             :   ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
    4099          47 :   free (S_1);
    4100          47 :   S_1 = NULL;
    4101             :   
    4102          47 :   outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
    4103          47 :   if (test_verbose (OUTPUT_TRACE))
    4104             :     {
    4105         842 :       for (i = 0; i < params->s_1 / 2 + 1; i++)
    4106         841 :         outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
    4107           1 :       outputf (OUTPUT_TRACE, "f(x) = f_0");
    4108         841 :       for (i = 1; i < params->s_1 / 2 + 1; i++)
    4109         840 :         outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
    4110           1 :       outputf (OUTPUT_TRACE, "/* PARI */ \n");
    4111             :     }
    4112             : 
    4113             :   /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
    4114          47 :   mpres_sqr (Delta, X, modulus);
    4115          47 :   mpres_sub_ui (Delta, Delta, 4UL, modulus);
    4116          47 :   mpres_div_2exp (b1_x, X, 1, modulus);
    4117          47 :   mpres_set_ui (b1_y, 1UL, modulus);
    4118          47 :   mpres_div_2exp (b1_y, b1_y, 1, modulus);
    4119          47 :   if (test_verbose (OUTPUT_TRACE))
    4120             :     {
    4121           1 :       mpres_get_z (mt, Delta, modulus);
    4122           1 :       outputf (OUTPUT_TRACE, 
    4123             :                "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
    4124           1 :       gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
    4125           1 :       outputf (OUTPUT_TRACE, "; /* PARI */\n");
    4126           1 :       outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
    4127             :     }
    4128             : 
    4129             :   /* Compute the h sequence h_j = b1^(P*-j^2) * f_j for 0 <= j <= s_1 */
    4130          47 :   pp1_sequence_h (fh_x, fh_y, NULL, NULL, F, b1_x, b1_y, 0L, 
    4131          47 :                   params->s_1 / 2 + 1, params->P, Delta, modulus, NULL);
    4132             :   /* We don't need F(x) any more */
    4133          47 :   clear_list (F, lenF);
    4134             : 
    4135             :   /* Make a symmetric copy of fh in h. */
    4136       17991 :   for (i = 0; i < params->s_1 / 2 + 1; i++)
    4137             :     {
    4138       17944 :       *(h_x[i]) = *(fh_x[params->s_1 / 2 - i]); /* Clone the mpz_t */
    4139       17944 :       *(h_y[i]) = *(fh_y[params->s_1 / 2 - i]);
    4140             :     }
    4141       17944 :   for (i = 0; i < params->s_1 / 2; i++)
    4142             :     {
    4143       17897 :       *(h_x[i + params->s_1 / 2 + 1]) = *(fh_x[i + 1]);
    4144       17897 :       *(h_y[i + params->s_1 / 2 + 1]) = *(fh_y[i + 1]);
    4145             :     }
    4146          47 :   if (test_verbose (OUTPUT_TRACE))
    4147             :     {
    4148        1682 :       for (i = 0; i < params->s_1 + 1; i++)
    4149        1681 :         outputf (OUTPUT_VERBOSE, "h_%lu = %Zd + %Zd * w; /* PARI */\n", 
    4150        1681 :                  i, h_x[i], h_y[i]);
    4151             :     }
    4152             :   
    4153          68 :   for (l = 0; l < params->s_2; l++)
    4154             :     {
    4155          54 :       const long M = params->l - 1 - params->s_1 / 2;
    4156          54 :       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n", 
    4157          54 :                l + 1, params->s_2);
    4158          54 :       pp1_sequence_g (g_x, g_y, NULL, NULL, b1_x, b1_y, params->P, 
    4159          54 :                       Delta, M, params->l, params->m_1, S_2->elem[l], 
    4160             :                       modulus, NULL);
    4161             :       
    4162             :       /* Do the two convolution products */
    4163          54 :       outputf (OUTPUT_VERBOSE, "TMulGen of g_x and h_x");
    4164          54 :       timestart = cputime ();
    4165          54 :       if (TMulGen (R_x, nr - 1, h_x, params->s_1, g_x, params->l - 1, tmp,
    4166          54 :                    modulus->orig_modulus) < 0)
    4167             :         {
    4168           0 :           outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
    4169             :                    "of memory)\n");
    4170           0 :           youpi = ECM_ERROR;
    4171           0 :           break;
    4172             :         }
    4173          54 :       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
    4174          54 :       outputf (OUTPUT_VERBOSE, "TMulGen of g_y and h_y");
    4175          54 :       timestart = cputime ();
    4176          54 :       if (TMulGen (R_y, nr - 1, h_y, params->s_1, g_y, params->l - 1, tmp,
    4177          54 :                    modulus->orig_modulus) < 0)
    4178             :         {
    4179           0 :           outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
    4180             :                    "of memory)\n");
    4181           0 :           youpi = ECM_ERROR;
    4182           0 :           break;
    4183             :         }
    4184          54 :       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
    4185       43948 :       for (i = 0; i < nr; i++)
    4186       43894 :           mpz_add (R_x[i], R_x[i], R_y[i]);
    4187             :       
    4188          54 :       timestart = cputime ();
    4189          54 :       mpres_set_ui (tmpres[1], 1UL, modulus); /* Accumulate product in 
    4190             :                                                  tmpres[1] */
    4191       43948 :       for (i = 0; i < nr; i++)
    4192             :       {
    4193       43894 :           mpres_set_z_for_gcd (tmpres[0], R_x[i], modulus);
    4194             : #define TEST_ZERO_RESULT
    4195             : #ifdef TEST_ZERO_RESULT
    4196       43894 :           if (mpres_is_zero (tmpres[0], modulus))
    4197          28 :               outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
    4198             : #endif
    4199       43894 :           mpres_mul (tmpres[1], tmpres[1], tmpres[0], modulus); 
    4200             :       }
    4201          54 :       outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)^(1) took %lums\n", 
    4202          54 :                cputime () - timestart);
    4203          54 :       if (test_verbose(OUTPUT_RESVERBOSE))
    4204             :       {
    4205           6 :           mpres_get_z (mt, tmpres[1], modulus);
    4206           6 :           outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", mt);
    4207             :       }
    4208             :       
    4209          54 :       mpres_gcd (mt, tmpres[1], modulus);
    4210          54 :       if (mpz_cmp_ui (mt, 1UL) > 0)
    4211             :         {
    4212          33 :           mpz_set (f, mt);
    4213          33 :           youpi = ECM_FACTOR_FOUND_STEP2;
    4214          33 :           break;
    4215             :         }
    4216             :     }
    4217             : 
    4218          47 :   mpz_clear (mt);
    4219          47 :   mpres_clear (b1_x, modulus);
    4220          47 :   mpres_clear (b1_y, modulus);
    4221          47 :   mpres_clear (Delta, modulus);
    4222         141 :   for (i = 0; i < tmpreslen; i++)
    4223          94 :       mpres_clear (tmpres[i], modulus);
    4224          47 :   clear_list (fh_x, lenF);
    4225          47 :   clear_list (fh_y, lenF);
    4226          47 :   free (h_x);
    4227          47 :   free (h_y);
    4228          47 :   clear_list (g_x, lenG);
    4229          47 :   clear_list (g_y, lenG);
    4230          47 :   clear_list (R_x, lenR);
    4231          47 :   clear_list (R_y, lenR);
    4232          47 :   clear_list (tmp, tmplen);
    4233          47 :   free (S_2);
    4234             :  
    4235          47 :   outputf (OUTPUT_NORMAL, "Step 2");
    4236             :   /* In normal output mode, print only cpu time as we always have.
    4237             :      In verbose mode, print real time as well if we used multi-threading */
    4238          47 :   if (test_verbose (OUTPUT_VERBOSE))
    4239           9 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
    4240             :   else
    4241          38 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
    4242             : 
    4243          47 :   return youpi;
    4244             : }
    4245             : 
    4246             : 
    4247             : int 
    4248         187 : pp1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
    4249             :             const faststage2_param_t *params, const int twopass)
    4250             : {
    4251             :   unsigned long nr;
    4252             :   unsigned long l, lenF;
    4253             :   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic 
    4254             :                        progressions of prime length */
    4255             :   set_long_t *S_2; /* This is stored as a regular set */
    4256             :   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has 
    4257             :                   degree s_1. It is symmetric, so has only s_1 / 2 + 1 
    4258             :                   distinct coefficients. The sequence h_j will be stored in 
    4259             :                   the same memory and won't be a monic polynomial, so the 
    4260             :                   leading 1 monomial of F will be stored explicitly. Hence we 
    4261             :                   need s_1 / 2 + 1 entries. */
    4262         187 :   listz_t R = NULL;  /* Is used only for two-pass convolution, has nr 
    4263             :                         entries. R is only ever referenced if twopass == 1,
    4264             :                         but gcc does not realize that and complains about
    4265             :                         uninitialized value, so we set it to NULL. */
    4266             :   mpzspm_t ntt_context;
    4267             :   mpzspv_t g_x_ntt, g_y_ntt, h_x_ntt, h_y_ntt;
    4268             :   mpres_t b1_x, b1_y, Delta;
    4269             :   mpz_t mt;   /* All-purpose temp mpz_t */
    4270             :   mpz_t product;
    4271         187 :   mpz_t *product_ptr = NULL;
    4272         187 :   int youpi = ECM_NO_FACTOR_FOUND;
    4273             :   long timetotalstart, realtotalstart, timestart, realstart;
    4274             : 
    4275         187 :   timetotalstart = cputime ();
    4276         187 :   realtotalstart = realtime ();
    4277             : 
    4278         187 :   ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
    4279         187 :   ASSERT_ALWAYS (params->s_1 < params->l);
    4280         187 :   nr = params->l - params->s_1; /* Number of points we evaluate */
    4281             : 
    4282         187 :   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
    4283           0 :       return ECM_ERROR;
    4284             :   
    4285         187 :   mpz_init (mt);
    4286             :   
    4287             :   /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution 
    4288             :      with g. We need NTT of transform length l here. If we want to add 
    4289             :      transformed vectors, we need to double the modulus. */
    4290             : 
    4291         187 :   if (twopass)
    4292           1 :     mpz_set (mt, modulus->orig_modulus);
    4293             :   else
    4294         186 :     mpz_mul_2exp (mt, modulus->orig_modulus, 1UL);
    4295             :   
    4296         187 :   ntt_context = mpzspm_init (params->l, mt);
    4297             : 
    4298         187 :   if (ntt_context == NULL)
    4299             :     {
    4300           0 :       outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
    4301             :                "presumably out of memory\n");
    4302           0 :       mpz_clear (mt);
    4303           0 :       free (S_1);
    4304           0 :       S_1 = NULL;
    4305           0 :       free (S_2);
    4306           0 :       S_2 = NULL;
    4307           0 :       return ECM_ERROR;
    4308             :     }
    4309             : 
    4310         187 :   print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ", 
    4311             :                     ntt_context);
    4312             : 
    4313             :   /* Allocate memory for F with correct amount of space for each mpz_t */
    4314         187 :   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
    4315             :                                      the leading 1 monomial for each factor */
    4316             :   {
    4317             :     /* in some cases the above value of lenF is not enough, for example with
    4318             :        s_1 = 10, which gives lenF = 7, but 9 entries are needed */
    4319         187 :     unsigned long mem = mem_poly_from_sets_V (S_1);
    4320         187 :     if (mem > lenF)
    4321          28 :       lenF = mem;
    4322             :   }
    4323             : 
    4324         187 :   F = init_list2 (lenF, (unsigned int) abs (modulus->bits) + GMP_NUMB_BITS);
    4325             :   
    4326             :   /* Build F */
    4327         187 :   if (build_F_ntt (F, X, S_1, params, modulus) == ECM_ERROR)
    4328             :     {
    4329           0 :       free (S_1);
    4330           0 :       free (S_2);
    4331           0 :       mpz_clear (mt);
    4332           0 :       mpzspm_clear (ntt_context);
    4333           0 :       clear_list (F, lenF);
    4334           0 :       return ECM_ERROR;
    4335             :     }
    4336             : 
    4337         187 :   free (S_1);
    4338         187 :   S_1 = NULL;
    4339             :   
    4340         187 :   mpres_init (b1_x, modulus);
    4341         187 :   mpres_init (b1_y, modulus);
    4342         187 :   mpres_init (Delta, modulus);
    4343             : 
    4344             :   /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
    4345         187 :   mpres_sqr (Delta, X, modulus);
    4346         187 :   mpres_sub_ui (Delta, Delta, 4UL, modulus);
    4347         187 :   mpres_div_2exp (b1_x, X, 1, modulus);
    4348         187 :   mpres_set_ui (b1_y, 1UL, modulus);
    4349         187 :   mpres_div_2exp (b1_y, b1_y, 1, modulus);
    4350         187 :   if (test_verbose (OUTPUT_TRACE))
    4351             :     {
    4352           4 :       mpres_get_z (mt, Delta, modulus);
    4353           4 :       outputf (OUTPUT_TRACE, 
    4354             :                "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
    4355           4 :       gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
    4356           4 :       outputf (OUTPUT_TRACE, "; /* PARI */\n");
    4357           4 :       outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
    4358             :     }
    4359             : 
    4360             :   /* Allocate remaining memory for h_ntt */
    4361         187 :   h_x_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
    4362         187 :   h_y_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
    4363             :   /* Compute the h_j sequence */
    4364         187 :   pp1_sequence_h (NULL, NULL, h_x_ntt, h_y_ntt, F, b1_x, b1_y, 0L, 
    4365         187 :                   params->s_1 / 2 + 1, params->P, Delta, modulus, 
    4366             :                   ntt_context);
    4367             :   /* We don't need F(x) any more */
    4368         187 :   clear_list (F, lenF);
    4369             : 
    4370             :   /* compute the forward transform of h and store the distinct coefficients 
    4371             :      in h_ntt */
    4372         187 :   g_x_ntt = mpzspv_init (params->l, ntt_context);
    4373         187 :   if (twopass)
    4374             :     {
    4375           1 :       g_y_ntt = g_x_ntt;
    4376           1 :       R = init_list2 (nr, (mpz_size (modulus->orig_modulus) + 2) *  
    4377             :                           GMP_NUMB_BITS);
    4378             :     }
    4379             :   else
    4380         186 :     g_y_ntt = mpzspv_init (params->l, ntt_context);
    4381             :   
    4382             :   /* Compute DCT-I of h_x and h_y */
    4383         187 :   outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_x");
    4384             : #ifdef _OPENMP
    4385             :   outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    4386             : #endif
    4387         187 :   timestart = cputime ();
    4388         187 :   realstart = realtime ();
    4389         187 :   mpzspv_to_dct1 (h_x_ntt, h_x_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
    4390             :                   g_x_ntt, ntt_context);
    4391         187 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4392             : 
    4393         187 :   outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_y");
    4394             : #ifdef _OPENMP
    4395             :   outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    4396             : #endif
    4397         187 :   timestart = cputime ();
    4398         187 :   realstart = realtime ();
    4399         187 :   mpzspv_to_dct1 (h_y_ntt, h_y_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
    4400             :                   g_x_ntt, ntt_context);
    4401         187 :   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4402             : 
    4403         187 :   if (test_verbose (OUTPUT_RESVERBOSE))
    4404             :     {
    4405          12 :       mpz_init (product);
    4406          12 :       product_ptr = &product;
    4407             :     }
    4408             : 
    4409         297 :   for (l = 0; l < params->s_2; l++)
    4410             :     {
    4411         271 :       const long M = params->l - 1 - params->s_1 / 2;
    4412             : 
    4413         271 :       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n", 
    4414         271 :                l + 1, params->s_2);
    4415         271 :       if (twopass)
    4416             :         {
    4417             :           /* Two-pass variant. Two separate convolutions, 
    4418             :              then addition in Z/NZ */
    4419           5 :           pp1_sequence_g (NULL, NULL, g_x_ntt, NULL, b1_x, b1_y, params->P, 
    4420           5 :                           Delta, M, params->l, params->m_1, S_2->elem[l], 
    4421             :                           modulus, ntt_context);
    4422             : 
    4423             :           /* Do the convolution product of g_x * h_x */
    4424           5 :           outputf (OUTPUT_VERBOSE, "Computing g_x*h_x");
    4425             : #ifdef _OPENMP
    4426             :           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    4427             : #endif
    4428           5 :           timestart = cputime ();
    4429           5 :           realstart = realtime ();
    4430           5 :           mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context, 
    4431             :             NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
    4432             :           /* Store the product coefficients we want in R */
    4433           5 :           mpzspv_to_mpzv (g_x_ntt, params->s_1 / 2, R, nr, ntt_context);
    4434           5 :           print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4435             : 
    4436             :           /* Compute g_y sequence */
    4437           5 :           pp1_sequence_g (NULL, NULL, NULL, g_y_ntt, b1_x, b1_y, params->P, 
    4438           5 :                           Delta, M, params->l, params->m_1, S_2->elem[l], 
    4439             :                           modulus, ntt_context);
    4440             :           
    4441             :           /* Do the convolution product of g_y * (Delta * h_y) */
    4442           5 :           outputf (OUTPUT_VERBOSE, "Computing g_y*h_y");
    4443             : #ifdef _OPENMP
    4444             :           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    4445             : #endif
    4446           5 :           timestart = cputime ();
    4447           5 :           realstart = realtime ();
    4448           5 :           mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context, 
    4449             :             NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
    4450           5 :           print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4451             :           
    4452             :           /* Compute product of sum of coefficients and gcd with N */
    4453           5 :           ntt_gcd (mt, product_ptr, g_y_ntt, params->s_1 / 2, R, nr, 
    4454             :                    ntt_context, modulus);
    4455             :         }
    4456             :       else
    4457             :         {
    4458             :           /* One-pass variant. Two forward transforms and point-wise products,
    4459             :              then addition and single inverse transform */
    4460         266 :           pp1_sequence_g (NULL, NULL, g_x_ntt, g_y_ntt, b1_x, b1_y, params->P, 
    4461         266 :                           Delta, M, params->l, params->m_1, S_2->elem[l], 
    4462             :                           modulus, ntt_context);
    4463             : 
    4464         266 :           outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_x");
    4465             : #ifdef _OPENMP
    4466             :           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    4467             : #endif
    4468         266 :           timestart = cputime ();
    4469         266 :           realstart = realtime ();
    4470         266 :           mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context, 
    4471             :             NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
    4472         266 :           print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4473             :           
    4474         266 :           outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_y");
    4475             : #ifdef _OPENMP
    4476             :           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    4477             : #endif
    4478         266 :           timestart = cputime ();
    4479         266 :           realstart = realtime ();
    4480         266 :           mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context, 
    4481             :             NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
    4482         266 :           print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4483             :           
    4484         266 :           outputf (OUTPUT_VERBOSE, "Adding and computing inverse NTT of sum");
    4485             : #ifdef _OPENMP
    4486             :           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
    4487             : #endif
    4488         266 :           timestart = cputime ();
    4489         266 :           realstart = realtime ();
    4490         266 :           mpzspv_add (g_x_ntt, (spv_size_t) 0, g_x_ntt, (spv_size_t) 0, 
    4491         266 :                       g_y_ntt, (spv_size_t) 0, params->l, ntt_context);
    4492         266 :           mpzspv_mul_by_dct (g_x_ntt, NULL, params->l, ntt_context, 
    4493             :             NTT_MUL_STEP_IFFT);
    4494         266 :           print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
    4495             :           
    4496         266 :           ntt_gcd (mt, product_ptr, g_x_ntt, params->s_1 / 2, NULL, nr, 
    4497             :                    ntt_context, modulus);
    4498             :         }
    4499             :       
    4500         271 :       outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
    4501             : 
    4502         271 :       if (mpz_cmp_ui (mt, 1UL) > 0)
    4503             :         {
    4504         161 :           mpz_set (f, mt);
    4505         161 :           youpi = ECM_FACTOR_FOUND_STEP2;
    4506         161 :           break;
    4507             :         }
    4508             :     }
    4509             : 
    4510         187 :   if (test_verbose (OUTPUT_RESVERBOSE))
    4511             :     {
    4512          12 :       product_ptr = NULL;
    4513          12 :       mpz_clear (product);
    4514             :     }
    4515         187 :   mpzspv_clear (g_x_ntt, ntt_context);
    4516         187 :   if (twopass)
    4517           1 :     clear_list (R, nr);
    4518             :   else
    4519         186 :     mpzspv_clear (g_y_ntt, ntt_context);
    4520         187 :   mpzspv_clear (h_x_ntt, ntt_context);
    4521         187 :   mpzspv_clear (h_y_ntt, ntt_context);
    4522         187 :   mpzspm_clear (ntt_context);
    4523         187 :   mpz_clear (mt);
    4524         187 :   mpres_clear (b1_x, modulus);
    4525         187 :   mpres_clear (b1_y, modulus);
    4526         187 :   mpres_clear (Delta, modulus);
    4527         187 :   free (S_2);
    4528             :  
    4529         187 :   outputf (OUTPUT_NORMAL, "Step 2");
    4530             :   /* In normal output mode, print only cpu time as we always have.
    4531             :      In verbose mode, print real time as well if we used multi-threading */
    4532         187 :   if (test_verbose (OUTPUT_VERBOSE))
    4533          21 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
    4534             :   else
    4535         166 :     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
    4536             : 
    4537         187 :   return youpi;
    4538             : }

Generated by: LCOV version 1.14