Line data Source code
1 : /* Implementation of fast stage 2 for P-1 and P+1 as described in
2 : "Improved Stage 2 to $P\pm{}1$ Factoring Algorithms" by
3 : Peter L. Montgomery and Alexander Kruppa, ANTS 2008 (8th Algorithmic
4 : Number Theory Symposium).
5 :
6 : Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2016
7 : Alexander Kruppa, Paul Zimmermann, David Cleaver
8 : NTT functions are based on code Copyright 2005 Dave Newman.
9 :
10 : This file is part of the ECM Library.
11 :
12 : The ECM Library is free software; you can redistribute it and/or modify
13 : it under the terms of the GNU Lesser General Public License as published by
14 : the Free Software Foundation; either version 3 of the License, or (at your
15 : option) any later version.
16 :
17 : The ECM Library is distributed in the hope that it will be useful, but
18 : WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
19 : or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
20 : License for more details.
21 :
22 : You should have received a copy of the GNU Lesser General Public License
23 : along with the ECM Library; see the file COPYING.LIB. If not, see
24 : http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
25 : 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
26 :
27 : #include "config.h"
28 : #include <stdio.h>
29 : #include <stdlib.h>
30 : #include <limits.h>
31 : #include "ecm-impl.h"
32 : #include "sp.h"
33 : #include <math.h>
34 : #ifdef HAVE_ALLOCA_H
35 : #include <alloca.h>
36 : #endif
37 : #ifdef HAVE_STRING_H
38 : #include <string.h>
39 : #endif
40 : #ifdef _OPENMP
41 : #include <omp.h>
42 : #endif
43 :
44 : /* TODO:
45 : - move functions into their proper files (i.e. NTT functions etc.)
46 : - later: allow storing NTT vectors on disk
47 : */
48 :
49 : /* Define TEST_ZERO_RESULT to test if any result of the multipoint
50 : evaluation is equal to zero. If the modulus is composite, this
51 : happening might indicate a problem in the evalutaion code */
52 : #define TEST_ZERO_RESULT
53 :
54 : const int pari = 0;
55 :
56 : const unsigned long Pvalues[] = {
57 : 3UL, 5UL, 9UL, 15UL, 21UL, 17UL, 27UL, 33UL, 45UL, 51UL, 63UL, 75UL,
58 : 105UL, 99UL, 135UL, 165UL, 195UL, 189UL, 231UL, 255UL, 315UL, 345UL,
59 : 357UL, 375UL, 405UL, 435UL, 525UL, 585UL, 615UL, 735UL, 765UL, 825UL,
60 : 945UL, 1155UL, 1065UL, 1365UL, 1305UL, 1335UL, 1575UL, 1785UL, 1995UL,
61 : 2145UL, 2205UL, 2415UL, 2625UL, 2805UL, 3045UL, 3465UL, 3675UL, 4095UL,
62 : 4305UL, 4515UL, 4725UL, 4785UL, 5355UL, 5775UL, 5985UL, 5865UL, 6825UL,
63 : 7245UL, 8085UL, 8925UL, 9555UL, 10395UL, 10725UL, 11025UL, 12285UL,
64 : 12705UL, 15015UL, 14175UL, 15225UL, 16065UL, 17325UL, 19635UL, 21945UL,
65 : 23205UL, 24255UL, 25935UL, 26775UL, 28875UL, 31395UL, 33495UL, 35805UL,
66 : 36465UL, 38115UL, 39585UL, 40425UL, 45045UL, 45885UL, 49665UL, 51765UL,
67 : 58905UL, 65835UL, 69615UL, 75075UL, 77805UL, 82005UL, 84315UL, 86625UL,
68 : 88935UL, 94185UL, 98175UL, 105105UL, 109725UL, 116025UL, 118755UL,
69 : 121275UL, 135135UL, 137445UL, 137655UL, 144375UL, 153615UL, 165165UL,
70 : 167475UL, 176715UL, 179025UL, 185955UL, 197505UL, 208845UL, 215985UL,
71 : 225225UL, 255255UL, 250635UL, 285285UL, 277095UL, 294525UL, 315315UL,
72 : 345345UL, 373065UL, 368445UL, 405405UL, 435435UL, 451605UL, 465465UL,
73 : 454545UL, 504735UL, 525525UL, 555555UL, 569415UL, 596505UL, 645645UL,
74 : 647955UL, 672945UL, 687225UL, 765765UL, 770385UL, 805035UL, 855855UL,
75 : 858585UL, 915915UL, 945945UL, 962115UL, 1036035UL, 1066065UL, 1119195UL,
76 : 1156155UL, 1276275UL, 1306305UL, 1354815UL, 1426425UL, 1456455UL,
77 : 1514205UL, 1576575UL, 1666665UL, 1726725UL, 1786785UL, 1789515UL,
78 : 1865325UL, 1996995UL, 1983135UL, 2177175UL, 2297295UL, 2327325UL,
79 : 2417415UL, 2567565UL, 2611455UL, 2807805UL, 2847075UL, 2878785UL,
80 : 3048045UL, 3161235UL, 3258255UL, 3357585UL, 3401475UL, 3533145UL,
81 : 3828825UL, 3918915UL, 3985905UL, 4279275UL, 4849845UL, 4789785UL,
82 : 4967655UL, 5180175UL, 5360355UL, 5870865UL, 5990985UL, 6561555UL,
83 : 6531525UL, 6891885UL, 7402395UL, 7912905UL, 8273265UL, 8580495UL,
84 : 8843835UL, 9444435UL, 10015005UL, 10465455UL, 10705695UL, 10885875UL,
85 : 11696685UL, 12267255UL, 12507495UL, 12785955UL, 13498485UL, 14549535UL,
86 : 14849835UL, 15570555UL, 16111095UL, 16291275UL, 17612595UL, 18123105UL,
87 : 18633615UL, 19684665UL, 20255235UL, 20825805UL, 22207185UL, 22717695UL,
88 : 24249225UL, 24819795UL, 25741485UL, 26531505UL, 28333305UL, 29354325UL,
89 : 30045015UL, 31396365UL, 32807775UL, 33948915UL, 33528495UL, 34879845UL,
90 : 37011975UL, 37522485UL, 39564525UL, 41096055UL, 43648605UL, 44219175UL,
91 : 45930885UL, 47222175UL, 48333285UL, 50075025UL, 51816765UL, 52777725UL,
92 : 55390335UL, 55547415UL, 59053995UL, 60063465UL, 61906845UL, 64579515UL,
93 : 66621555UL, 67492425UL, 70105035UL, 73258185UL, 74939865UL, 77224455UL,
94 : 79594515UL, 81876795UL, 84999915UL, 88062975UL, 91005915UL, 94189095UL,
95 : 98423325UL, 101846745UL, 111546435UL, 111035925UL, 115120005UL,
96 : 121246125UL, 124098975UL, 130945815UL, 140645505UL, 150345195UL,
97 : 150225075UL, 155450295UL, 158333175UL, 170255085UL, 179444265UL,
98 : 190285095UL, 198843645UL, 203408205UL, 206831625UL, 217222005UL,
99 : 229474245UL, 240705465UL, 252447195UL, 254999745UL, 269023755UL,
100 : 282146865UL, 287672385UL, 294076965UL, 306110805UL, 318302985UL,
101 : 334639305UL, 344338995UL, 354038685UL, 363738375UL, 373438065UL,
102 : 387221835UL, 400254855UL, 421936515UL, 431636205UL, 451035585UL,
103 : 453888435UL, 470434965UL, 480134655UL, 510765255UL, 522506985UL,
104 : 557732175UL, 570855285UL, 596530935UL, 610224615UL, 627912285UL,
105 : 654729075UL, 703227525UL, 722116395UL, 751725975UL, 780825045UL,
106 : 790524735UL, 821665845UL, 851275425UL, 863017155UL, 909984075UL,
107 : 936020085UL, 984518535UL, 1017041025UL, 1052416365UL
108 : #if (ULONG_MAX > 4294967295)
109 : ,1086110025UL, 1110614505UL, 1147371225UL, 1191785595UL, 1213887675UL,
110 : 1265809545UL, 1282356075UL, 1331995665UL, 1391905515UL, 1450103655UL,
111 : 1479202725UL, 1547100555UL, 1555088535UL, 1673196525UL, 1712565855UL,
112 : 1767130365UL, 1830673845UL, 1883166285UL, 1954487535UL, 2001964965UL,
113 : 2119382265UL, 2187280095UL, 2255177925UL, 2342475135UL, 2390973585UL,
114 : 2421213795UL, 2555868315UL, 2672264595UL, 2788660875UL, 2856558705UL,
115 : 2953555605UL, 3050552505UL, 3234846615UL, 3457939485UL, 3516137625UL,
116 : 3681032355UL, 3758629875UL, 3904125225UL, 4127218095UL, 4360010655UL,
117 : 4573403835UL, 4796496705UL, 4844995155UL, 5019589575UL, 5203883685UL,
118 : 5262081825UL, 5465775315UL, 5766465705UL, 5898837945UL, 6164152995UL,
119 : 6358146795UL, 6411780375UL, 6804332535UL, 6980458485UL, 7172920755UL,
120 : 7473611145UL, 7716103395UL, 7968295335UL, 8182259085UL, 8342499165UL,
121 : 8812168365UL, 9023519505UL, 9704539845UL, 9927632715UL, 10373818455UL,
122 : 10439434005UL, 10820004195UL, 11043097065UL, 11489282805UL,
123 : 11877270405UL, 12381654285UL, 12604747155UL, 13080031965UL,
124 : 13274025765UL, 13642613985UL, 14389490115UL, 14583483915UL,
125 : 15058768725UL, 15611651055UL, 16174233075UL, 16397325945UL,
126 : 17289697425UL, 17735883165UL, 18143270145UL, 18381678315UL,
127 : 19074440385UL, 19559424885UL, 20636090475UL, 20941375455UL,
128 : 21800053275UL, 22643926305UL, 23148310185UL, 24205576395UL,
129 : 24546777255UL, 25544133615UL, 26389538175UL, 26863291455UL,
130 : 27813861075UL, 29113619535UL, 29494189725UL, 30520074585UL,
131 : 30684969315UL, 31790733975UL, 33575476935UL, 34467848415UL,
132 : 35202742575UL, 36427185795UL, 38037334335UL, 39240095895UL,
133 : 40365259935UL, 42053005995UL, 43168470345UL, 44953213305UL,
134 : 45845584785UL, 48522699225UL, 50307442185UL, 51869092275UL,
135 : 53653835235UL, 54546206715UL, 56680138515UL, 58784971245UL,
136 : 59386352025UL, 61908271425UL, 63431122755UL, 65700850215UL,
137 : 67931778915UL, 70162707615UL, 72616729185UL, 74120181135UL,
138 : 75740029365UL, 78417143805UL, 80871165375UL, 82840202445UL,
139 : 86448487125UL, 88466022645UL, 91133437395UL, 92918180355UL,
140 : 100280245065UL, 100726430805UL, 102811864155UL, 106749938295UL,
141 : 109000266375UL, 113219631525UL, 119689324755UL, 121027881975UL,
142 : 127943760945UL, 132628711215UL, 134859639915UL, 141775518885UL,
143 : 148691397855UL, 150922326555UL, 155607276825UL, 161320394235UL,
144 : 164977177365UL, 171446870595UL, 177470378085UL, 183270792705UL
145 : #endif
146 : };
147 :
148 : /* All the prime factors that can appear in eulerphi(P) */
149 : const unsigned long phiPfactors[] = {2UL, 3UL, 5UL, 7UL, 11UL, 13UL,
150 : 17UL, 19UL};
151 :
152 :
153 : /* Some useful PARI functions:
154 : sumset(a,b) = {local(i, j, l); l = listcreate (length(a) * length(b)); for (i = 1, length(a), for (j = 1, length(b), listput(l, a[i] + b[j]))); listsort (l, 1); l}
155 :
156 : V(i,X) = { if (i==0, return(2)); if (i==1, return(X)); if(i%2 == 0, return (V (i/2, X)^2-2)); return (V ((i+1)/2, X) * V ((i-1)/2, X) - X)}
157 :
158 : U(i,X) = { if (i==0, return(0)); if (i==1, return(1)); if(i%2 == 0, return (U (i/2, X) * V(i/2,X))); return (V ((i+1)/2, X) *U( (i-1)/2, X) + 1)}
159 : */
160 :
161 : #ifndef _OPENMP
162 5628 : static int omp_get_num_threads () {return 1;}
163 5628 : static int omp_get_thread_num () {return 0;}
164 : #endif
165 :
166 : static void
167 : ntt_sqr_reciprocal (mpzv_t, const mpzv_t, mpzspv_t, const spv_size_t,
168 : const mpzspm_t);
169 :
170 : static void
171 4406 : print_elapsed_time (int verbosity, long cpu_start,
172 : ATTRIBUTE_UNUSED long real_start)
173 : {
174 : #ifdef _OPENMP
175 : if (real_start != 0L)
176 : {
177 : outputf (verbosity, " took %lums (%lums real)\n",
178 : elltime (cpu_start, cputime()),
179 : elltime (real_start, realtime()));
180 : return;
181 : }
182 : #endif
183 4406 : outputf (verbosity, " took %lums\n", elltime (cpu_start, cputime()));
184 4406 : }
185 :
186 :
187 : static void
188 748 : print_CRT_primes (const int verbosity, const char *prefix,
189 : const mpzspm_t ntt_context)
190 : {
191 748 : double modbits = 0.;
192 : unsigned int i;
193 :
194 748 : if (test_verbose (verbosity))
195 : {
196 42 : outputf (verbosity, "%s%lu", prefix, ntt_context->spm[0]->sp);
197 42 : modbits += log ((double) ntt_context->spm[0]->sp);
198 1150 : for (i = 1; i < ntt_context->sp_num; i++)
199 : {
200 1108 : outputf (verbosity, " * %lu", ntt_context->spm[i]->sp);
201 1108 : modbits += log ((double) ntt_context->spm[i]->sp);
202 : }
203 42 : outputf (verbosity, ", has %d primes, %f bits\n",
204 : ntt_context->sp_num, modbits / log (2.));
205 : }
206 748 : }
207 :
208 : /* Approximate amount of memory in bytes each coefficient in an NTT takes
209 : so that NTT can do transforms up to length lmax with modulus, or
210 : with 2*modulus if twice != 0 */
211 : static size_t
212 243 : ntt_coeff_mem (const unsigned long lmax, const mpz_t modulus, const int twice)
213 : {
214 : mpz_t t;
215 : size_t n;
216 :
217 243 : mpz_init (t);
218 243 : mpz_mul (t, modulus, modulus);
219 243 : mpz_mul_ui (t, t, lmax);
220 243 : if (twice)
221 21 : mpz_mul_2exp (t, t, 1UL);
222 : /* +4: +1 for rounding up, +3 for extra words due to ECRT */
223 243 : n = (mpz_sizeinbase (t, 2) - 1) / SP_NUMB_BITS + 4;
224 243 : mpz_clear (t);
225 243 : return n * sizeof (sp_t);
226 : }
227 :
228 : size_t
229 267 : pm1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
230 : const int use_ntt)
231 : {
232 267 : if (use_ntt)
233 : {
234 : /* We store lmax / 2 + 1 coefficients for the DCT-I of F and lmax
235 : coefficients for G in NTT ready format. Each coefficient in
236 : NTT-ready format occupies approx.
237 : ceil(log(lmax*modulus^2)/log(bits per sp_t)) + 3 words. */
238 :
239 : size_t n;
240 :
241 205 : n = ntt_coeff_mem (lmax, modulus, 0) * (size_t) (3 * lmax / 2 + 1);
242 205 : outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
243 : "with lmax = %lu NTT is %lu bytes\n", lmax, n);
244 205 : return n;
245 : }
246 : else
247 : {
248 : /* F stores s_1/2 residues,
249 : h stores s_1 mpz_t structs (residues get cloned from F)
250 : g stores lmax residues,
251 : R stores lmax-s_1 residues,
252 : and tmp stores 3*lmax+list_mul_mem (lmax / 2) residues.
253 : Assume s_1 is close to lmax/2.
254 : Then we have
255 : lmax/4 + lmax/2 + lmax + lmax/2 + 3*lmax + list_mul_mem (lmax / 2)
256 : = (5+1/4)*lmax + list_mul_mem (lmax / 2) residues, plus s_1 mpz_t.
257 : */
258 :
259 : size_t n;
260 :
261 62 : n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
262 62 : n *= 5 * lmax + lmax / 4 + list_mul_mem (lmax / 2);
263 62 : n += lmax / 2 * sizeof (mpz_t);
264 : /* Memory use due to temp space allocation in TMulKS appears to
265 : approximately triple the estimated memory use. This is hard to
266 : estimate precisely, so let's go with the fudge factor of 3 here */
267 62 : n *= 3;
268 62 : outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
269 : "with lmax = %lu is %lu bytes\n", lmax, n);
270 62 : return n;
271 : }
272 : }
273 :
274 : /* return the possible lmax for given memory use and modulus */
275 :
276 : unsigned long
277 15 : pm1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt)
278 : {
279 15 : if (use_ntt)
280 : {
281 5 : size_t n, lmax = 1;
282 :
283 5 : n = ntt_coeff_mem (lmax, modulus, 0);
284 5 : lmax = 1UL << ceil_log2 (memory / n / 3);
285 5 : return lmax;
286 : }
287 : else
288 : {
289 : size_t lmax, n;
290 :
291 10 : n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
292 :
293 : /* Guess an initial value of lmax for list_mul_mem (lmax / 2) */
294 : /* memory = n * 25/4 * lmax + lmax / 2 * sizeof (mpz_t); */
295 : /* Fudge factor of 3 for TMulKS as above */
296 10 : lmax = memory / (3 * 25 * n / 4 + 3 * sizeof (mpz_t) / 2);
297 10 : return lmax;
298 : }
299 : }
300 :
301 : size_t
302 21 : pp1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
303 : const int use_ntt, const int twopass)
304 : {
305 : size_t n, m;
306 :
307 21 : m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
308 21 : if (use_ntt)
309 : {
310 : /* In one pass mode, we store h_x_ntt and h_y_ntt, each of length
311 : lmax/2(+1), and g_x_ntt and g_y_ntt, each of length lmax, all in
312 : NTT ready format. In two pass mode, we store h_x_ntt, h_y_ntt and
313 : g_x_ntt as before, plus R which is lmax - s_1 mpz_t.
314 : We assume s_1 ~= lmax/2.
315 : */
316 :
317 11 : n = ntt_coeff_mem (lmax, modulus, !twopass);
318 11 : if (twopass)
319 1 : return lmax * (2 * n + m / 2);
320 : else
321 10 : return lmax * 3 * n;
322 : }
323 : else
324 : {
325 : /* We allocate:
326 : F: s_1/2 coefficients
327 : fh_x, fh_y: s_1/2 coefficients
328 : h_x, h_y: s_1 mpz_t's (cloned from fh_x and fh_y)
329 : g_x, g_y: lmax coefficients
330 : R_x, R_y: lmax - s_1 coefficients
331 : tmp: 3UL * lmax + list_mul_mem (lmax / 2)
332 : Assuming s_1 ~ lmax/2, that's
333 : lmax/2 + 2*lmax/4 + 2*lmax + 2*lmax/2 * 3*lmax +
334 : list_mul_mem (lmax / 2) =
335 : 7 + list_mul_mem (lmax / 2) coefficients and lmax mpz_t.
336 : */
337 :
338 10 : n = m * (7 * lmax + list_mul_mem (lmax / 2));
339 10 : n += lmax * sizeof (mpz_t);
340 10 : n = 5 * n / 2; /* A fudge factor again */
341 10 : return n;
342 : }
343 : }
344 :
345 : unsigned long
346 43 : pp1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt,
347 : const int twopass)
348 : {
349 : size_t n, m;
350 :
351 43 : m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
352 43 : if (use_ntt)
353 : {
354 22 : n = ntt_coeff_mem (1, modulus, !twopass);
355 22 : if (twopass)
356 11 : n = memory / (2 * n + m / 2);
357 : else
358 11 : n = memory / (3 * n);
359 22 : return 1UL << (ceil_log2 (n / 2)); /* Rounded down to power of 2 */
360 : }
361 : else
362 : {
363 21 : return memory / 5 / (m * 8 + sizeof (mpz_t)) * 2;
364 : }
365 : }
366 :
367 :
368 : /* Test if for given P, nr, B2min and B2 we can choose an m_1 so that the
369 : stage 2 interval [B2min, B2] is covered. The effective B2min and B2
370 : are stored in effB2min and effB2 */
371 :
372 : static int
373 1050536 : test_P (const mpz_t B2min, const mpz_t B2, mpz_t m_1, const unsigned long P,
374 : const unsigned long nr, mpz_t effB2min, mpz_t effB2)
375 : {
376 : mpz_t m;
377 : /* We need B2min >= 2 * max(S_1 + S_2) + (2*m_1 - 1)*P + 1, or
378 : B2min - 2 * max(S_1 + S_2) - 1 >= (2*m_1)*P - P, or
379 : (B2min - 2*max(S_1 + S_2) + P - 1)/(2P) >= m_1
380 : Choose m_1 accordingly */
381 :
382 1050536 : mpz_init (m);
383 1050536 : sets_max (m, P);
384 1050536 : mpz_mul_2exp (m, m, 1UL); /* m = 2*max(S_1 + S_2) */
385 :
386 1050536 : mpz_sub (m_1, B2min, m);
387 1050536 : mpz_sub_ui (m_1, m_1, 1UL); /* m_1 = B2min - 2*max(S_1 + S_2) - 1 */
388 1050536 : mpz_add_ui (m_1, m_1, P);
389 1050536 : mpz_fdiv_q_2exp (m_1, m_1, 1UL);
390 1050536 : mpz_fdiv_q_ui (m_1, m_1, P); /* 2UL*P may overflow */
391 :
392 : /* Compute effB2min = 2 * max(S_1 + S_2) + (2*(m_1 - 1) + 1)*P + 1 */
393 :
394 1050536 : mpz_mul_2exp (effB2min, m_1, 1UL);
395 1050536 : mpz_sub_ui (effB2min, effB2min, 1UL);
396 1050536 : mpz_mul_ui (effB2min, effB2min, P);
397 1050536 : mpz_add (effB2min, effB2min, m);
398 1050536 : mpz_add_ui (effB2min, effB2min, 1UL);
399 1050536 : ASSERT_ALWAYS (mpz_cmp (effB2min, B2min) <= 0);
400 :
401 : /* Compute the smallest value coprime to P at the high end of the stage 2
402 : interval that will not be covered:
403 : 2*(min(S_1 + S_2)) + (2*(m_1 + nr) + 1)*P.
404 : We assume min(S_1 + S_2) = -max(S_1 + S_2) */
405 1050536 : mpz_add_ui (effB2, m_1, nr);
406 1050536 : mpz_mul_2exp (effB2, effB2, 1UL);
407 1050536 : mpz_add_ui (effB2, effB2, 1UL);
408 1050536 : mpz_mul_ui (effB2, effB2, P);
409 1050536 : mpz_sub (effB2, effB2, m);
410 :
411 : /* The effective B2 values is that value, minus 1 */
412 1050536 : mpz_sub_ui (effB2, effB2, 1UL);
413 :
414 1050536 : mpz_clear (m);
415 1050536 : return (mpz_cmp (B2, effB2) <= 0);
416 : }
417 :
418 :
419 : static void
420 1052843 : factor_phiP (int *exponents, const unsigned long phiP)
421 : {
422 1052843 : const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
423 1052843 : unsigned long cofactor = phiP;
424 : int i;
425 :
426 1052843 : ASSERT_ALWAYS (phiP > 0UL);
427 :
428 9475587 : for (i = 0; i < nrprimes; i++)
429 15008152 : for (exponents[i] = 0; cofactor % phiPfactors[i] == 0UL; exponents[i]++)
430 6585408 : cofactor /= phiPfactors[i];
431 :
432 1052843 : ASSERT_ALWAYS (cofactor == 1UL);
433 1052843 : }
434 :
435 :
436 : static unsigned long
437 214580208 : pow_ul (const unsigned long b, const unsigned int e)
438 : {
439 214580208 : unsigned long r = 1UL;
440 : unsigned int i;
441 :
442 327259471 : for (i = 0; i < e; i++)
443 112679263 : r *= b;
444 :
445 214580208 : return r;
446 : }
447 :
448 : static unsigned long
449 53604626 : absdiff_ul (unsigned long a, unsigned long b)
450 : {
451 53604626 : return (a > b) ? a - b : b - a;
452 : }
453 :
454 : /* Choose s_1 so that s_1 * s_2 = phiP, s_1 is positive and even,
455 : s_2 >= min_s2 and s_2 is minimal and abs(s_1 - l) is minimal
456 : under those conditions. If use_ntt == 1, we require s_1 < l.
457 : Returns 0 if no such choice is possible */
458 :
459 : static unsigned long
460 1052843 : choose_s_1 (const unsigned long phiP, const unsigned long min_s2,
461 : const unsigned long l, const int use_ntt)
462 : {
463 1052843 : const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
464 : /* Using [nrprimes] here makes the compiler complain about variable-sized
465 : arrays */
466 : int phiPexponents[sizeof (phiPfactors) / sizeof (unsigned long)],
467 : exponents[sizeof (phiPfactors) / sizeof (unsigned long)];
468 1052843 : unsigned long s_1 = 0UL, s_2 = 0UL, trys_1;
469 : int i;
470 :
471 1052843 : ASSERT_ALWAYS (phiP > 0 && phiP % 2 == 0);
472 :
473 : /* We want only even s_1. We divide one 2 out of phiP here... */
474 1052843 : factor_phiP (phiPexponents, phiP / 2);
475 9475587 : for (i = 0; i < nrprimes; i++)
476 8422744 : exponents[i] = 0;
477 :
478 : do {
479 26822526 : trys_1 = 2; /* ... and add a 2 here */
480 241402734 : for (i = 0; i < nrprimes; i++)
481 214580208 : trys_1 *= pow_ul (phiPfactors[i], exponents[i]);
482 : #if 0
483 : printf ("choose_s_1: Trying trys_1 = %lu\n", trys_1);
484 : #endif
485 : /* See if it satisfies all the required conditions and is an
486 : improvement over the previous choice */
487 26822526 : if (phiP / trys_1 >= min_s2 &&
488 26802313 : (s_2 == 0UL || phiP / trys_1 < s_2) &&
489 37656228 : absdiff_ul (trys_1, l) < absdiff_ul (s_1, l) &&
490 3502374 : (use_ntt == 0 || trys_1 < l))
491 : {
492 : #if 0
493 : printf ("choose_s_1: New best s_1 for phiP = %lu, min_s2 = %lu, "
494 : "l = %lu : %lu\n", phiP, min_s2, l, trys_1);
495 : #endif
496 10828870 : s_1 = trys_1;
497 : }
498 40553699 : for (i = 0; i < nrprimes; i++)
499 : {
500 39500856 : if (++(exponents[i]) <= phiPexponents[i])
501 25769683 : break;
502 13731173 : exponents[i] = 0;
503 : }
504 26822526 : } while (i < nrprimes);
505 :
506 1052843 : return s_1;
507 : }
508 :
509 :
510 : /* Approximate cost of stage 2. Cost with and without ntt are not
511 : comparable. We have l > s_1 and s_1 * s_2 = eulerphi(P), hence
512 : s_2*l > eulerphi(P) and so cost (s_2, l) > eulerphi(P) for all P */
513 : static unsigned long
514 873981 : est_cost (const unsigned long s_2, const unsigned long l, const int use_ntt,
515 : const int method)
516 : {
517 873981 : if (method == ECM_PM1)
518 : {
519 : /* The time for building f, h and DCT-I of h seems to be about
520 : 7/6 of the time of computing g, h*g and gcd with NTT, and
521 : 3/2 of the time of computing g, h*g and gcd without NTT */
522 :
523 691077 : if (use_ntt)
524 161788 : return (7 * l) / 6 + s_2 * l;
525 : else
526 529289 : return (3 * l) / 2 + s_2 * l;
527 : }
528 182904 : else if (method == ECM_PP1)
529 : {
530 : /* Building f is the same, building h and its forward transform is
531 : twice about as expensive as for P-1. Each multi-point evaluation
532 : is twice as expensive as for P-1.
533 : FIXME: The estimate for NTT assumes the "one-pass" variant, in
534 : "two-pass" the multipoint evaluations are slower, so the optimum
535 : shifts towards smaller s_2 some more */
536 182904 : if (use_ntt)
537 128002 : return (4 * l) / 5 + s_2 * l;
538 : else
539 54902 : return (3 * l) / 4 + s_2 * l;
540 : }
541 : else
542 0 : abort (); /* Invalid value for method */
543 : }
544 :
545 : /* Choose P so that a stage 2 range from B2min to B2 can be covered with
546 : multipoint evaluations, each using a convolution of length at most lmax.
547 : The parameters for stage 2 are stored in finalparams, the final effective
548 : B2min and B2 values in final_B2min and final_B2, respecively. Each of these
549 : may be NULL, in which case the value is not stored. It is permissible
550 : to let B2min and final_B2min, or B2 and final_B2 point at the same mpz_t. */
551 :
552 : long
553 736 : choose_P (const mpz_t B2min, const mpz_t B2, const unsigned long lmax,
554 : const unsigned long min_s2, faststage2_param_t *finalparams,
555 : mpz_t final_B2min, mpz_t final_B2, const int use_ntt,
556 : const int method)
557 : {
558 : /* Let S_1 + S_2 == (Z/PZ)* (mod P).
559 :
560 : Let F(x) = \prod_{k_1 \in S_1} (x - b_1^{2 k_1}).
561 :
562 : If we evaluate F(b_1^{2 k_2 + (2m + 1)P}) for all k_2 \in S_2 with
563 : m_1 <= m < m_1+nr, we test all exponents 2 k_2 + (2m + 1)P - 2 k_1.
564 : The largest value coprime to P at the low end of the stage 2 interval
565 : *not* covered will be
566 : 2*max(S_2) + (2*(m_1-1) + 1)*P - 2*min(S_1).
567 : The smallest value at the high end not covered will be
568 : 2*min(S_2) + (2*(m_1 + nr) + 1)*P - 2*max(S_1).
569 : Assume S_1 and S_2 are symmetric around 0, so that max(S_1) = -min(S_1).
570 : Then the largest ... is:
571 : 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P
572 : The smallest ... is:
573 : -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P
574 : The effective B2min = 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P + 1
575 : The effective B2max = -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P - 1
576 :
577 : Then the difference effB2max - effB2min =
578 : -4*(max(S_1) + max(S_2)) + 2P*(nr + 1) - 2
579 :
580 : We obviously require B2max - B2min <= 2*nr*P
581 : Since nr < lmax, B2max - B2min <= 2*lmax*P or
582 : P >= ceil((B2max - B2min)/(2*lmax))
583 :
584 : Hence we are looking for an odd P with s_1 * s_2 = eulerphi(P) so that
585 : s_1 ~= lmax / 2 and the whole stage 2 interval is covered. s_2 should
586 : be small, as long as s_1 is small enough.
587 : */
588 :
589 : mpz_t B2l, m_1, effB2min, tryeffB2, effB2, lmin;
590 : /* The best parameters found so far, P == 0 means that no suitable P
591 : has been found yet: */
592 736 : unsigned long P = 0, s_1 = 0, s_2 = 0, l = 0, cost = 0;
593 : unsigned int i;
594 736 : const unsigned int Pvalues_len = sizeof (Pvalues) / sizeof (unsigned long);
595 : int r;
596 :
597 736 : outputf (OUTPUT_TRACE,
598 : "choose_P(B2min = %Zd, B2 = %Zd, lmax = %lu, min_s2 = %ld, "
599 : "use_ntt = %d, method = %d\n",
600 : B2min, B2, lmax, min_s2, use_ntt, method);
601 :
602 736 : if (mpz_cmp (B2, B2min) < 0)
603 29 : return 0L;
604 :
605 : /* If we use the NTT, we allow only power-of-two transform lengths.
606 : In that case, the code below assumes that lmax is a power of two.
607 : If that is not the case, print error and return. */
608 707 : if (use_ntt && (lmax & (lmax - 1UL)) != 0)
609 : {
610 0 : outputf (OUTPUT_ERROR,
611 : "choose_P: Error, lmax = %lu is not a power of two\n", lmax);
612 0 : return ECM_ERROR;
613 : }
614 :
615 707 : mpz_init (effB2);
616 707 : mpz_init (tryeffB2);
617 707 : mpz_init (effB2min);
618 707 : mpz_init (B2l);
619 707 : mpz_init (m_1);
620 707 : mpz_init (lmin);
621 :
622 707 : mpz_sub (B2l, B2, B2min);
623 707 : mpz_add_ui (B2l, B2l, 1UL); /* +1 due to closed interval */
624 :
625 : /* For each candidate P, check if [B2min, B2] can be covered at all,
626 : and if so, what the best parameters (minimizing the cost, maximizing
627 : effB2) are. If they are better than the best parameters for the best P
628 : so far, remember them. */
629 :
630 347137 : for (i = 0 ; i < Pvalues_len; i++)
631 : {
632 : unsigned long tryP, tryphiP, trys_1, trys_2, tryl, trycost;
633 :
634 346430 : tryP = Pvalues[i];
635 346430 : tryphiP = eulerphi (tryP);
636 :
637 346430 : outputf (OUTPUT_TRACE,
638 : "choose_P: trying P = %lu, eulerphi(P) = %lu\n", tryP, tryphiP);
639 :
640 : /* If we have a good P already and this tryphiP >= cost, then
641 : there's no hope for this tryP, since cost(s_2, l) > eulerphi(P) */
642 346430 : if (P != 0 && tryphiP >= cost)
643 : {
644 301707 : outputf (OUTPUT_TRACE,
645 : "choose_P: tryphiP > cost = %lu, this P is too large\n",
646 : cost);
647 301707 : continue;
648 : }
649 :
650 : /* We have nr < l and effB2-effB2min <= 2*nr*P. Hence we need
651 : l >= B2l/P/2 */
652 44723 : mpz_cdiv_q_ui (lmin, B2l, tryP);
653 44723 : mpz_cdiv_q_2exp (lmin, lmin, 1UL);
654 44723 : outputf (OUTPUT_TRACE, "choose_P: lmin = %Zd for P = %lu\n", lmin, tryP);
655 44723 : if (mpz_cmp_ui (lmin, lmax) > 0)
656 : {
657 5859 : outputf (OUTPUT_TRACE,
658 : "choose_P: lmin > lmax, this P is too small\n");
659 5859 : continue;
660 : }
661 :
662 : /* Try all possible transform lengths and store parameters in
663 : P, s_1, s_2, l if they are better than the previously best ones */
664 :
665 : /* Keep reducing tryl to find best parameters. For NTT, we only have
666 : power of 2 lengths so far, so we can simply divide by 2.
667 : For non-NTT, we have arbitrary transform lengths so we can decrease
668 : in smaller steps... let's say by, umm, 25% each time? */
669 1091707 : for (tryl = lmax; mpz_cmp_ui (lmin, tryl) <= 0;
670 1052843 : tryl = (use_ntt) ? tryl / 2 : 3 * tryl / 4)
671 : {
672 1052843 : trys_1 = choose_s_1 (tryphiP, min_s2, tryl / 2, use_ntt);
673 1052843 : if (trys_1 == 0)
674 : {
675 3004 : outputf (OUTPUT_TRACE,
676 : "choose_P: could not choose s_1 for P = %lu, l = %lu\n",
677 : tryP, tryl);
678 3004 : continue;
679 : }
680 : ASSERT (tryphiP % trys_1 == 0UL);
681 1049839 : trys_2 = tryphiP / trys_1;
682 1049839 : outputf (OUTPUT_TRACE, "choose_P: chose s_1 = %lu, k = s_2 = %lu "
683 : "for P = %lu, l = %lu\n", trys_1, trys_2, tryP, tryl);
684 :
685 1049839 : if (test_P (B2min, B2, m_1, tryP, tryl - trys_1, effB2min, tryeffB2))
686 : {
687 1021837 : outputf (OUTPUT_TRACE,
688 : "choose_P: P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu "
689 : "works, m_1 = %Zd, effB2min = %Zd, effB2 = %zZd\n",
690 : tryP, tryl, trys_1, trys_2, m_1, effB2min, tryeffB2);
691 :
692 : /* do not allow B2min to be less than 0 */
693 1021837 : if (mpz_cmp_ui(effB2min, 0) < 0)
694 147856 : continue;
695 :
696 : /* We use these parameters if we
697 : 1. didn't have any suitable ones yet, or
698 : 2. these cover [B2min, B2] and are cheaper than the best
699 : ones so far, or
700 : 3. they are as expensive but reach greater effB2. */
701 873981 : trycost = est_cost (trys_2, tryl, use_ntt, method);
702 : ASSERT (tryphiP < trycost);
703 873981 : if (P == 0 || trycost < cost ||
704 24814 : (trycost == cost && mpz_cmp (tryeffB2, effB2) > 0))
705 : {
706 43179 : outputf (OUTPUT_TRACE,
707 : "choose_P: and is the new optimum (cost = %lu)\n",
708 : trycost);
709 43179 : P = tryP;
710 43179 : s_1 = trys_1;
711 43179 : s_2 = trys_2;
712 43179 : l = tryl;
713 43179 : cost = trycost;
714 43179 : mpz_set (effB2, tryeffB2);
715 : }
716 : }
717 : }
718 : }
719 :
720 707 : if (P != 0) /* If we found a suitable P */
721 : {
722 : /* Compute m_1, effB2min, effB2 again */
723 697 : r = test_P (B2min, B2, m_1, P, l - s_1, effB2min, effB2);
724 697 : ASSERT_ALWAYS(r != 0);
725 697 : if (finalparams != NULL)
726 : {
727 697 : finalparams->P = P;
728 697 : finalparams->s_1 = s_1;
729 697 : finalparams->s_2 = s_2;
730 697 : finalparams->l = l;
731 697 : mpz_set (finalparams->m_1, m_1);
732 : }
733 697 : if (final_B2min != NULL)
734 697 : mpz_set (final_B2min, effB2min);
735 697 : if (final_B2 != NULL)
736 697 : mpz_set (final_B2, effB2);
737 : }
738 :
739 707 : mpz_clear (effB2);
740 707 : mpz_clear (tryeffB2);
741 707 : mpz_clear (effB2min);
742 707 : mpz_clear (B2l);
743 707 : mpz_clear (m_1);
744 707 : mpz_clear (lmin);
745 :
746 707 : return (P != 0) ? (long) P : ECM_ERROR;
747 : }
748 :
749 :
750 :
751 : static void
752 21255 : list_output_poly (listz_t l, unsigned long len, int monic, int symmetric,
753 : char *prefix, char *suffix, int verbosity)
754 : {
755 : unsigned long i;
756 :
757 21255 : if (prefix != NULL)
758 21255 : outputf (verbosity, prefix);
759 :
760 21255 : if (len == 0)
761 : {
762 0 : if (monic)
763 0 : outputf (verbosity, "1\n", len, len);
764 : else
765 0 : outputf (verbosity, "0\n", len);
766 0 : return;
767 : }
768 :
769 21255 : if (monic)
770 : {
771 0 : if (symmetric)
772 0 : outputf (verbosity, "(x^%lu + x^-%lu) + ", len, len);
773 : else
774 0 : outputf (verbosity, "x^%lu + ", len);
775 : }
776 14177972 : for (i = len - 1; i > 0; i--)
777 14156717 : if (symmetric)
778 14156717 : outputf (verbosity, "%Zd * (x^%lu + x^-%lu) + ", l[i], i, i);
779 : else
780 0 : outputf (verbosity, "%Zd * x^%lu + ", l[i], i);
781 21255 : outputf (verbosity, "%Zd", l[0]);
782 21255 : if (suffix != NULL)
783 21255 : outputf (verbosity, suffix);
784 : }
785 :
786 : #if 0
787 : /* Multiply P[i] by r^{k(deg-i)}, for 0 <= i <= deg. Needs 3 entries in tmp. */
788 : /* I.e., let P(x) = x^deg + \sum_{i=0}^{deg - 1} P[i] * x^i. The output is
789 : R(x) = x^deg + \sum_{i=0}^{deg - 1} R[i] * x^i = r^(k deg) P(r^{-k} x). */
790 : /* The input and output polynomials are monic and have the leading monomial
791 : implicit, i.e. not actually stored in the array of coefficients. */
792 : /* Returns 0 if a modular inversion failed (in which case R is left
793 : unchanged), 1 otherwise */
794 :
795 : static int
796 : list_scale_rev (listz_t R, listz_t S, mpz_t r, long k, unsigned long deg,
797 : mpz_t modulus, listz_t tmp,
798 : ATTRIBUTE_UNUSED const unsigned long tmplen)
799 : {
800 : unsigned long i;
801 :
802 : ASSERT (tmplen >= 3);
803 : mpz_powm_ui (tmp[0], r, (unsigned long) labs (k), modulus);
804 : if (k < 0)
805 : {
806 : if (!mpz_invert (tmp[0], tmp[0], modulus)) /* FIXME: get rid of this! */
807 : return 0;
808 : }
809 : /* Here, tmp[0] = r^k */
810 : mpz_set (tmp[1], tmp[0]);
811 : /* mpz_set (R[deg], S[deg]); Leading monomial is not stored! */
812 : for (i = 1; i + 1 <= deg; i++)
813 : {
814 : /* Here, tmp[1] = r^(ki) */
815 : mpz_mul (tmp[2], S[deg-i], tmp[1]);
816 : mpz_mod (R[deg-i], tmp[2], modulus);
817 : mpz_mul (tmp[2], tmp[1], tmp[0]); /* FIXME, avoid unnecessary mul */
818 : mpz_mod (tmp[1], tmp[2], modulus); /* at end of loop */
819 : }
820 : if (i <= deg)
821 : {
822 : mpz_mul (tmp[2], S[deg-i], tmp[1]);
823 : mpz_mod (R[deg-i], tmp[2], modulus);
824 : }
825 :
826 : return 1;
827 : }
828 : #endif
829 :
830 : /* Same, but does squaring which makes things easier */
831 :
832 : static void
833 1278 : list_sqr_reciprocal (listz_t R, listz_t S, const unsigned long l,
834 : mpz_t modulus, listz_t tmp,
835 : ATTRIBUTE_UNUSED const unsigned long tmplen)
836 : {
837 : unsigned long i;
838 1278 : listz_t Srev, r1 = tmp, r2 = tmp + 2 * l - 1, t = tmp + 4 * l - 2;
839 :
840 1278 : if (l == 0UL)
841 0 : return;
842 :
843 : /* FIXME: This modifies the input arguments. */
844 : /* We have to divide S[0] by 2 */
845 :
846 : ASSERT (tmplen >= 4 * l - 2 + list_mul_mem (l));
847 :
848 : #if 0
849 : gmp_printf ("/* list_sqr_reciprocal */ S(x) = %Zd", S[0]);
850 : for (i = 1; i < l1; i++)
851 : gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S[i], i, i);
852 : gmp_printf ("\n");
853 : #endif
854 :
855 1278 : if (mpz_odd_p (S[0]))
856 : {
857 616 : ASSERT_ALWAYS (mpz_odd_p (modulus));
858 616 : mpz_add (S[0], S[0], modulus);
859 : }
860 1278 : mpz_tdiv_q_2exp (S[0], S[0], 1UL);
861 :
862 1278 : list_mul (r1, S, l, S, l, 0, t);
863 : /* r1 = f0*g0/4 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
864 : #if 0
865 : for (i = 0; i < 2UL * l - 1UL; i++)
866 : gmp_printf ("list_sqr_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
867 : #endif
868 :
869 1278 : Srev = (listz_t) malloc (l * sizeof (mpz_t));
870 1278 : ASSERT_ALWAYS (Srev != NULL);
871 114827 : for (i = 0UL; i < l; i++)
872 113549 : (*Srev)[i] = (*S)[l - 1UL - i];
873 1278 : list_mul (r2, S, l, Srev, l, 0, t);
874 : /* r2 is symmetric, r2[i] = r2[2*l - 2 - i]. Check this */
875 : #if 0
876 : for (i = 0; 0 && i < 2UL * l - 1UL; i++)
877 : gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
878 : #endif
879 : #ifdef WANT_ASSERT
880 : for (i = 0UL; i < l; i++)
881 : ASSERT (mpz_cmp (r2[i], r2[2UL * l - 2UL - i]) == 0);
882 : #endif
883 1278 : free (Srev);
884 : /* r2 = g1*f0/2 + (g0*f0/4 + g1*f1) * x + g0*f1/2 * x^2 */
885 : #if 0
886 : for (i = 0; i < 2UL * l - 1UL; i++)
887 : gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
888 : #endif
889 :
890 1278 : mpz_mul_2exp (r1[0], r1[0], 1UL);
891 : /* r1 = f0*g0/2 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
892 114827 : for (i = 0UL; i < l; i++)
893 : {
894 113549 : mpz_mul_2exp (r2[l - i - 1UL], r2[l - i - 1UL], 1UL);
895 113549 : mpz_add (R[i], r1[i], r2[l - i - 1UL]);
896 : }
897 : /* r1 = 3/4*f0*g0 + g1*f1 + (f0*g1 + 2*f1*g0)/2 * x + f1*g1 * x^2 */
898 : /* r1 = f0*g0 + 2*g1*f1 + (f0*g1 + f1*g0) * x + f1*g1 * x^2 */
899 113549 : for (i = l; i < 2UL * l - 1UL; i++)
900 112271 : mpz_set (R[i], r1[i]);
901 :
902 1278 : if (R != S)
903 0 : mpz_mul_2exp (S[0], S[0], 1UL);
904 :
905 : #if 0
906 : for (i = 0; i < 2UL * l; i++)
907 : gmp_printf ("list_sqr_reciprocal: R[%lu] = %Zd\n", i, R[i]);
908 : #endif
909 : }
910 :
911 : #ifdef WANT_ASSERT
912 : static void
913 : list_recip_eval1 (mpz_t R, const listz_t S, const unsigned long l)
914 : {
915 : unsigned long i;
916 :
917 : mpz_set_ui (R, 0UL);
918 : for (i = 1; i < l; i++)
919 : mpz_add (R, R, S[i]);
920 : mpz_mul_2exp (R, R, 1UL);
921 : if (l > 0UL)
922 : mpz_add (R, R, S[0]);
923 : }
924 : #endif
925 :
926 : /* Multiply two reciprocal polynomials of degree 2*l1-2 and 2*l2-2, resp.,
927 : with coefficients in standard basis
928 :
929 : S_1(x) = S1[0] + sum_{1 \leq i \leq l1 - 1} S1[i] (x^i + x^{-i})
930 : S_2(x) = S2[0] + sum_{1 \leq i \leq l2 - 1} S2[i] (x^i + x^{-i})
931 :
932 : to the reciprocal polynomial of degree 2*(l1 + l2) - 4
933 :
934 : R(x) = R[0] + sum_{1 \leq i \leq l1 + l2 - 2} R[i] (x^i + x^{-i})
935 : = S_1(x) * S_2(x)
936 :
937 : R == S1 == S2 is permissible, however if S1 == S2, l1 must be equal
938 : to l2 (i.e. the multiplication must be a squaring)
939 : */
940 : /* FIXME: This modifies the input arguments. */
941 : /* We have to divide S1[0] and S2[0] by 2 */
942 :
943 : static void
944 1457 : list_mul_reciprocal (listz_t R, listz_t S1, unsigned long l1,
945 : listz_t S2, unsigned long l2,
946 : mpz_t modulus, listz_t tmp,
947 : ATTRIBUTE_UNUSED const unsigned long tmplen)
948 : {
949 : unsigned long i;
950 1457 : const unsigned long lmax = MAX(l1, l2);
951 1457 : listz_t r1 = tmp, r2 = tmp + 2*lmax - 1, rev = tmp + 4*lmax - 2,
952 1457 : t = tmp + 6*lmax - 3;
953 : #ifdef WANT_ASSERT
954 : mpz_t sum1, sum2, prod;
955 : #endif
956 :
957 : ASSERT (S1 < tmp || S1 >= tmp + tmplen);
958 : ASSERT (S2 < tmp || S2 >= tmp + tmplen);
959 : ASSERT (R < tmp || R >= tmp + tmplen);
960 :
961 1457 : if (l1 == 0UL || l2 == 0UL)
962 0 : return;
963 :
964 1457 : if (S1 == S2)
965 : {
966 0 : ASSERT_ALWAYS (l1 == l2);
967 0 : list_sqr_reciprocal (R, S1, l1, modulus, tmp, tmplen);
968 0 : return;
969 : }
970 :
971 : ASSERT (tmplen >= 6*lmax - 3 + list_mul_mem (lmax));
972 : #ifdef WANT_ASSERT
973 : mpz_init (sum1);
974 : mpz_init (sum2);
975 : mpz_init (prod);
976 : list_recip_eval1 (sum1, S1, l1);
977 : list_recip_eval1 (sum2, S2, l2);
978 : mpz_mul (prod, sum1, sum2);
979 : mpz_mod (prod, prod, modulus);
980 : #endif
981 :
982 :
983 : /* Make S1 the longer of the two, i.e. l1 >= l2 */
984 1457 : if (l2 > l1)
985 : {
986 972 : listz_t St = S1;
987 972 : unsigned long lt = l1;
988 972 : S1 = S2;
989 972 : S2 = St;
990 972 : l1 = l2;
991 972 : l2 = lt;
992 : }
993 :
994 : #if 0
995 : gmp_printf ("/* list_mul_reciprocal */ S1(x) = %Zd", S1[0]);
996 : for (i = 1; i < l1; i++)
997 : gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S1[i], i, i);
998 : gmp_printf ("\n");
999 : gmp_printf ("/* list_mul_reciprocal */ S2(x) = %Zd", S2[0]);
1000 : for (i = 1; i < l1; i++)
1001 : gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S2[i], i, i);
1002 : gmp_printf ("\n");
1003 : #endif
1004 :
1005 : /* Divide S1[0] and S2[0] by 2 */
1006 1457 : if (mpz_odd_p (S1[0]))
1007 : {
1008 727 : ASSERT_ALWAYS (mpz_odd_p (modulus));
1009 727 : mpz_add (S1[0], S1[0], modulus);
1010 : }
1011 1457 : mpz_tdiv_q_2exp (S1[0], S1[0], 1UL);
1012 :
1013 1457 : if (mpz_odd_p (S2[0]))
1014 : {
1015 718 : ASSERT_ALWAYS (mpz_odd_p (modulus));
1016 718 : mpz_add (S2[0], S2[0], modulus);
1017 : }
1018 1457 : mpz_tdiv_q_2exp (S2[0], S2[0], 1UL);
1019 :
1020 : /* Pad rev with zeros */
1021 12456 : for (i = l2; i < lmax; i++)
1022 10999 : mpz_set_ui (rev[i], 0UL);
1023 :
1024 14278 : for (i = 0UL; i < l2; i++)
1025 12821 : mpz_set (rev[i], S2[l2 - 1UL - i]);
1026 1457 : list_mul (r1, S1, lmax, rev, lmax, 0, t);
1027 : /* r1 = \tilde{f}(x) \rev(\tilde{g}(x)) and has degree l1 + l2 - 2,
1028 : i.e. l1 + l2 - 1 entries. */
1029 : #if 0
1030 : for (i = 0; i < 2 * lmax - 1; i++)
1031 : gmp_printf ("list_mul_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
1032 : #endif
1033 :
1034 14278 : for (i = 0UL; i < l2; i++)
1035 12821 : mpz_set(rev[i], S2[i]);
1036 1457 : list_mul (r2, S1, lmax, rev, lmax, 0, t);
1037 : /* \tilde{f}(x) \tilde{g}(x) */
1038 :
1039 : #if 0
1040 : for (i = 0; i < 2 * lmax - 1; i++)
1041 : gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
1042 : #endif
1043 :
1044 : /* Add f_0*g_0 by doubling the f_0*g_0 term in r2 */
1045 1457 : mpz_mul_2exp (r2[0], r2[0], 1UL);
1046 :
1047 : /* Add \flloor x^{-d_g} \tilde{f}(x) \rev(\tilde{g}(x)) \rfloor.
1048 : d_g = l2 - 1. */
1049 25277 : for (i = 0; i < l1; i++)
1050 23820 : mpz_add (r2[i], r2[i], r1[i + l2 - 1]);
1051 :
1052 : /* Add \floor x^{-d_f} rev(\tilde{f}(x) \rev(\tilde{g}(x))) \rfloor.
1053 : d_f = l1 - 1. rev(r2)[i] = r2[l1 + l2 - 2 - i]. We want
1054 : rev(r2)[l1 - 1 ... l1 + l2 - 2], hence
1055 : r2[l2 - 1 ... 0] */
1056 14278 : for (i = 0; i < l2; i++)
1057 12821 : mpz_add (r2[i], r2[i], r1[l2 - 1 - i]);
1058 :
1059 : #if 0
1060 : for (i = 0; i < l1 + l2 - 1; i++)
1061 : gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
1062 : #endif
1063 :
1064 1457 : mpz_mul_2exp (S1[0], S1[0], 1UL);
1065 1457 : mpz_mul_2exp (S2[0], S2[0], 1UL);
1066 :
1067 36641 : for (i = 0; i < l1 + l2 - 1; i++)
1068 35184 : mpz_set (R[i], r2[i]);
1069 :
1070 : #if 0
1071 : for (i = 0; i < l1 + l2 - 1; i++)
1072 : gmp_printf ("list_mul_reciprocal: R[%lu] = %Zd\n", i, R[i]);
1073 : #endif
1074 : #ifdef WANT_ASSERT
1075 : list_recip_eval1 (sum1, R, l1 + l2 - 1);
1076 : mpz_mod (sum1, sum1, modulus);
1077 : ASSERT (mpz_cmp (prod, sum1) == 0);
1078 : mpz_clear (sum1);
1079 : mpz_clear (sum2);
1080 : mpz_clear (prod);
1081 : #endif
1082 : }
1083 :
1084 : /*
1085 : Computes V_k(S), where the Chebyshev polynomial V_k(X) is defined by
1086 : V_k(X + 1/X) = X^k + 1/X^k
1087 : */
1088 :
1089 : static void
1090 12684 : V (mpres_t R, const mpres_t S, const long k, mpmod_t modulus)
1091 : {
1092 : mpres_t V0, Vi, Vi1;
1093 : unsigned long j, uk;
1094 : int po2;
1095 :
1096 12684 : if (k == 0L)
1097 : {
1098 2814 : mpres_set_ui (R, 2UL, modulus);
1099 9207 : return;
1100 : }
1101 :
1102 9870 : uk = labs (k);
1103 :
1104 9870 : if (uk == 1UL)
1105 : {
1106 3098 : mpres_set (R, S, modulus);
1107 3098 : return;
1108 : }
1109 :
1110 11951 : for (po2 = 0; uk % 2UL == 0UL; uk >>= 1, po2++);
1111 :
1112 6772 : mpres_init (V0, modulus);
1113 6772 : mpres_set_ui (V0, 2UL, modulus); /* V0 = V_0(S) = 2 */
1114 :
1115 6772 : if (uk == 1UL)
1116 : {
1117 3295 : mpres_set (R, S, modulus);
1118 6590 : while (po2-- > 0)
1119 : {
1120 3295 : mpres_sqr (R, R, modulus);
1121 3295 : mpres_sub (R, R, V0, modulus);
1122 : }
1123 3295 : mpres_clear (V0, modulus);
1124 3295 : return;
1125 : }
1126 :
1127 : if (0)
1128 : {
1129 : mpz_t tz;
1130 : mpz_init (tz);
1131 : mpres_get_z (tz, S, modulus);
1132 : gmp_printf ("Chebyshev_V(%ld, Mod(%Zd,N)) == ", k, tz);
1133 : mpz_clear (tz);
1134 : }
1135 :
1136 30071 : for (j = 1UL; j <= uk / 2UL; j <<= 1);
1137 :
1138 3477 : mpres_init (Vi, modulus);
1139 3477 : mpres_init (Vi1, modulus);
1140 :
1141 : /* i = 1. Vi = V_i(S), Vi1 = V_{i+1}(S) */
1142 3477 : mpres_set (Vi, S, modulus);
1143 3477 : mpres_sqr (Vi1, S, modulus);
1144 3477 : mpres_sub (Vi1, Vi1, V0, modulus);
1145 3477 : j >>= 1;
1146 :
1147 26594 : while (j > 1)
1148 : {
1149 23117 : if ((uk & j) != 0UL)
1150 : {
1151 : /* i' = 2i + 1.
1152 : V_{i'} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1
1153 : V_{i'+1} = V_{2i + 2} = {V_{i+1}}^2 - V_0. */
1154 11789 : mpres_mul (Vi, Vi, Vi1, modulus);
1155 11789 : mpres_sub (Vi, Vi, S, modulus);
1156 11789 : mpres_sqr (Vi1, Vi1, modulus);
1157 11789 : mpres_sub (Vi1, Vi1, V0, modulus);
1158 : }
1159 : else
1160 : {
1161 : /* i' = 2i.
1162 : V_{i'} = V_{2i} = {V_i}^2 - V0.
1163 : V_{i'+1} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1 */
1164 11328 : mpres_mul (Vi1, Vi, Vi1, modulus);
1165 11328 : mpres_sub (Vi1, Vi1, S, modulus);
1166 :
1167 11328 : mpres_sqr (Vi, Vi, modulus);
1168 11328 : mpres_sub (Vi, Vi, V0, modulus);
1169 : }
1170 23117 : j >>= 1;
1171 : }
1172 :
1173 : /* Least significant bit of uk is always 1 */
1174 3477 : mpres_mul (Vi, Vi, Vi1, modulus);
1175 3477 : mpres_sub (Vi, Vi, S, modulus);
1176 :
1177 5361 : while (po2-- > 0)
1178 : {
1179 1884 : mpres_sqr (Vi, Vi, modulus);
1180 1884 : mpres_sub (Vi, Vi, V0, modulus);
1181 : }
1182 :
1183 3477 : mpres_set (R, Vi, modulus);
1184 :
1185 3477 : mpres_clear (Vi, modulus);
1186 3477 : mpres_clear (Vi1, modulus);
1187 3477 : mpres_clear (V0, modulus);
1188 :
1189 : if (0)
1190 : {
1191 : mpz_t tz;
1192 : mpz_init (tz);
1193 : mpres_get_z (tz, R, modulus);
1194 : gmp_printf ("%Zd\n", tz);
1195 : mpz_clear (tz);
1196 : }
1197 : }
1198 :
1199 : /*
1200 : Computes U_k(S), where the Chebyshev polynomial U_k(X) is defined by
1201 : U_k(X + 1/X) = (X^k - 1/X^k) / (X - 1/X)
1202 : If R1 != NULL, stores U_{k+1}(S) there
1203 : */
1204 :
1205 : static void
1206 2814 : U (mpres_t R, mpres_t R1, const mpres_t S, const long k, mpmod_t modulus)
1207 : {
1208 : mpres_t V0, Vi, Vi1, Ui, Ui1, t;
1209 : unsigned long j, uk;
1210 :
1211 2814 : if (k == 0L)
1212 : {
1213 2814 : mpres_set_ui (R, 0UL, modulus); /* U_0 = 0 */
1214 2814 : if (R1 != NULL)
1215 2814 : mpres_set_ui (R1, 1UL, modulus); /* U_1 = 1 */
1216 2814 : return;
1217 : }
1218 :
1219 0 : uk = labs (k);
1220 :
1221 0 : if (uk == 1UL)
1222 : {
1223 0 : mpres_set_ui (R, 1UL, modulus);
1224 0 : if (k == -1)
1225 0 : mpres_neg (R, R, modulus);
1226 :
1227 0 : if (R1 != NULL)
1228 : {
1229 0 : if (k == -1)
1230 0 : mpres_set_ui (R1, 0UL, modulus);
1231 : else
1232 0 : mpres_set (R1, S, modulus); /* U_2(S) = S */
1233 : }
1234 :
1235 0 : return;
1236 : }
1237 :
1238 : if (0)
1239 : {
1240 : mpz_t tz;
1241 : mpz_init (tz);
1242 : mpres_get_z (tz, S, modulus);
1243 : gmp_printf ("Chebyshev_U(%ld, Mod(%Zd,N)) == ", k, tz);
1244 : mpz_clear (tz);
1245 : }
1246 :
1247 0 : mpres_init (V0, modulus);
1248 0 : mpres_init (Vi, modulus);
1249 0 : mpres_init (Vi1, modulus);
1250 0 : mpres_init (Ui, modulus);
1251 0 : mpres_init (Ui1, modulus);
1252 0 : mpres_init (t, modulus);
1253 :
1254 0 : for (j = 1UL; j <= uk / 2UL; j <<= 1);
1255 :
1256 0 : mpres_set_ui (Ui, 1UL, modulus); /* Ui = U_1(S) = 1 */
1257 0 : mpres_set (Ui1, S, modulus); /* Ui1 = U_2(S) = S */
1258 0 : mpres_add (V0, Ui, Ui, modulus); /* V0 = V_0(S) = 2 */
1259 0 : mpres_set (Vi, S, modulus); /* Vi = V_1(S) = S */
1260 0 : mpres_sqr (Vi1, Vi, modulus);
1261 0 : mpres_sub (Vi1, Vi1, V0, modulus); /* Vi1 = V_2(S) = S^2 - 2 */
1262 0 : j >>= 1; /* i = 1 */
1263 :
1264 0 : while (j != 0)
1265 : {
1266 0 : if ((uk & j) == 0UL)
1267 : {
1268 0 : mpres_mul (Vi1, Vi1, Vi, modulus);
1269 0 : mpres_sub (Vi1, Vi1, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
1270 : /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
1271 0 : mpres_sub (t, Ui1, Ui, modulus);
1272 0 : mpres_add (Ui1, Ui1, Ui, modulus);
1273 0 : mpres_mul (Ui1, Ui1, t, modulus);
1274 0 : mpres_mul (Ui, Ui, Vi, modulus); /* U_{2n} = U_n V_n */
1275 0 : mpres_sqr (Vi, Vi, modulus);
1276 0 : mpres_sub (Vi, Vi, V0, modulus); /* V_{2n} = V_n^2 - 2 */
1277 : }
1278 : else
1279 : {
1280 : /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
1281 0 : mpres_sub (t, Ui1, Ui, modulus);
1282 0 : mpres_add (Ui, Ui, Ui1, modulus);
1283 0 : mpres_mul (Ui, Ui, t, modulus);
1284 0 : mpres_mul (Ui1, Ui1, Vi1, modulus); /* U_{2n+2} = U_{n+1} V_{n+1} */
1285 0 : mpres_mul (Vi, Vi, Vi1, modulus);
1286 0 : mpres_sub (Vi, Vi, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
1287 0 : mpres_sqr (Vi1, Vi1, modulus);
1288 0 : mpres_sub (Vi1, Vi1, V0, modulus); /* V_{2n+2} = V_{n+1}^2 - 2 */
1289 : }
1290 0 : j >>= 1;
1291 : }
1292 :
1293 0 : if (k > 0)
1294 0 : mpres_set (R, Ui, modulus);
1295 : else
1296 0 : mpres_neg (R, Ui, modulus);
1297 :
1298 0 : if (R1 != NULL)
1299 : {
1300 : /* Here k != -1,0,1, so k+1 is negative iff k is */
1301 0 : if (k > 0)
1302 0 : mpres_set (R1, Ui1, modulus);
1303 : else
1304 0 : mpres_neg (R1, Ui1, modulus);
1305 : }
1306 :
1307 0 : mpres_clear (V0, modulus);
1308 0 : mpres_clear (Vi, modulus);
1309 0 : mpres_clear (Vi1, modulus);
1310 0 : mpres_clear (Ui, modulus);
1311 0 : mpres_clear (Ui1, modulus);
1312 0 : mpres_clear (t, modulus);
1313 :
1314 : if (0)
1315 : {
1316 : mpz_t tz;
1317 : mpz_init (tz);
1318 : mpres_get_z (tz, R, modulus);
1319 : gmp_printf ("%Zd\n", tz);
1320 : mpz_clear (tz);
1321 : }
1322 : }
1323 :
1324 :
1325 : /* Set R[i] = V_{i+k}(Q) * F[i] or U_{i+k}(Q) * F[i], for 0 <= i < len
1326 : We compute V_{i+k+1}(Q) by V_{i+k}(Q)*V_1(Q) - V_{i+k-1}(Q).
1327 : For U, we compute U_{i+k+1}(Q) by U_{i+k}(Q)*V_1(Q) - U_{i+k-1}(Q).
1328 : The values of V_1(Q), V_{k-1}(Q) and V_k(Q) and V_k(Q) are in
1329 : V1, Vk_1 and Vk, resp.
1330 : The values of Vk_1 and Vk are clobbered. */
1331 : static void
1332 5628 : scale_by_chebyshev (listz_t R, const listz_t F, const unsigned long len,
1333 : mpmod_t modulus, const mpres_t V1, mpres_t Vk_1,
1334 : mpres_t Vk)
1335 : {
1336 : mpres_t Vt;
1337 : unsigned long i;
1338 :
1339 5628 : mpres_init (Vt, modulus);
1340 :
1341 3139702 : for (i = 0; i < len; i++)
1342 : {
1343 3134074 : mpres_mul_z_to_z (R[i], Vk, F[i], modulus);
1344 3134074 : mpres_mul (Vt, Vk, V1, modulus);
1345 3134074 : mpres_sub (Vt, Vt, Vk_1, modulus);
1346 3134074 : mpres_set (Vk_1, Vk, modulus); /* Could be a swap */
1347 3134074 : mpres_set (Vk, Vt, modulus); /* Could be a swap */
1348 : }
1349 :
1350 5628 : mpres_clear (Vt, modulus);
1351 5628 : }
1352 :
1353 :
1354 : /* For a given reciprocal polynomial
1355 : F(x) = f_0 + sum_{i=1}^{deg} f_i V_i(x+1/x),
1356 : compute F(\gamma x)F(\gamma^{-1} x), with Q = \gamma + 1 / \gamma
1357 :
1358 : If NTT is used, needs 4 * deg + 3 entries in tmp.
1359 : If no NTT is used, needs 4 * deg + 2 + (memory use of list_sqr_reciprocal)
1360 : */
1361 :
1362 : static void
1363 2814 : list_scale_V (listz_t R, const listz_t F, const mpres_t Q,
1364 : const unsigned long deg, mpmod_t modulus, listz_t tmp,
1365 : const unsigned long tmplen,
1366 : mpzspv_t dct, const mpzspm_t ntt_context)
1367 : {
1368 : mpres_t Vt;
1369 : unsigned long i;
1370 2814 : const listz_t G = tmp, H = tmp + 2 * deg + 1, newtmp = tmp + 4 * deg + 2;
1371 2814 : const unsigned long newtmplen = tmplen - 4 * deg - 2;
1372 : #ifdef WANT_ASSERT
1373 : mpz_t leading;
1374 : #endif
1375 :
1376 2814 : if (deg == 0)
1377 : {
1378 : ASSERT(tmplen >= 1);
1379 0 : mpz_mul (tmp[0], F[0], F[0]);
1380 0 : mpz_mod (R[0], tmp[0], modulus->orig_modulus);
1381 0 : return;
1382 : }
1383 :
1384 : /* Make sure newtmplen does not underflow */
1385 2814 : ASSERT_ALWAYS (tmplen >= 4 * deg + 2);
1386 : #ifdef WANT_ASSERT
1387 : mpz_init (leading);
1388 : mpz_mul (leading, F[deg], F[deg]);
1389 : mpz_mod (leading, leading, modulus->orig_modulus);
1390 : #endif
1391 :
1392 : /* Generate V_1(Q)/2 ... V_{deg}(Q)/2, multiply by f_i to form coefficients
1393 : of G(x). Square the symmetric G(x) polynomial. */
1394 :
1395 2814 : outputf (OUTPUT_TRACE, "list_scale_V: Q=%Zd, deg = %lu\n", Q, deg);
1396 2814 : list_output_poly (F, deg + 1, 0, 1, "/* list_scale_V */ F(x) = ", "\n",
1397 : OUTPUT_TRACE);
1398 :
1399 : /* Compute G[i] = V_i(Q)/2 * F[i] for i = 0, ..., deg.
1400 : For i=0, V_0(Q) = 2, so G[0] = F[0],
1401 : which leaves deg entries to process */
1402 :
1403 2814 : mpz_set (G[0], F[0]);
1404 :
1405 : #if defined(_OPENMP)
1406 : #pragma omp parallel if (deg > 1000)
1407 : #endif
1408 : {
1409 2814 : const int nr_chunks = omp_get_num_threads();
1410 2814 : const int thread_nr = omp_get_thread_num();
1411 : mpmod_t modulus_local;
1412 : unsigned long l, start_i;
1413 : mpres_t Vi, Vi_1;
1414 :
1415 2814 : l = (deg - 1) / nr_chunks + 1; /* l = ceil (deg / nr_chunks) */
1416 2814 : start_i = thread_nr * l + 1;
1417 2814 : if (start_i <= deg + 1)
1418 2814 : l = MIN(l, deg + 1 - start_i);
1419 : else
1420 0 : l = 0;
1421 :
1422 2814 : mpmod_init_set (modulus_local, modulus);
1423 2814 : mpres_init (Vi_1, modulus_local);
1424 2814 : mpres_init (Vi, modulus_local);
1425 :
1426 2814 : V (Vi, Q, start_i, modulus_local);
1427 2814 : mpres_div_2exp (Vi, Vi, 1, modulus_local);
1428 2814 : V (Vi_1, Q, start_i - 1UL, modulus_local);
1429 2814 : mpres_div_2exp (Vi_1, Vi_1, 1, modulus_local);
1430 2814 : scale_by_chebyshev (G + start_i, F + start_i, l, modulus_local,
1431 : Q, Vi_1, Vi);
1432 :
1433 2814 : mpres_clear (Vi_1, modulus_local);
1434 2814 : mpres_clear (Vi, modulus_local);
1435 2814 : mpmod_clear (modulus_local);
1436 : }
1437 :
1438 :
1439 2814 : list_output_poly (G, deg + 1, 0, 1, "/* list_scale_V */ G(x) = ", "\n",
1440 : OUTPUT_TRACE);
1441 :
1442 : /* Now square the G polynomial in G[0 .. deg], put result in
1443 : G[0 .. 2*deg] */
1444 :
1445 : /* Bugfix: ks_multiply() does not like negative coefficients. FIXME */
1446 :
1447 1572665 : for (i = 0; i <= deg; i++)
1448 1569851 : if (mpz_sgn (G[i]) < 0)
1449 : {
1450 0 : mpz_add (G[i], G[i], modulus->orig_modulus);
1451 : /* FIXME: make sure the absolute size does not "run away" */
1452 0 : if (mpz_sgn (G[i]) < 0)
1453 : {
1454 0 : outputf (OUTPUT_ERROR, "list_scale_V: G[%lu] still negative\n", i);
1455 0 : mpz_mod (G[i], G[i], modulus->orig_modulus);
1456 : }
1457 : }
1458 :
1459 2814 : if (dct != NULL && ntt_context != NULL)
1460 2175 : ntt_sqr_reciprocal (G, G, dct, deg + 1, ntt_context);
1461 : else
1462 639 : list_sqr_reciprocal (G, G, deg + 1, modulus->orig_modulus,
1463 : newtmp, newtmplen);
1464 :
1465 2814 : list_output_poly (G, 2 * deg + 1, 0, 1, "/* list_scale_V */ G(x)^2 == ",
1466 : "\n", OUTPUT_TRACE);
1467 :
1468 : /* Compute H[i-1] = U_i(Q)/2 * F[i] for i = 1, ..., deg */
1469 :
1470 : #if defined(_OPENMP)
1471 : #pragma omp parallel if (deg > 1000)
1472 : #endif
1473 : {
1474 2814 : const int nr_chunks = omp_get_num_threads();
1475 2814 : const int thread_nr = omp_get_thread_num();
1476 : mpmod_t modulus_local;
1477 : unsigned long l, start_i;
1478 : mpres_t Ui, Ui_1;
1479 :
1480 2814 : l = (deg - 1) / nr_chunks + 1; /* l = ceil(deg / nr_chunks) */
1481 2814 : start_i = thread_nr * l + 1UL;
1482 2814 : if (start_i <= deg + 1)
1483 2814 : l = MIN(l, deg + 1 - start_i);
1484 : else
1485 0 : l = 0;
1486 :
1487 2814 : mpmod_init_set (modulus_local, modulus);
1488 2814 : mpres_init (Ui_1, modulus_local);
1489 2814 : mpres_init (Ui, modulus_local);
1490 :
1491 2814 : U (Ui_1, Ui, Q, start_i - 1, modulus_local);
1492 2814 : mpres_div_2exp (Ui, Ui, 1, modulus_local);
1493 2814 : mpres_div_2exp (Ui_1, Ui_1, 1, modulus_local);
1494 :
1495 2814 : scale_by_chebyshev (H - 1 + start_i, F + start_i, l, modulus_local,
1496 : Q, Ui_1, Ui);
1497 :
1498 2814 : mpres_clear (Ui_1, modulus_local);
1499 2814 : mpres_clear (Ui, modulus_local);
1500 2814 : mpmod_clear (modulus_local);
1501 : }
1502 :
1503 :
1504 : /* Convert H to standard basis */
1505 : /* We can do it in-place with H - 1 = H_U. */
1506 :
1507 1565126 : for (i = deg; i >= 3; i--)
1508 : {
1509 1562312 : mpz_add (H[i - 3], H[i - 3], H[i - 1]);
1510 1562312 : if (mpz_cmp (H[i - 3], modulus->orig_modulus) >= 0)
1511 780677 : mpz_sub (H[i - 3], H[i - 3], modulus->orig_modulus);
1512 : }
1513 :
1514 : /* U_2(X+1/X) = (X^2 - 1/X^2)/(X-1/X) = X+1/X = V_1(X+1/X),
1515 : so no addition occures here */
1516 : /* if (deg >= 2)
1517 : mpz_set (H[1], H[1]); Again, a no-op. */
1518 :
1519 : /* U_1(X+1/X) = 1, so this goes to coefficient of index 0 in std. basis */
1520 : /* mpz_set (H[0], H[0]); Another no-op. */
1521 :
1522 : /* Now H[0 ... deg-1] contains the deg coefficients in standard basis
1523 : of symmetric H(X) of degree 2*deg-2. */
1524 :
1525 2814 : list_output_poly (H, deg, 0, 1, "/* list_scale_V */ H(x) = ", "\n",
1526 : OUTPUT_TRACE);
1527 :
1528 : /* Square the symmetric H polynomial of degree 2*deg-2 (i.e. with deg
1529 : coefficents in standard basis in H[0 ... deg-1]) */
1530 :
1531 : /* Bugfix: ks_multiply() does not like negative coefficients. */
1532 :
1533 1572665 : for (i = 0; i <= deg; i++)
1534 1569851 : if (mpz_sgn (H[i]) < 0)
1535 : {
1536 0 : mpz_add (H[i], H[i], modulus->orig_modulus);
1537 0 : if (mpz_sgn (H[i]) < 0)
1538 : {
1539 0 : outputf (OUTPUT_ERROR, "list_scale_V: H[%lu] still negative\n", i);
1540 0 : mpz_mod (H[i], H[i], modulus->orig_modulus);
1541 : }
1542 : }
1543 :
1544 2814 : if (dct != NULL && ntt_context != NULL)
1545 2175 : ntt_sqr_reciprocal (H, H, dct, deg, ntt_context);
1546 : else
1547 639 : list_sqr_reciprocal (H, H, deg, modulus->orig_modulus,
1548 : newtmp, newtmplen);
1549 :
1550 : /* Now there are the 2*deg-1 coefficients in standard basis of a
1551 : symmetric polynomial of degree 4*deg - 4 in H[0 ... 2*deg-2] */
1552 :
1553 2814 : list_output_poly (H, 2*deg - 1, 0, 1, "/* list_scale_V */ H(x)^2 == ", "\n",
1554 : OUTPUT_TRACE);
1555 :
1556 : /* Multiply by Q^2-4 */
1557 2814 : mpres_init (Vt, modulus);
1558 2814 : mpres_sqr (Vt, Q, modulus);
1559 2814 : mpres_sub_ui (Vt, Vt, 4, modulus);
1560 :
1561 : #if defined(_OPENMP)
1562 : #pragma omp parallel if (deg > 1000)
1563 : {
1564 : mpmod_t modulus_local;
1565 : #ifdef _MSC_VER
1566 : long i; /* Microsoft C/C++ stuck on OpenMP 2.0 :( */
1567 : #endif
1568 : mpmod_init_set (modulus_local, modulus);
1569 :
1570 : #pragma omp for
1571 : for (i = 0; i <= 2 * deg - 2; i++)
1572 : mpres_mul_z_to_z (H[i], Vt, H[i], modulus_local);
1573 : mpmod_clear (modulus_local);
1574 : }
1575 : #else
1576 3134074 : for (i = 0; i <= 2 * deg - 2; i++)
1577 3131260 : mpres_mul_z_to_z (H[i], Vt, H[i], modulus);
1578 : #endif
1579 :
1580 2814 : list_output_poly (H, 2 * deg - 1, 0, 1, "/* list_scale_V */ "
1581 : "H(x)^2*(Q^2-4) == ", "\n", OUTPUT_TRACE);
1582 :
1583 :
1584 : /* Multiply by (X - 1/X)^2 = X^2 - 2 + 1/X^2 and subtract from G */
1585 : ASSERT (newtmplen > 0UL);
1586 2814 : if (deg == 1)
1587 : {
1588 : /* H(X) has degree 2*deg-2 = 0, so H(X) = h_0
1589 : H(X) * (X - 1/X)^2 = -2 h_0 + h_0 V_2(Y) */
1590 903 : mpz_mul_2exp (newtmp[0], H[0], 1UL);
1591 903 : mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*H[0] */
1592 903 : mpz_sub (G[2], G[2], H[0]);
1593 : }
1594 1911 : else if (deg == 2)
1595 : {
1596 : /* H(X) has degree 2*deg-2 = 2, , so
1597 : H(X) = h_0 + h_1 (X+1/X) + h_2 (X^2+1/X^2)
1598 :
1599 : H(X) * (X - 1/X)^2 =
1600 : -2*(h_0 - h_2) - h_1 * V_1(Y) + (h_0 - 2*h_2) * V_2(Y) +
1601 : h_1 * V_3(Y) + h_2 * V_4(Y)
1602 : */
1603 4 : mpz_sub (newtmp[0], H[0], H[2]); /* h_0 - h_2 */
1604 4 : mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* 2*(h_0 - h_2) */
1605 4 : mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*(h_0 - h_2) */
1606 :
1607 4 : mpz_add (G[1], G[1], H[1]); /* G[1] -= -h_1 */
1608 4 : mpz_sub (newtmp[0], newtmp[0], H[0]); /* h_0 - 2*h_2 */
1609 4 : mpz_sub (G[2], G[2], newtmp[0]); /* G[2] -= h_0 - 2*h_2 */
1610 4 : mpz_sub (G[3], G[3], H[1]); /* G[3] -= h_1 */
1611 4 : mpz_sub (G[4], G[4], H[2]); /* G[3] -= h_2 */
1612 : }
1613 : else
1614 : {
1615 : /* Let H(X) = h_0 + \sum_{i=1}^{n} h_i V_i(Y), Y = X+1/X. Then
1616 : (x - 1/x)^2 H(X) =
1617 : -2(h_0 - h_2) +
1618 : (- h_1 + h_3) V_1(Y) +
1619 : \sum_{i=2}^{n-2} (h_{i-2} - 2h_i + h_{i+2}) V_i(Y) +
1620 : (h_{n-3} - 2h_{n-1}) V_{n-1}(Y) +
1621 : (h_{n-2} - 2h_n) V_n(Y) +
1622 : h_{n-1} V_{n+1}(Y) +
1623 : h_n V_{n+2}(Y)
1624 :
1625 : In our case, n = 2 * deg - 2
1626 : */
1627 1907 : mpz_sub (newtmp[0], H[0], H[2]);
1628 1907 : mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* t[0] = 2*(h_0 - h_2) */
1629 1907 : mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*(h_0 - h_2) */
1630 :
1631 1907 : mpz_add (G[1], G[1], H[1]);
1632 1907 : mpz_sub (G[1], G[1], H[3]); /* G[1] -= -h_1 + h_3 */
1633 :
1634 3124624 : for (i = 2; i <= 2 * deg - 4; i++)
1635 : {
1636 3122717 : mpz_mul_2exp (newtmp[0], H[i], 1);
1637 3122717 : mpz_sub (newtmp[0], newtmp[0], H[i - 2]);
1638 3122717 : mpz_sub (newtmp[0], newtmp[0], H[i + 2]); /* 2h_i-h_{i-2}-h_{i+2} */
1639 3122717 : mpz_add (G[i], G[i], newtmp[0]); /* G[i] -= -2h_i+h_{i-2}+h_{i+2} */
1640 : }
1641 :
1642 5721 : for ( ; i <= 2 * deg - 2; i++)
1643 : {
1644 3814 : mpz_mul_2exp (newtmp[0], H[i], 1UL);
1645 3814 : mpz_sub (newtmp[0], H[i - 2], newtmp[0]); /* h_{n-3} - 2h_{n-1} */
1646 3814 : mpz_sub (G[i], G[i], newtmp[0]);
1647 : }
1648 :
1649 1907 : mpz_sub (G[i], G[i], H[i - 2]);
1650 1907 : mpz_sub (G[i + 1], G[i + 1], H[i - 1]);
1651 : }
1652 :
1653 3139702 : for (i = 0; i <= 2 * deg; i++)
1654 3136888 : mpz_mod (R[i], G[i], modulus->orig_modulus);
1655 :
1656 2814 : if (test_verbose (OUTPUT_TRACE))
1657 18526 : for (i = 0; i <= 2 * deg; i++)
1658 18435 : outputf (OUTPUT_TRACE, "list_scale_V: R[%lu] = %Zd\n", i, R[i]);
1659 :
1660 : #ifdef WANT_ASSERT
1661 : mpz_mod (R[2 * deg], R[2 * deg], modulus->orig_modulus);
1662 : ASSERT (mpz_cmp (leading, R[2 * deg]) == 0);
1663 : mpz_clear (leading);
1664 : #endif
1665 :
1666 2814 : mpres_clear (Vt, modulus);
1667 : }
1668 :
1669 :
1670 : #ifdef WANT_ASSERT
1671 : /* Check if l is an (anti-)symmetric, possibly monic, polynomial.
1672 : Returns -1 if it is (anti-)symmetric, or the smallest index i where
1673 : l[i] != l[len - 1 + monic - i])
1674 : If anti == 1, the list is checked for symmetry, if it is -1, for
1675 : antisymmetry.
1676 : This function is used only if assertions are enabled.
1677 : */
1678 :
1679 : static long int ATTRIBUTE_UNUSED
1680 : list_is_symmetric (listz_t l, unsigned long len, int monic, int anti,
1681 : mpz_t modulus, mpz_t tmp)
1682 : {
1683 : unsigned long i;
1684 :
1685 : ASSERT (monic == 0 || monic == 1);
1686 : ASSERT (anti == 1 || anti == -1);
1687 :
1688 : if (monic && anti == 1 && mpz_cmp_ui (l[0], 1) != 0)
1689 : return 0L;
1690 :
1691 : if (monic && anti == -1)
1692 : {
1693 : mpz_sub_ui (tmp, modulus, 1);
1694 : if (mpz_cmp (tmp, l[0]) != 0)
1695 : return 0L;
1696 : }
1697 :
1698 : for (i = monic; i < len / 2; i++)
1699 : {
1700 : if (anti == -1)
1701 : {
1702 : /* Negate (mod modulus) */
1703 : if (mpz_sgn (l[i]) == 0)
1704 : {
1705 : if (mpz_sgn (l[len - 1 + monic - i]) != 0)
1706 : return (long) i;
1707 : }
1708 : else
1709 : {
1710 : mpz_sub (tmp, modulus, l[i]);
1711 : if (mpz_cmp (tmp, l[len - 1 + monic - i]) != 0)
1712 : return (long) i;
1713 : }
1714 : }
1715 : else if (mpz_cmp (l[i], l[len - 1 + monic - i]) != 0)
1716 : return (long) i;
1717 : }
1718 :
1719 : return -1L;
1720 : }
1721 : #endif
1722 :
1723 : #if 0 && defined(WANT_ASSERT)
1724 : /* Evaluate a polynomial of degree n-1 with all coefficients given in F[],
1725 : or of degree n with an implicit leading 1 monomial not stored in F[],
1726 : at x modulo modulus. Result goes in r. tmp needs 2 entries. */
1727 :
1728 : static void
1729 : list_eval_poly (mpz_t r, const listz_t F, const mpz_t x,
1730 : const unsigned long n, const int monic, const mpz_t modulus,
1731 : listz_t tmp)
1732 : {
1733 : unsigned long i;
1734 :
1735 : mpz_set_ui (tmp[0], 1UL);
1736 : mpz_set_ui (r, 0UL);
1737 :
1738 : for (i = 0UL; i < n; i++)
1739 : {
1740 : /* tmp[0] = x^i */
1741 : mpz_mul (tmp[1], F[i], tmp[0]);
1742 : mpz_mod (tmp[1], tmp[1], modulus);
1743 : mpz_add (r, r, tmp[1]);
1744 :
1745 : mpz_mul (tmp[1], tmp[0], x);
1746 : mpz_mod (tmp[0], tmp[1], modulus);
1747 : }
1748 :
1749 : if (monic)
1750 : mpz_add (r, r, tmp[0]);
1751 :
1752 : mpz_mod (r, r, modulus);
1753 : }
1754 : #endif
1755 :
1756 : /* return the memory needed in the F[] array for poly_from_sets_V */
1757 : static unsigned long
1758 374 : mem_poly_from_sets_V (sets_long_t *sets)
1759 : {
1760 : unsigned long c, deg, i, nr;
1761 374 : set_long_t *set = sets->sets;
1762 374 : unsigned long mem, maxmem = 0;
1763 :
1764 374 : deg = 1UL;
1765 2193 : for (nr = sets->nr - 1UL; nr > 0UL; nr--)
1766 : {
1767 1819 : set = sets_nextset (sets->sets); /* Skip first set */
1768 7180 : for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
1769 5361 : set = sets_nextset (set);
1770 1819 : c = set->card;
1771 1819 : if (c == 2UL)
1772 1061 : deg *= 2UL;
1773 : else
1774 : {
1775 758 : i = (c - 1UL) / 2UL - 1; /* maximal value of i */
1776 : /* list_scale_V needs 2*deg+1 entries starting at
1777 : F + (2UL * i + 1UL) * (deg + 1UL) */
1778 758 : mem = (2UL * i + 1UL) * (deg + 1UL) + (2 * deg + 1UL);
1779 758 : if (mem > maxmem)
1780 758 : maxmem = mem;
1781 758 : deg *= c;
1782 : }
1783 : }
1784 :
1785 374 : return maxmem;
1786 : }
1787 :
1788 : /* Build a polynomial with roots r^2i, i in the sumset of the sets in "sets".
1789 : The parameter Q = r + 1/r. This code uses the fact that the polynomials
1790 : are symmetric. Requires that the first set in "sets" has cardinality 2,
1791 : all sets must be symmetric around 0. The resulting polynomial of degree
1792 : 2*d is F(x) = f_0 + \sum_{1 <= i <= d} f_i (x^i + 1/x^i). The coefficient
1793 : f_i is stored in F[i], which therefore needs d+1 elements. */
1794 :
1795 : static unsigned long
1796 480 : poly_from_sets_V (listz_t F, const mpres_t Q, sets_long_t *sets,
1797 : listz_t tmp, const unsigned long tmplen, mpmod_t modulus,
1798 : mpzspv_t dct, const mpzspm_t ntt_context)
1799 : {
1800 : unsigned long c, deg, i, nr;
1801 480 : set_long_t *set = sets->sets;
1802 : mpres_t Qt;
1803 :
1804 480 : ASSERT_ALWAYS (sets->nr > 0UL);
1805 480 : ASSERT_ALWAYS (set->card == 2UL); /* Check that the cardinality of
1806 : first set is 2 */
1807 : /* Check that first set is symmetric around 0 (we write card-1
1808 : instead of 1 to avoid a compiler warning with clang 2.9) */
1809 480 : ASSERT_ALWAYS (set->elem[0] == -set->elem[set->card - 1]);
1810 :
1811 480 : if (test_verbose (OUTPUT_TRACE))
1812 : {
1813 : mpz_t t;
1814 10 : mpz_init (t);
1815 10 : mpres_get_z (t, Q, modulus);
1816 10 : outputf (OUTPUT_TRACE, "poly_from_sets_V (F, Q = %Zd, sets)\n", t);
1817 10 : mpz_clear (t);
1818 : }
1819 :
1820 480 : mpres_init (Qt, modulus);
1821 :
1822 480 : outputf (OUTPUT_DEVVERBOSE, " (processing set of size 2");
1823 :
1824 480 : V (Qt, Q, set->elem[0], modulus); /* First set in sets is {-k, k} */
1825 480 : V (Qt, Qt, 2UL, modulus); /* Qt = V_2k(Q) */
1826 :
1827 480 : mpres_neg (Qt, Qt, modulus);
1828 480 : mpres_get_z (F[0], Qt, modulus);
1829 480 : mpz_set_ui (F[1], 1UL);
1830 480 : deg = 1UL;
1831 : /* Here, F(x) = (x - r^{2k_1})(x - r^{-2k_1}) / x =
1832 : (x^2 - x (r^{2k_1} + r^{-2k_1}) + 1) / x =
1833 : (x + 1/x) - V_{2k_1}(r + 1/r) */
1834 :
1835 2809 : for (nr = sets->nr - 1UL; nr > 0UL; nr--)
1836 : {
1837 : /* Assuming the sets are sorted in order of ascending cardinality,
1838 : we process them back-to-front so the sets of cardinality 2 are
1839 : processed last, but skipping the first set which we processed
1840 : already. */
1841 :
1842 2329 : set = sets_nextset (sets->sets); /* Skip first set */
1843 8944 : for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
1844 6615 : set = sets_nextset (set);
1845 :
1846 : /* Process this set. We assume it is either of cardinality 2, or of
1847 : odd cardinality */
1848 2329 : c = set->card;
1849 2329 : outputf (OUTPUT_DEVVERBOSE, " %lu", c);
1850 :
1851 2329 : if (c == 2UL)
1852 : {
1853 : /* Check it's symmetric (we write c-1 instead of 1 to avoid a
1854 : compiler warning with clang 2.9) */
1855 1357 : ASSERT_ALWAYS (set->elem[0] == -set->elem[c - 1]);
1856 1357 : V (Qt, Q, set->elem[0], modulus);
1857 1357 : V (Qt, Qt, 2UL, modulus);
1858 1357 : list_scale_V (F, F, Qt, deg, modulus, tmp, tmplen, dct,
1859 : ntt_context);
1860 1357 : deg *= 2UL;
1861 1357 : ASSERT_ALWAYS (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
1862 : }
1863 : else
1864 : {
1865 972 : ASSERT_ALWAYS (c % 2UL == 1UL);
1866 972 : ASSERT_ALWAYS (set->elem[(c - 1UL) / 2UL] == 0UL);
1867 : /* Generate the F(Q^{2k_i} * X)*F(Q^{-2k_i} * X) polynomials.
1868 : Each is symmetric of degree 2*deg, so each has deg+1 coefficients
1869 : in standard basis. */
1870 2429 : for (i = 0UL; i < (c - 1UL) / 2UL; i++)
1871 : {
1872 : /* Check it's symmetric */
1873 1457 : ASSERT_ALWAYS (set->elem[i] == -set->elem[c - 1L - i]);
1874 1457 : V (Qt, Q, set->elem[i], modulus);
1875 1457 : V (Qt, Qt, 2UL, modulus);
1876 : ASSERT (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
1877 1457 : list_scale_V (F + (2UL * i + 1UL) * (deg + 1UL), F, Qt, deg,
1878 : modulus, tmp, tmplen, dct, ntt_context);
1879 : ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL * deg],
1880 : 1UL) == 0); /* Check it's monic */
1881 : }
1882 : /* Multiply the polynomials */
1883 2429 : for (i = 0UL; i < (c - 1UL) / 2UL; i++)
1884 : {
1885 : /* So far, we have the product
1886 : F(X) * F(Q^{2k_j} * X) * F(Q^{-2k_j} * X), 1 <= j <= i,
1887 : at F. This product has degree 2 * deg + i * 4 * deg, that is
1888 : (2 * i + 1) * 2 * deg, which means (2 * i + 1) * deg + 1
1889 : coefficients in F[0 ... (i * 2 + 1) * deg]. */
1890 : ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * deg], 1UL) == 0);
1891 : ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL*deg],
1892 : 1UL) == 0);
1893 1457 : list_output_poly (F, (2UL * i + 1UL) * deg + 1, 0, 1,
1894 : "poly_from_sets_V: Multiplying ", "\n",
1895 : OUTPUT_TRACE);
1896 1457 : list_output_poly (F + (2UL * i + 1UL) * (deg + 1UL),
1897 1457 : 2UL * deg + 1UL, 0, 1, " and ", "\n",
1898 : OUTPUT_TRACE);
1899 1457 : list_mul_reciprocal (F,
1900 1457 : F, (2UL * i + 1UL) * deg + 1UL,
1901 1457 : F + (2UL * i + 1UL) * (deg + 1UL),
1902 1457 : 2UL * deg + 1UL, modulus->orig_modulus,
1903 : tmp, tmplen);
1904 1457 : list_mod (F, F, (2UL * i + 3UL) * deg + 1UL,
1905 1457 : modulus->orig_modulus);
1906 1457 : list_output_poly (F, (2UL * i + 3UL) * deg + 1UL, 0, 1,
1907 : " = ", "\n", OUTPUT_TRACE);
1908 : ASSERT (mpz_cmp_ui (F[(2UL * i + 3UL) * deg], 1UL) == 0);
1909 : }
1910 972 : deg *= c;
1911 : }
1912 : }
1913 :
1914 480 : mpres_clear (Qt, modulus);
1915 480 : outputf (OUTPUT_DEVVERBOSE, ")");
1916 :
1917 480 : return deg;
1918 : }
1919 :
1920 : static int
1921 374 : build_F_ntt (listz_t F, const mpres_t P_1, sets_long_t *S_1,
1922 : const faststage2_param_t *params, mpmod_t modulus)
1923 : {
1924 : mpzspm_t F_ntt_context;
1925 : mpzspv_t F_ntt;
1926 : unsigned long tmplen;
1927 : listz_t tmp;
1928 : long timestart, realstart;
1929 : unsigned long i;
1930 :
1931 374 : timestart = cputime ();
1932 374 : realstart = realtime ();
1933 :
1934 : /* Precompute the small primes, primitive roots and inverses etc. for
1935 : the NTT. The code to multiply wants a 3*k-th root of unity, where
1936 : k is the smallest power of 2 with k > s_1/2 */
1937 :
1938 374 : F_ntt_context = mpzspm_init (3UL << ceil_log2 (params->s_1 / 2 + 1),
1939 374 : modulus->orig_modulus);
1940 374 : if (F_ntt_context == NULL)
1941 : {
1942 0 : outputf (OUTPUT_ERROR, "Could not initialise F_ntt_context, "
1943 : "presumably out of memory\n");
1944 0 : return ECM_ERROR;
1945 : }
1946 :
1947 374 : print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for building F = ",
1948 : F_ntt_context);
1949 :
1950 374 : outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
1951 :
1952 374 : tmplen = params->s_1 + 100;
1953 374 : tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
1954 374 : F_ntt = mpzspv_init (1UL << ceil_log2 (params->s_1 / 2 + 1), F_ntt_context);
1955 :
1956 374 : i = poly_from_sets_V (F, P_1, S_1, tmp, tmplen, modulus, F_ntt,
1957 : F_ntt_context);
1958 374 : ASSERT_ALWAYS(2 * i == params->s_1);
1959 374 : ASSERT_ALWAYS(mpz_cmp_ui (F[i], 1UL) == 0);
1960 :
1961 374 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
1962 374 : if (test_verbose (OUTPUT_TRACE))
1963 : {
1964 8080 : for (i = 0; i < params->s_1 / 2 + 1; i++)
1965 8072 : outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
1966 8 : outputf (OUTPUT_TRACE, "f(x) = f_0");
1967 8072 : for (i = 1; i < params->s_1 / 2 + 1; i++)
1968 8064 : outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
1969 8 : outputf (OUTPUT_TRACE, "/* PARI */ \n");
1970 : }
1971 :
1972 374 : clear_list (tmp, tmplen);
1973 374 : tmp = NULL;
1974 374 : mpzspv_clear (F_ntt, F_ntt_context);
1975 374 : F_ntt = NULL;
1976 374 : mpzspm_clear (F_ntt_context);
1977 374 : F_ntt_context = NULL;
1978 :
1979 374 : return 0;
1980 : }
1981 :
1982 : /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
1983 : x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P.
1984 : Stores the result in g[0 ... l] and/or in g_ntt[offset ... offset + l] */
1985 :
1986 : static void
1987 452 : pm1_sequence_g (listz_t g_mpz, mpzspv_t g_ntt, const mpres_t b_1,
1988 : const unsigned long P, const long M_param,
1989 : const unsigned long l_param, const mpz_t m_1, const long k_2,
1990 : mpmod_t modulus_param, const mpzspm_t ntt_context)
1991 : {
1992 : mpres_t r[3], x_0, x_Mi;
1993 : mpz_t t;
1994 : unsigned long i;
1995 : long timestart, realstart;
1996 452 : long M = M_param;
1997 452 : unsigned long l = l_param, offset = 0UL;
1998 : mpmod_t modulus;
1999 452 : int want_output = 1;
2000 :
2001 452 : outputf (OUTPUT_VERBOSE, "Computing g_i");
2002 452 : outputf (OUTPUT_DEVVERBOSE, "\npm1_sequence_g: P = %lu, M_param = %lu, "
2003 : "l_param = %lu, m_1 = %Zd, k_2 = %lu\n",
2004 : P, M_param, l_param, m_1, k_2);
2005 452 : timestart = cputime ();
2006 452 : realstart = realtime ();
2007 :
2008 : #ifdef _OPENMP
2009 : #pragma omp parallel if (l > 100) private(r, x_0, x_Mi, t, i, M, l, offset, modulus, want_output)
2010 : {
2011 : /* When multi-threading, we adjust the parameters for each thread */
2012 :
2013 : const int nr_chunks = omp_get_num_threads();
2014 : const int thread_nr = omp_get_thread_num();
2015 :
2016 : l = (l_param - 1) / nr_chunks + 1; /* = ceil(l_param / nr_chunks) */
2017 : offset = thread_nr * l;
2018 : if (offset <= l_param)
2019 : l = MIN(l, l_param - offset);
2020 : else
2021 : l = 0;
2022 : outputf (OUTPUT_DEVVERBOSE,
2023 : "pm1_sequence_g: thread %d has l = %lu, offset = %lu.\n",
2024 : thread_nr, l, offset);
2025 : M = M_param - (long) offset;
2026 :
2027 : /* Let only the master thread print stuff */
2028 : want_output = (thread_nr == 0);
2029 :
2030 : if (want_output)
2031 : outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2032 : #endif
2033 :
2034 : /* Make a private copy of the mpmod_t struct */
2035 452 : mpmod_init_set (modulus, modulus_param);
2036 :
2037 452 : mpz_init (t);
2038 452 : mpres_init (r[0], modulus);
2039 452 : mpres_init (r[1], modulus);
2040 452 : mpres_init (r[2], modulus);
2041 452 : mpres_init (x_0, modulus);
2042 452 : mpres_init (x_Mi, modulus);
2043 :
2044 452 : if (want_output)
2045 : {
2046 452 : if (test_verbose (OUTPUT_TRACE))
2047 : {
2048 12 : mpres_get_z (t, b_1, modulus);
2049 12 : outputf (OUTPUT_TRACE, "\n/* pm1_sequence_g */ N = %Zd; "
2050 : "b_1 = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, t);
2051 12 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ P = %lu; M = %ld; "
2052 : "m_1 = %Zd; /* PARI */\n", P, M, m_1);
2053 12 : outputf (OUTPUT_TRACE,
2054 : "/* pm1_sequence_g */ r = b_1^P; /* PARI */\n");
2055 12 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 = "
2056 : "b_1^(2*%ld + (2*m_1 + 1)*P); /* PARI */\n", k_2);
2057 : }
2058 : }
2059 :
2060 : /* We use (M-(i+1))^2 = (M-i)^2 + 2*(-M+i) + 1 */
2061 452 : mpz_set_ui (t, P);
2062 452 : mpres_pow (r[0], b_1, t, modulus); /* r[0] = b_1^P = r */
2063 452 : if (test_verbose (OUTPUT_TRACE))
2064 : {
2065 12 : mpres_get_z (t, r[0], modulus);
2066 12 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ r == %Zd /* PARI C */\n", t);
2067 : }
2068 :
2069 : /* FIXME: This is a huge mess, clean up some time */
2070 :
2071 452 : mpz_set_si (t, M);
2072 452 : mpz_neg (t, t);
2073 452 : mpz_mul_2exp (t, t, 1UL);
2074 452 : mpz_add_ui (t, t, 1UL);
2075 : /* Warning: t might be negative here. */
2076 452 : mpres_pow (r[1], r[0], t, modulus); /* r[1] = r^{2(-M+i)+1}, i = 0 */
2077 452 : mpz_set_si (t, M);
2078 452 : mpz_mul (t, t, t); /* t = M^2 */
2079 452 : mpres_pow (r[2], r[0], t, modulus); /* r[2] = r^{(M-i)^2}, i = 0 */
2080 452 : mpres_sqr (r[0], r[0], modulus); /* r[0] = r^2 */
2081 :
2082 452 : mpz_mul_2exp (t, m_1, 1UL);
2083 452 : mpz_add_ui (t, t, 1UL);
2084 452 : mpz_mul_ui (t, t, P);
2085 452 : mpz_add_si (t, t, k_2);
2086 452 : mpz_add_si (t, t, k_2);
2087 452 : if (want_output)
2088 452 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ 2*%ld + (2*%Zd + 1)*P == "
2089 : "%Zd /* PARI C */\n", k_2, m_1, t);
2090 :
2091 : /* Warning: t might be negative here. */
2092 452 : mpres_pow (x_0, b_1, t, modulus); /* x_0 = b_1^{2*k_2 + (2*m_1 + 1)*P} */
2093 452 : if (want_output && test_verbose (OUTPUT_TRACE))
2094 : {
2095 12 : mpres_get_z (t, x_0, modulus);
2096 12 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 == %Zd /* PARI C */\n",
2097 : t);
2098 : }
2099 :
2100 452 : mpz_set_si (t, M);
2101 452 : mpres_pow (x_Mi, x_0, t, modulus); /* x_Mi = x_0^{M-i}, i = 0 */
2102 :
2103 452 : mpres_invert (x_0, x_0, modulus); /* x_0 := x_0^{-1} now */
2104 452 : mpres_mul (r[1], r[1], x_0, modulus); /* r[1] = x_0^{-1} * r^{-2M+1} */
2105 :
2106 452 : mpres_mul (r[2], r[2], x_Mi, modulus); /* r[2] = x_0^M * r^{M^2} */
2107 452 : mpres_get_z (t, r[2], modulus);
2108 452 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd; /* PARI */\n",
2109 : offset, t);
2110 452 : if (l > 0)
2111 : {
2112 452 : if (g_mpz != NULL)
2113 127 : mpz_set (g_mpz[offset], t);
2114 452 : if (g_ntt != NULL)
2115 325 : mpzspv_from_mpzv (g_ntt, offset, &t, 1UL, ntt_context);
2116 : }
2117 :
2118 : /* So here we have for i = 0
2119 : r[2] = x_0^(M-i) * r^{(M-i)^2}
2120 : r[1] = x_0^{-1} * r^{2(-M+i)+1}
2121 : r[0] = r^2
2122 : t = r[2]
2123 : */
2124 :
2125 3656905 : for (i = 1; i < l; i++)
2126 : {
2127 3656453 : if (g_mpz != NULL)
2128 : {
2129 646538 : mpres_mul_z_to_z (g_mpz[offset + i], r[1], g_mpz[offset + i - 1],
2130 : modulus);
2131 646538 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
2132 646538 : " /* PARI */\n", offset + i, g_mpz[offset + i]);
2133 : }
2134 3656453 : if (g_ntt != NULL)
2135 : {
2136 3009915 : mpres_mul_z_to_z (t, r[1], t, modulus);
2137 3009915 : if (g_mpz == NULL) /* Only one should be non-NULL... */
2138 3009915 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
2139 : " /* PARI */\n", offset + i, t);
2140 3009915 : mpzspv_from_mpzv (g_ntt, offset + i, &t, 1UL, ntt_context);
2141 : }
2142 3656453 : mpres_mul (r[1], r[1], r[0], modulus);
2143 : }
2144 :
2145 452 : mpres_clear (r[0], modulus);
2146 452 : mpres_clear (r[1], modulus);
2147 452 : mpres_clear (r[2], modulus);
2148 452 : mpres_clear (x_0, modulus);
2149 452 : mpres_clear (x_Mi, modulus);
2150 452 : mpz_clear (t);
2151 452 : mpmod_clear (modulus); /* Clear our private copy of modulus */
2152 :
2153 : #ifdef _OPENMP
2154 : }
2155 : #endif
2156 :
2157 452 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2158 :
2159 452 : if (test_verbose (OUTPUT_TRACE))
2160 : {
2161 43024 : for (i = 0; i < l_param; i++)
2162 : {
2163 43012 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu == x_0^"
2164 : "(M - %lu) * r^((M - %lu)^2) /* PARI C */\n", i, i, i);
2165 : }
2166 12 : outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g(x) = g_0");
2167 43012 : for (i = 1; i < l; i++)
2168 43000 : outputf (OUTPUT_TRACE, " + g_%lu * x^%lu", i, i);
2169 12 : outputf (OUTPUT_TRACE, " /* PARI */\n");
2170 : }
2171 452 : }
2172 :
2173 :
2174 : /* Compute h_j = r^(-j^2) * f_j for 0 <= j < d as described in section 9
2175 : of the paper. h == f is ok. */
2176 :
2177 : static void
2178 246 : pm1_sequence_h (listz_t h, mpzspv_t h_ntt, mpz_t *f, const mpres_t r,
2179 : const unsigned long d, mpmod_t modulus_parm,
2180 : const mpzspm_t ntt_context)
2181 : {
2182 : mpres_t invr; /* r^{-1}. Can be shared between threads */
2183 : long timestart, realstart;
2184 :
2185 246 : mpres_init (invr, modulus_parm);
2186 246 : mpres_invert (invr, r, modulus_parm); /* invr = r^{-1}. FIXME: test for
2187 : failure, even if theoretically
2188 : impossible */
2189 :
2190 246 : if (test_verbose (OUTPUT_TRACE))
2191 : {
2192 : mpz_t t;
2193 5 : mpz_init (t);
2194 5 : mpres_get_z (t, r, modulus_parm);
2195 5 : outputf (OUTPUT_TRACE, "\n/* pm1_sequence_h */ N = %Zd; "
2196 : "r = Mod(%Zd, N); /* PARI */\n",
2197 5 : modulus_parm->orig_modulus, t);
2198 5 : mpz_clear (t);
2199 : }
2200 :
2201 246 : outputf (OUTPUT_VERBOSE, "Computing h");
2202 246 : timestart = cputime ();
2203 246 : realstart = realtime ();
2204 :
2205 : #ifdef _OPENMP
2206 : #pragma omp parallel if (d > 100)
2207 : #endif
2208 : {
2209 : mpres_t fd[3]; /* finite differences table for r^{-i^2}*/
2210 : mpz_t t; /* the h_j value as an mpz_t */
2211 : unsigned long j;
2212 246 : unsigned long offset = 0UL, len = d;
2213 : mpmod_t modulus;
2214 :
2215 : /* Adjust offset and length for this thread */
2216 : #ifdef _OPENMP
2217 : {
2218 : const int nr_chunks = omp_get_num_threads();
2219 : const int thread_nr = omp_get_thread_num();
2220 : unsigned long chunklen;
2221 :
2222 : if (thread_nr == 0)
2223 : outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2224 :
2225 : /* chunklen = ceil (len / nr_chunks) */
2226 : chunklen = (len - 1UL) / (unsigned long) nr_chunks + 1UL;
2227 : offset = chunklen * (unsigned long) thread_nr;
2228 : if (offset <= len)
2229 : len = MIN(chunklen, len - offset);
2230 : else
2231 : len = 0;
2232 : }
2233 : #endif
2234 :
2235 246 : mpmod_init_set (modulus, modulus_parm);
2236 246 : mpres_init (fd[0], modulus);
2237 246 : mpres_init (fd[1], modulus);
2238 246 : mpres_init (fd[2], modulus);
2239 246 : mpz_init (t);
2240 :
2241 : /* We have (n + 1)^2 = n^2 + 2n + 1. For the finite differences we'll
2242 : need r^{-2}, r^{-(2n+1)}, r^{-n^2}. Init for n = 0. */
2243 :
2244 : /* r^{-2} in fd[0] is constant and could be shared. Computing it
2245 : separately in each thread has the advantage of putting it in
2246 : local memory. May not make much difference overall */
2247 :
2248 246 : mpres_sqr (fd[0], invr, modulus); /* fd[0] = r^{-2} */
2249 246 : mpz_set_ui (t, offset);
2250 246 : mpz_mul_2exp (t, t, 1UL);
2251 246 : mpz_add_ui (t, t, 1UL); /* t = 2 * offset + 1 */
2252 246 : mpres_pow (fd[1], invr, t, modulus); /* fd[1] = r^{-(2*offset+1)} */
2253 246 : mpz_set_ui (t, offset);
2254 246 : mpz_mul (t, t, t); /* t = offset^2 */
2255 246 : mpres_pow (fd[2], invr, t, modulus); /* fd[2] = r^{-offset^2} */
2256 :
2257 : /* Generate the sequence */
2258 182533 : for (j = offset; j < offset + len; j++)
2259 : {
2260 182287 : mpres_mul_z_to_z (t, fd[2], f[j], modulus);
2261 182287 : outputf (OUTPUT_TRACE,
2262 : "/* pm1_sequence_h */ h_%lu = %Zd; /* PARI */\n", j, t);
2263 :
2264 182287 : if (h != NULL)
2265 41010 : mpz_set (h[j], t);
2266 182287 : if (h_ntt != NULL)
2267 141277 : mpzspv_from_mpzv (h_ntt, j, &t, 1UL, ntt_context);
2268 :
2269 182287 : mpres_mul (fd[2], fd[2], fd[1], modulus); /* fd[2] = r^{-j^2} */
2270 182287 : mpres_mul (fd[1], fd[1], fd[0], modulus); /* fd[1] = r^{-2*j-1} */
2271 : }
2272 :
2273 246 : mpres_clear (fd[2], modulus);
2274 246 : mpres_clear (fd[1], modulus);
2275 246 : mpres_clear (fd[0], modulus);
2276 246 : mpz_clear (t);
2277 246 : mpmod_clear (modulus);
2278 : }
2279 :
2280 246 : mpres_clear (invr, modulus_parm);
2281 :
2282 246 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2283 :
2284 246 : if (test_verbose (OUTPUT_TRACE))
2285 : {
2286 : unsigned long j;
2287 4702 : for (j = 0; j < d; j++)
2288 4697 : outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h_%lu == "
2289 : "f_%lu * r^(-%lu^2) /* PARI C */\n", j, j, j);
2290 5 : outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h(x) = h_0");
2291 4697 : for (j = 1; j < d; j++)
2292 4692 : outputf (OUTPUT_TRACE, " + h_%lu * (x^%lu + x^(-%lu))", j, j, j);
2293 5 : outputf (OUTPUT_TRACE, " /* PARI */\n");
2294 : }
2295 246 : }
2296 :
2297 :
2298 : static int
2299 480 : make_S_1_S_2 (sets_long_t **S_1, set_long_t **S_2,
2300 : const faststage2_param_t *params)
2301 : {
2302 : unsigned long i;
2303 : sets_long_t *facS_2;
2304 : size_t facS_2_size;
2305 :
2306 480 : *S_1 = sets_get_factored_sorted (params->P);
2307 480 : if (*S_1 == NULL)
2308 0 : return ECM_ERROR;
2309 :
2310 : {
2311 : mpz_t t1, t2;
2312 :
2313 480 : mpz_init (t1);
2314 480 : mpz_init (t2);
2315 480 : sets_sumset_minmax (t1, *S_1, 1);
2316 480 : sets_max (t2, params->P);
2317 480 : ASSERT_ALWAYS (mpz_cmp (t1, t2) == 0);
2318 480 : mpz_clear (t1);
2319 480 : mpz_clear (t2);
2320 : }
2321 :
2322 480 : *S_2 = malloc (set_sizeof(params->s_2));
2323 480 : if (*S_2 == NULL)
2324 : {
2325 0 : free (*S_1);
2326 0 : return ECM_ERROR;
2327 : }
2328 :
2329 : /* Extract sets for S_2 and compute the set of sums */
2330 :
2331 480 : sets_extract (NULL, &facS_2_size, *S_1, params->s_2);
2332 480 : facS_2 = malloc (facS_2_size);
2333 480 : if (facS_2 == NULL)
2334 : {
2335 0 : free (*S_1);
2336 0 : free (*S_2);
2337 0 : return ECM_ERROR;
2338 : }
2339 480 : sets_extract (facS_2, NULL, *S_1, params->s_2);
2340 480 : sets_sumset (*S_2, facS_2);
2341 480 : ASSERT_ALWAYS ((*S_2)->card == params->s_2);
2342 480 : free (facS_2);
2343 480 : quicksort_long ((*S_2)->elem, (*S_2)->card);
2344 :
2345 : /* Print the sets in devverbose mode */
2346 480 : if (test_verbose (OUTPUT_DEVVERBOSE))
2347 : {
2348 35 : outputf (OUTPUT_DEVVERBOSE, "S_1 = ");
2349 35 : sets_print (OUTPUT_DEVVERBOSE, *S_1);
2350 :
2351 35 : outputf (OUTPUT_DEVVERBOSE, "S_2 = {");
2352 89 : for (i = 0UL; i + 1UL < params->s_2; i++)
2353 54 : outputf (OUTPUT_DEVVERBOSE, "%ld, ", (*S_2)->elem[i]);
2354 35 : if (i < params->s_2)
2355 35 : outputf (OUTPUT_DEVVERBOSE, "%ld", (*S_2)->elem[i]);
2356 35 : outputf (OUTPUT_DEVVERBOSE, "}\n");
2357 : }
2358 :
2359 480 : return 0;
2360 : }
2361 :
2362 : #if 0
2363 : static mpzspv_t *
2364 : mpzspv_init_mt (spv_size_t len, mpzspm_t mpzspm)
2365 : {
2366 : int i; /* OpenMP wants the iteration variable a signed type */
2367 : mpzspv_t *x = (mpzspv_t *) malloc (mpzspm->sp_num * sizeof (spv_t *));
2368 :
2369 : if (x == NULL)
2370 : return NULL;
2371 :
2372 : for (i = 0; i < (int) mpzspm->sp_num; i++)
2373 : x[i] = NULL;
2374 :
2375 : #ifdef _OPENMP
2376 : #pragma omp parallel private(i) shared(x)
2377 : {
2378 : #pragma omp for
2379 : #endif
2380 : for (i = 0; i < (int) mpzspm->sp_num; i++)
2381 : x[i] = (spv_t *) sp_aligned_malloc (len * sizeof (sp_t));
2382 :
2383 : #ifdef _OPENMP
2384 : }
2385 : #endif
2386 :
2387 : for (i = 0; i < (int) mpzspm->sp_num; i++)
2388 : if (x[i] == NULL)
2389 : break;
2390 :
2391 : if (i != (int) mpzspm->sp_num) /* There is a NULL pointer */
2392 : {
2393 : for (i = 0; i < (int) mpzspm->sp_num; i++)
2394 : if (x[i] != NULL)
2395 : sp_aligned_free(x[i]);
2396 : return NULL;
2397 : }
2398 :
2399 : #if 0
2400 : if (test_verbose (OUTPUT_DEVVERBOSE))
2401 : {
2402 : spv_t * last = x[0];
2403 : printf ("mpzspv_init_mt: x[0] = %p\n", x[0]);
2404 : for (i = 1; i < (int) mpzspm->sp_num; i++)
2405 : printf ("mpzspv_init_mt: x[%d] = %p, distance = %ld\n",
2406 : i, x[i], (long) (x[i] - x[i-1]));
2407 : }
2408 : #endif
2409 :
2410 : return x;
2411 : }
2412 : #endif
2413 :
2414 : /* Square the reciprocal Laurent polynomial S(x) of degree 2*n-2.
2415 : S(x) = s_0 + \sum_{i=1}^{n-1} s_i (x^i + x^{-1}).
2416 : S[i] contains the n coefficients s_i, 0 <= i <= n-1.
2417 : R[i] will contain the 2n-1 coefficients r_i, 0 <= i <= 2*n-2, where
2418 : R(x) = S(x)^2 = r_0 + \sum_{i=1}^{2n-2} r_i (x^i + x^{-1}).
2419 : dft must have power of 2 length len >= 2n.
2420 : The NTT primes must be == 1 (mod 3*len).
2421 : */
2422 :
2423 : #undef TRACE_ntt_sqr_reciprocal
2424 : static void
2425 4350 : ntt_sqr_reciprocal (mpzv_t R, const mpzv_t S, mpzspv_t dft,
2426 : const spv_size_t n, const mpzspm_t ntt_context)
2427 : {
2428 : #ifdef WANT_ASSERT
2429 : mpz_t S_eval_1, R_eval_1;
2430 : #endif
2431 :
2432 4350 : if (n == 0)
2433 0 : return;
2434 :
2435 4350 : if (n == 1)
2436 : {
2437 691 : mpz_mul (R[0], S[0], S[0]);
2438 691 : mpz_mod (R[0], R[0], ntt_context->modulus);
2439 691 : return;
2440 : }
2441 :
2442 : #ifdef WANT_ASSERT
2443 : mpz_init (S_eval_1);
2444 : list_recip_eval1 (S_eval_1, S, n);
2445 : /* Compute (S(1))^2 */
2446 : mpz_mul (S_eval_1, S_eval_1, S_eval_1);
2447 : mpz_mod (S_eval_1, S_eval_1, ntt_context->modulus);
2448 : #endif
2449 :
2450 : #ifdef TRACE_ntt_sqr_reciprocal
2451 : printf ("ntt_sqr_reciprocal: n %lu, length %lu\n", n, len);
2452 : gmp_printf ("Input polynomial is %Zd", S[0]);
2453 : {
2454 : int j;
2455 : for (j = 1; (spv_size_t) j < n; j++)
2456 : gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", S[j], j, j);
2457 : }
2458 : printf ("\n");
2459 : #endif
2460 :
2461 : /* Fill NTT elements [0 .. n-1] with coefficients */
2462 3659 : mpzspv_from_mpzv (dft, (spv_size_t) 0, S, n, ntt_context);
2463 3659 : mpzspv_sqr_reciprocal (dft, n, ntt_context);
2464 :
2465 : #if defined(_OPENMP)
2466 : #pragma omp parallel if (n > 50)
2467 : #endif
2468 : {
2469 3659 : spv_size_t i, offset = 0, chunklen = 2*n - 1;
2470 :
2471 : #if defined(_OPENMP)
2472 : {
2473 : const int nr_chunks = omp_get_num_threads();
2474 : const int thread_nr = omp_get_thread_num();
2475 :
2476 : chunklen = (chunklen - 1) / (spv_size_t) nr_chunks + 1;
2477 : offset = (spv_size_t) thread_nr * chunklen;
2478 : if (offset <= 2*n - 1)
2479 : chunklen = MIN(chunklen, (2*n - 1) - offset);
2480 : else
2481 : chunklen = 0UL;
2482 : }
2483 : #endif
2484 :
2485 3659 : mpzspv_to_mpzv (dft, offset, R + offset, chunklen, ntt_context);
2486 6045296 : for (i = offset; i < offset + chunklen; i++)
2487 6041637 : mpz_mod (R[i], R[i], ntt_context->modulus);
2488 : }
2489 :
2490 : #ifdef TRACE_ntt_sqr_reciprocal
2491 : gmp_printf ("ntt_sqr_reciprocal: Output polynomial is %Zd", R[0]);
2492 : for (j = 1; (spv_size_t) j < 2*n - 1; j++)
2493 : gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
2494 : printf ("\n");
2495 : #endif
2496 :
2497 : #ifdef WANT_ASSERT
2498 : mpz_init (R_eval_1);
2499 : /* Compute (S^2)(1) and compare to (S(1))^2 */
2500 : list_recip_eval1 (R_eval_1, R, 2 * n - 1);
2501 : mpz_mod (R_eval_1, R_eval_1, ntt_context->modulus);
2502 : if (mpz_cmp (R_eval_1, S_eval_1) != 0)
2503 : {
2504 : gmp_fprintf (stderr, "ntt_sqr_reciprocal: (S(1))^2 = %Zd but "
2505 : "(S^2)(1) = %Zd\n", S_eval_1, R_eval_1);
2506 : #if 0
2507 : gmp_printf ("Output polynomial is %Zd", R[0]);
2508 : for (j = 1; (spv_size_t) j < 2*n - 1; j++)
2509 : gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
2510 : printf ("\n");
2511 : #endif
2512 : abort ();
2513 : }
2514 : mpz_clear (S_eval_1);
2515 : mpz_clear (R_eval_1);
2516 : #endif
2517 : }
2518 :
2519 :
2520 : /* Computes gcd(\prod_{0 <= i < len} (ntt[i + offset] + add[i]), N),
2521 : the NTT residues are converted to integer residues (mod N) first.
2522 : If add == NULL, add[i] is assumed to be 0. */
2523 :
2524 : static void
2525 596 : ntt_gcd (mpz_t f, mpz_t *product, mpzspv_t ntt, const unsigned long ntt_offset,
2526 : const listz_t add, const unsigned long len_param,
2527 : const mpzspm_t ntt_context, mpmod_t modulus_param)
2528 : {
2529 : unsigned long i, j;
2530 596 : const unsigned long Rlen = MPZSPV_NORMALISE_STRIDE;
2531 : listz_t R;
2532 596 : unsigned long len = len_param, thread_offset = 0;
2533 : mpres_t tmpres, tmpprod, totalprod;
2534 : mpmod_t modulus;
2535 : long timestart, realstart;
2536 :
2537 596 : outputf (OUTPUT_VERBOSE, "Computing gcd of coefficients and N");
2538 596 : timestart = cputime ();
2539 596 : realstart = realtime ();
2540 :
2541 : /* All the threads will multiply their partial products to this one. */
2542 596 : mpres_init (totalprod, modulus_param);
2543 596 : mpres_set_ui (totalprod, 1UL, modulus_param);
2544 :
2545 : #ifdef _OPENMP
2546 : #pragma omp parallel if (len > 100) private(i, j, R, len, thread_offset, tmpres, tmpprod, modulus) shared(totalprod)
2547 : {
2548 : const int nr_chunks = omp_get_num_threads();
2549 : const int thread_nr = omp_get_thread_num();
2550 :
2551 : len = (len_param - 1) / nr_chunks + 1;
2552 : thread_offset = thread_nr * len;
2553 : if (thread_offset <= len_param)
2554 : len = MIN(len, len_param - thread_offset);
2555 : else
2556 : len = 0;
2557 : #pragma omp master
2558 : {
2559 : outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2560 : }
2561 : #endif
2562 :
2563 : /* Make a private copy of the mpmod_t struct */
2564 596 : mpmod_init_set (modulus, modulus_param);
2565 :
2566 596 : R = init_list2 (Rlen, (mpz_size (modulus->orig_modulus) + 2) *
2567 : GMP_NUMB_BITS);
2568 596 : mpres_init (tmpres, modulus);
2569 596 : mpres_init (tmpprod, modulus);
2570 596 : mpres_set_ui (tmpprod, 1UL, modulus);
2571 :
2572 46665 : for (i = 0; i < len; i += Rlen)
2573 : {
2574 46069 : const unsigned long blocklen = MIN(len - i, Rlen);
2575 :
2576 : /* Convert blocklen residues from NTT to integer representatives
2577 : and store them in R */
2578 46069 : mpzspv_to_mpzv (ntt, ntt_offset + thread_offset + i, R, blocklen,
2579 : ntt_context);
2580 :
2581 : /* Accumulate product in tmpprod */
2582 5894909 : for (j = 0; j < blocklen; j++)
2583 : {
2584 5848840 : outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[j]);
2585 5848840 : if (add != NULL)
2586 183680 : mpz_add (R[j], R[j], add[i + thread_offset + j]);
2587 5848840 : mpres_set_z_for_gcd (tmpres, R[j], modulus);
2588 : #define TEST_ZERO_RESULT
2589 : #ifdef TEST_ZERO_RESULT
2590 5848840 : if (mpres_is_zero (tmpres, modulus))
2591 265 : outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
2592 : #endif
2593 5848840 : mpres_mul (tmpprod, tmpprod, tmpres, modulus);
2594 : }
2595 : }
2596 : #ifdef _OPENMP
2597 : #pragma omp critical
2598 : {
2599 : mpres_mul (totalprod, totalprod, tmpprod, modulus);
2600 : }
2601 : #else
2602 596 : mpres_set (totalprod, tmpprod, modulus);
2603 : #endif
2604 596 : mpres_clear (tmpres, modulus);
2605 596 : mpres_clear (tmpprod, modulus);
2606 596 : mpmod_clear (modulus);
2607 596 : clear_list (R, Rlen);
2608 : #ifdef _OPENMP
2609 : }
2610 : #endif
2611 :
2612 : {
2613 : mpz_t n;
2614 596 : mpz_init (n);
2615 596 : mpz_set_ui (n, len_param);
2616 596 : mpres_set_z_for_gcd_fix (totalprod, totalprod, n, modulus_param);
2617 596 : mpz_clear (n);
2618 : }
2619 :
2620 596 : if (product != NULL)
2621 53 : mpres_get_z (*product, totalprod, modulus_param);
2622 :
2623 596 : mpres_gcd (f, totalprod, modulus_param);
2624 596 : mpres_clear (totalprod, modulus_param);
2625 :
2626 596 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2627 596 : }
2628 :
2629 :
2630 : int
2631 59 : pm1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
2632 : const faststage2_param_t *params)
2633 : {
2634 : unsigned long phiP, nr;
2635 : unsigned long i, l, lenF, lenG, lenR, tmplen;
2636 : sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
2637 : progressions of prime length */
2638 : set_long_t *S_2; /* This is stored as a regular set */
2639 : listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
2640 : degree s_1. It is symmetric, so has only s_1 / 2 + 1
2641 : distinct coefficients. The sequence h_j will be stored in
2642 : the same memory and won't be a monic polynomial, so the
2643 : leading 1 monomial of F will be stored explicitly. Hence we
2644 : need s_1 / 2 + 1 entries. */
2645 : listz_t g, h, tmp, R;
2646 : mpz_t mt; /* All-purpose temp mpz_t */
2647 : mpres_t mr; /* All-purpose temp mpres_t */
2648 59 : int youpi = ECM_NO_FACTOR_FOUND;
2649 : long timetotalstart, realtotalstart, timestart;
2650 :
2651 59 : timetotalstart = cputime ();
2652 59 : realtotalstart = realtime ();
2653 :
2654 59 : phiP = eulerphi (params->P);
2655 59 : ASSERT_ALWAYS (phiP == params->s_1 * params->s_2);
2656 59 : ASSERT_ALWAYS (params->s_1 < params->l);
2657 59 : nr = params->l - params->s_1; /* Number of points we evaluate */
2658 :
2659 59 : if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
2660 0 : return ECM_ERROR;
2661 :
2662 : /* Allocate all the memory we'll need */
2663 : /* Allocate the correct amount of space for each mpz_t or the
2664 : reallocations will up to double the time for stage 2! */
2665 59 : mpz_init (mt);
2666 59 : mpres_init (mr, modulus);
2667 59 : lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
2668 : the leading 1 monomial for each factor */
2669 59 : F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
2670 59 : h = malloc ((params->s_1 + 1) * sizeof (mpz_t));
2671 59 : if (h == NULL)
2672 : {
2673 0 : fprintf (stderr, "Cannot allocate memory in pm1fs2\n");
2674 0 : exit (1);
2675 : }
2676 59 : lenG = params->l;
2677 59 : g = init_list2 (lenG, (unsigned int) abs (modulus->bits));
2678 59 : lenR = nr;
2679 59 : R = init_list2 (lenR, (unsigned int) abs (modulus->bits));
2680 59 : tmplen = 3UL * params->l + list_mul_mem (params->l / 2);
2681 59 : outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
2682 59 : if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
2683 : {
2684 2 : tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
2685 : /* FIXME: It appears TMulGen_space() returns a too small value! */
2686 2 : outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
2687 : tmplen);
2688 : }
2689 : #ifdef SHOW_TMP_USAGE
2690 : tmp = init_list (tmplen);
2691 : #else
2692 59 : tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
2693 : #endif
2694 :
2695 59 : mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
2696 59 : outputf (OUTPUT_TRACE,
2697 : "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
2698 59 : modulus->orig_modulus, mt);
2699 :
2700 :
2701 : /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2k_1}) */
2702 59 : outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
2703 :
2704 59 : timestart = cputime ();
2705 :
2706 : /* First compute X + 1/X */
2707 59 : mpres_invert (mr, X, modulus);
2708 59 : mpres_add (mr, mr, X, modulus);
2709 :
2710 59 : i = poly_from_sets_V (F, mr, S_1, tmp, tmplen, modulus, NULL, NULL);
2711 59 : ASSERT_ALWAYS(2 * i == params->s_1);
2712 : ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
2713 59 : free (S_1);
2714 59 : S_1 = NULL;
2715 :
2716 59 : outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2717 59 : if (test_verbose (OUTPUT_TRACE))
2718 : {
2719 662 : for (i = 0; i < params->s_1 / 2 + 1; i++)
2720 661 : outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
2721 1 : outputf (OUTPUT_TRACE, "f(x) = f_0");
2722 661 : for (i = 1; i < params->s_1 / 2 + 1; i++)
2723 660 : outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
2724 1 : outputf (OUTPUT_TRACE, "/* PARI */ \n");
2725 : }
2726 :
2727 59 : mpz_set_ui (mt, params->P);
2728 59 : mpres_pow (mr, X, mt, modulus); /* mr = X^P */
2729 59 : pm1_sequence_h (F, NULL, F, mr, params->s_1 / 2 + 1, modulus, NULL);
2730 :
2731 : /* Make a symmetric copy of F in h. It will have length
2732 : s_1 + 1 = 2*lenF - 1 */
2733 : /* I.e. with F = [3, 2, 1], s_1 = 4, we want h = [1, 2, 3, 2, 1] */
2734 41069 : for (i = 0; i < params->s_1 / 2 + 1; i++)
2735 41010 : *(h[i]) = *(F[params->s_1 / 2 - i]); /* Clone the mpz_t. */
2736 41010 : for (i = 0; i < params->s_1 / 2; i++)
2737 40951 : *(h[i + params->s_1 / 2 + 1]) = *(F[i + 1]);
2738 59 : if (test_verbose (OUTPUT_TRACE))
2739 : {
2740 1322 : for (i = 0; i < params->s_1 + 1; i++)
2741 1321 : outputf (OUTPUT_VERBOSE, "h_%lu = %Zd; /* PARI */\n", i, h[i]);
2742 1 : outputf (OUTPUT_VERBOSE, "h(x) = h_0");
2743 1321 : for (i = 1; i < params->s_1 + 1; i++)
2744 1320 : outputf (OUTPUT_VERBOSE, " + h_%lu * x^%lu", i, i);
2745 1 : outputf (OUTPUT_VERBOSE, " /* PARI */\n");
2746 : }
2747 :
2748 146 : for (l = 0; l < params->s_2; l++)
2749 : {
2750 127 : const unsigned long M = params->l - 1L - params->s_1 / 2L;
2751 127 : outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
2752 127 : l + 1, params->s_2);
2753 127 : pm1_sequence_g (g, NULL, X, params->P, M, params->l,
2754 127 : params->m_1, S_2->elem[l], modulus, NULL);
2755 :
2756 : /* Do the convolution */
2757 : /* Use the transposed "Middle Product" algorithm */
2758 : /* TMulGen reverses the first input sequence, but that doesn't matter
2759 : since h is symmetric. */
2760 :
2761 127 : outputf (OUTPUT_VERBOSE, "TMulGen of g and h");
2762 127 : timestart = cputime ();
2763 : ASSERT(tmplen >= TMulGen_space (nr - 1, params->l - 1, params->s_1));
2764 :
2765 : /* Computes rev(h)*g, stores coefficients of x^(s_1) to
2766 : x^(s_1+nr-1) = x^(len-1) */
2767 127 : if (TMulGen (R, nr - 1, h, params->s_1, g, params->l - 1, tmp,
2768 127 : modulus->orig_modulus) < 0)
2769 : {
2770 0 : outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
2771 : "of memory)\n");
2772 0 : youpi = ECM_ERROR;
2773 0 : break;
2774 : }
2775 127 : list_mod (R, R, nr, modulus->orig_modulus);
2776 :
2777 127 : outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2778 :
2779 : #if 0 && defined(WANT_ASSERT)
2780 :
2781 : /* See if R[i] is correct, with a test that works even if i0 != 0 */
2782 : /* More expensive self-test */
2783 : /* alpha = beta*(i0 + l*nr) */
2784 : /* This code is old and probably does not work. */
2785 :
2786 : outputf (OUTPUT_VERBOSE, "Verifying all results (slow)");
2787 : for (i = 0; i < nr; i++)
2788 : {
2789 : mpz_set_ui (mt, nr * l);
2790 : mpz_add (mt, mt, root_params->i0);
2791 : mpz_add_ui (mt, mt, i);
2792 : mpz_mul_ui (mt, mt, beta);
2793 : mpres_get_z (tmp[0], X, modulus);
2794 : mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus);
2795 : /* Hence, tmp[0] = X^(alpha + i * beta) */
2796 : list_eval_poly (tmp[1], F, tmp[0], dF, 1, modulus->orig_modulus,
2797 : tmp + 2);
2798 :
2799 : mpz_set_ui (mt, i);
2800 : mpz_mul_ui (mt, mt, i);
2801 : mpz_mul_ui (mt, mt, beta / 2); /* h(i) = beta*i^2/2 */
2802 : mpres_get_z (tmp[0], X, modulus);
2803 : mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus); /* X^h(1) */
2804 : mpz_mul (tmp[0], tmp[0], R[i]);
2805 : mpz_mod (tmp[0], tmp[0], modulus->orig_modulus);
2806 : if (mpz_cmp (tmp[0], tmp[1]) != 0)
2807 : {
2808 : outputf (OUTPUT_ERROR, "Result in R[%ld] incorrect.\n", i);
2809 : outputf (OUTPUT_ERROR, "R[%ld] = %Zd\n", i, R[i]);
2810 : abort ();
2811 : }
2812 : }
2813 : outputf (OUTPUT_VERBOSE, " - everything's correct! :-D\n");
2814 : #endif
2815 :
2816 127 : if (test_verbose (OUTPUT_TRACE))
2817 : {
2818 4968 : for (i = 0; i < nr; i++)
2819 4964 : outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[i]);
2820 : }
2821 :
2822 127 : outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)");
2823 127 : timestart = cputime ();
2824 :
2825 : {
2826 : mpres_t tmpres, tmpprod;
2827 127 : mpres_init (tmpres, modulus);
2828 127 : mpres_init (tmpprod, modulus);
2829 127 : mpres_set_z_for_gcd (tmpprod, R[0], modulus);
2830 493155 : for (i = 1; i < nr; i++)
2831 : {
2832 493028 : mpres_set_z_for_gcd (tmpres, R[i], modulus);
2833 493028 : mpres_mul (tmpprod, tmpprod, tmpres, modulus);
2834 : }
2835 127 : mpres_get_z (tmp[1], tmpprod, modulus); /* For printing */
2836 127 : mpres_gcd (tmp[0], tmpprod, modulus);
2837 127 : mpres_clear (tmpprod, modulus);
2838 127 : mpres_clear (tmpres, modulus);
2839 : }
2840 :
2841 127 : outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2842 127 : outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", tmp[1]);
2843 :
2844 127 : if (mpz_cmp_ui (tmp[0], 1UL) > 0)
2845 : {
2846 40 : mpz_set (f, tmp[0]);
2847 40 : youpi = ECM_FACTOR_FOUND_STEP2;
2848 40 : break;
2849 : }
2850 : }
2851 :
2852 : #ifdef SHOW_TMP_USAGE
2853 : for (i = tmplen - 1; i > 0; i--)
2854 : if (tmp[i]->_mp_alloc > 1)
2855 : break;
2856 : outputf (OUTPUT_DEVVERBOSE, "Highest used temp element is tmp[%lu]\n", i);
2857 : #endif
2858 :
2859 59 : free (S_2);
2860 59 : free (h);
2861 59 : clear_list (F, lenF);
2862 59 : clear_list (g, lenG);
2863 59 : clear_list (R, lenR);
2864 59 : clear_list (tmp, tmplen);
2865 :
2866 59 : mpz_clear (mt);
2867 59 : mpres_clear (mr, modulus);
2868 :
2869 59 : outputf (OUTPUT_NORMAL, "Step 2");
2870 : /* In normal output mode, print only cpu time as we always have.
2871 : In verbose mode, print real time as well if we used multi-threading */
2872 59 : if (test_verbose (OUTPUT_VERBOSE))
2873 16 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
2874 : else
2875 43 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
2876 :
2877 59 : return youpi;
2878 : }
2879 :
2880 :
2881 : int
2882 187 : pm1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
2883 : const faststage2_param_t *params)
2884 : {
2885 : unsigned long nr;
2886 : unsigned long l, lenF;
2887 : sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
2888 : progressions of prime length) */
2889 : set_long_t *S_2; /* This is stored as a regular set */
2890 : listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
2891 : degree s_1. It is symmetric, so has only s_1 / 2 + 1
2892 : distinct coefficients. The sequence h_j will be stored in
2893 : the same memory and won't be a monic polynomial, so the
2894 : leading 1 monomial of F will be stored explicitly. Hence we
2895 : need s_1 / 2 + 1 entries. */
2896 : mpzspm_t ntt_context;
2897 : mpzspv_t g_ntt, h_ntt;
2898 : mpz_t mt; /* All-purpose temp mpz_t */
2899 : mpz_t product; /* Product of each multi-point evaluation */
2900 187 : mpz_t *product_ptr = NULL;
2901 : mpres_t tmpres; /* All-purpose temp mpres_t */
2902 187 : int youpi = ECM_NO_FACTOR_FOUND;
2903 : long timetotalstart, realtotalstart, timestart, realstart;
2904 :
2905 187 : timetotalstart = cputime ();
2906 187 : realtotalstart = realtime ();
2907 :
2908 187 : ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
2909 187 : ASSERT_ALWAYS (params->s_1 < params->l);
2910 187 : nr = params->l - params->s_1; /* Number of points we evaluate */
2911 :
2912 : /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
2913 : with g. We need NTT of transform length l. We do it here at the start
2914 : of stage 2 so that in case of a "not enough primes" condition,
2915 : we don't have to wait until after F is built to get the error. */
2916 :
2917 187 : ntt_context = mpzspm_init (params->l, modulus->orig_modulus);
2918 187 : if (ntt_context == NULL)
2919 : {
2920 0 : outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
2921 : "presumably out of memory\n");
2922 0 : return ECM_ERROR;
2923 : }
2924 :
2925 187 : print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
2926 : ntt_context);
2927 :
2928 187 : if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
2929 0 : return ECM_ERROR;
2930 :
2931 : /* Allocate all the memory we'll need for building f */
2932 187 : mpz_init (mt);
2933 187 : mpres_init (tmpres, modulus);
2934 187 : lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
2935 : the leading 1 monomial for each factor */
2936 : {
2937 : /* in some cases the above value of lenF is not enough, for example with
2938 : s_1 = 10, which gives lenF = 7, but 9 entries are needed */
2939 187 : unsigned long mem = mem_poly_from_sets_V (S_1);
2940 187 : if (mem > lenF)
2941 20 : lenF = mem;
2942 : }
2943 :
2944 187 : F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
2945 :
2946 187 : mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
2947 187 : outputf (OUTPUT_TRACE,
2948 : "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
2949 187 : modulus->orig_modulus, mt);
2950 :
2951 : #if 0 && defined (WANT_ASSERT)
2952 : /* For this self test run with a large enough B2 so that enough memory
2953 : is allocated for tmp and F_ntt, otherwise it segfaults. */
2954 : {
2955 : int testlen = 255;
2956 : int i, j;
2957 : /* A test of ntt_sqr_reciprocal() */
2958 : for (j = 1; j <= testlen; j++)
2959 : {
2960 : outputf (OUTPUT_VERBOSE,
2961 : "Testing ntt_sqr_reciprocal() for input degree %d\n",
2962 : j - 1);
2963 : for (i = 0; i < j; i++)
2964 : mpz_set_ui (tmp[i], 1UL);
2965 : ntt_sqr_reciprocal (tmp, tmp, F_ntt, (spv_size_t) j, ntt_context_F);
2966 : for (i = 0; i < 2 * j - 1; i++)
2967 : {
2968 : ASSERT (mpz_cmp_ui (tmp[i], 2 * j - 1 - i) == 0);
2969 : }
2970 : }
2971 : outputf (OUTPUT_VERBOSE,
2972 : "Test of ntt_sqr_reciprocal() for input degree 2 ... %d passed\n",
2973 : testlen - 1);
2974 : }
2975 : #endif
2976 :
2977 :
2978 : /* First compute X + 1/X */
2979 187 : mpres_invert (tmpres, X, modulus);
2980 187 : mpres_add (tmpres, tmpres, X, modulus);
2981 :
2982 187 : if (build_F_ntt (F, tmpres, S_1, params, modulus) == ECM_ERROR)
2983 : {
2984 0 : free (S_1);
2985 0 : free (S_2);
2986 0 : mpz_clear (mt);
2987 0 : mpres_clear (tmpres, modulus);
2988 0 : mpzspm_clear (ntt_context);
2989 0 : clear_list (F, lenF);
2990 0 : return ECM_ERROR;
2991 : }
2992 :
2993 187 : free (S_1);
2994 187 : S_1 = NULL;
2995 :
2996 187 : h_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
2997 :
2998 187 : mpz_set_ui (mt, params->P);
2999 187 : mpres_pow (tmpres, X, mt, modulus); /* tmpres = X^P */
3000 187 : pm1_sequence_h (NULL, h_ntt, F, tmpres, params->s_1 / 2 + 1, modulus,
3001 : ntt_context);
3002 :
3003 187 : clear_list (F, lenF);
3004 187 : g_ntt = mpzspv_init (params->l, ntt_context);
3005 :
3006 : /* Compute the DCT-I of h */
3007 187 : outputf (OUTPUT_VERBOSE, "Computing DCT-I of h");
3008 : #ifdef _OPENMP
3009 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
3010 : #endif
3011 187 : timestart = cputime ();
3012 187 : realstart = realtime ();
3013 :
3014 187 : mpzspv_to_dct1 (h_ntt, h_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
3015 : g_ntt, ntt_context);
3016 187 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3017 :
3018 187 : if (test_verbose (OUTPUT_RESVERBOSE))
3019 : {
3020 17 : mpz_init (product);
3021 17 : product_ptr = &product;
3022 : }
3023 :
3024 371 : for (l = 0; l < params->s_2; l++)
3025 : {
3026 325 : const unsigned long M = params->l - 1L - params->s_1 / 2L;
3027 :
3028 325 : outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
3029 325 : l + 1, params->s_2);
3030 : /* Compute the coefficients of the polynomial g(x) */
3031 325 : pm1_sequence_g (NULL, g_ntt, X, params->P, M, params->l,
3032 325 : params->m_1, S_2->elem[l], modulus, ntt_context);
3033 :
3034 : /* Do the convolution */
3035 325 : outputf (OUTPUT_VERBOSE, "Computing g*h");
3036 : #ifdef _OPENMP
3037 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
3038 : #endif
3039 325 : timestart = cputime ();
3040 325 : realstart = realtime ();
3041 325 : mpzspv_mul_by_dct (g_ntt, h_ntt, params->l, ntt_context,
3042 : NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
3043 325 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3044 :
3045 : /* Compute GCD of N and coefficients of product polynomial */
3046 325 : ntt_gcd (mt, product_ptr, g_ntt, params->s_1 / 2, NULL, nr, ntt_context,
3047 : modulus);
3048 :
3049 325 : outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
3050 :
3051 : /* If we found a factor, stop */
3052 325 : if (mpz_cmp_ui (mt, 1UL) > 0)
3053 : {
3054 141 : mpz_set (f, mt);
3055 141 : youpi = ECM_FACTOR_FOUND_STEP2;
3056 141 : break;
3057 : }
3058 : }
3059 :
3060 187 : if (test_verbose (OUTPUT_RESVERBOSE))
3061 : {
3062 17 : product_ptr = NULL;
3063 17 : mpz_clear (product);
3064 : }
3065 187 : mpzspv_clear (g_ntt, ntt_context);
3066 187 : mpzspv_clear (h_ntt, ntt_context);
3067 187 : mpzspm_clear (ntt_context);
3068 187 : mpres_clear (tmpres, modulus);
3069 187 : mpz_clear (mt);
3070 187 : free (S_2);
3071 :
3072 187 : outputf (OUTPUT_NORMAL, "Step 2");
3073 : /* In normal output mode, print only cpu time as we always have.
3074 : In verbose mode, print real time as well if we used multi-threading */
3075 187 : if (test_verbose (OUTPUT_VERBOSE))
3076 29 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
3077 : else
3078 158 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
3079 :
3080 187 : return youpi;
3081 : }
3082 :
3083 :
3084 : static void
3085 80 : gfp_ext_print (const mpres_t r_x, const mpres_t r_y, mpmod_t modulus,
3086 : const int verbose)
3087 : {
3088 : mpz_t t1, t2;
3089 :
3090 80 : if (!test_verbose (verbose))
3091 0 : return;
3092 :
3093 80 : mpz_init (t1);
3094 80 : mpz_init (t2);
3095 80 : mpres_get_z (t1, r_x, modulus);
3096 80 : mpres_get_z (t2, r_y, modulus);
3097 80 : outputf (verbose, "Mod(%Zd, N) + Mod(%Zd, N) * w", t1, t2);
3098 :
3099 80 : mpz_clear (t1);
3100 80 : mpz_clear (t2);
3101 : }
3102 :
3103 :
3104 :
3105 : /* Multiplies (a_0 + a_1*sqrt(Delta)) * (b_0 + b_1*sqrt(Delta))
3106 : using four multiplications. Result goes in (r_0 + r_1*sqrt(Delta)).
3107 : a_0, b_0, r_0 as well as a_1, b_1, r_1 may overlap arbitrarily. t[0], t[1],
3108 : t[2] and Delta must not overlap with anything. */
3109 : /* FIXME: is there a faster multiplication routine if both inputs have
3110 : norm 1? */
3111 :
3112 : static void
3113 14396 : gfp_ext_mul (mpres_t r_0, mpres_t r_1, const mpres_t a_0, const mpres_t a_1,
3114 : const mpres_t b_0, const mpres_t b_1, const mpres_t Delta,
3115 : mpmod_t modulus, ATTRIBUTE_UNUSED const unsigned long tmplen,
3116 : mpres_t *tmp)
3117 : {
3118 : ASSERT (tmplen >= 2);
3119 : if (0 && test_verbose (OUTPUT_TRACE))
3120 : {
3121 : mpz_t t;
3122 : mpz_init (t);
3123 : mpres_get_z (t, Delta, modulus);
3124 : outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ w = quadgen (4*%Zd); "
3125 : "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3126 : mpz_clear (t);
3127 : outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ (");
3128 : gfp_ext_print (a_0, a_1, modulus, OUTPUT_TRACE);
3129 : outputf (OUTPUT_TRACE, ") * (");
3130 : gfp_ext_print (b_0, b_1, modulus, OUTPUT_TRACE);
3131 : }
3132 :
3133 14396 : mpres_add (tmp[0], a_0, a_1, modulus);
3134 14396 : mpres_add (tmp[1], b_0, b_1, modulus);
3135 14396 : mpres_mul (tmp[1], tmp[0], tmp[1], modulus); /* t[1] = (a_0+a_1)*(b_0+b_1) =
3136 : a_0*b_0 + a_0*b_1 + a_1*b_0 +
3137 : a_1*b_1 */
3138 :
3139 14396 : mpres_mul (r_0, a_0, b_0, modulus); /* r_0 = a_0*b_0. We don't need a_0
3140 : or b_0 any more now */
3141 14396 : mpres_sub (tmp[1], tmp[1], r_0, modulus); /* t[1] = a_0*b_1 + a_1*b_0 +
3142 : a_1*b_1 */
3143 :
3144 14396 : mpres_mul (tmp[0], a_1, b_1, modulus); /* t[0] = a_1*b_1. We don't need
3145 : a_1 or b_1 any more now */
3146 14396 : mpres_sub (r_1, tmp[1], tmp[0], modulus); /* r_1 == a_0*b_1 + a_1*b_0 */
3147 :
3148 14396 : mpres_mul (tmp[0], tmp[0], Delta, modulus); /* t[0] = a_1*b_1*Delta */
3149 14396 : mpres_add (r_0, r_0, tmp[0], modulus); /* r_0 = a_0*b_0 + a_1*b_1*Delta */
3150 :
3151 : if (0 && test_verbose (OUTPUT_TRACE))
3152 : {
3153 : outputf (OUTPUT_TRACE, ") == ");
3154 : gfp_ext_print (r_0, r_1, modulus, OUTPUT_TRACE);
3155 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3156 : }
3157 14396 : }
3158 :
3159 :
3160 : /* Computes (a_0 + a_1 * sqrt(Delta))^2, where the norm
3161 : (a_0^2 - a_1^2*Delta) is assumed to be equal to 1. Hence
3162 : (a_0 + a_1 * sqrt(Delta))^2 = a_0^2 + 2*a_0*a_1*sqrt(Delta) + a_1^2*Delta
3163 : and a_0^2 + a_1^2*Delta = a_0^2 + a_1^2*Delta + norm - 1 = 2*a_0^2 - 1.
3164 : a_0 and r_0, as well as a_1 and r_1 may overlap */
3165 :
3166 : static void
3167 23998 : gfp_ext_sqr_norm1 (mpres_t r_0, mpres_t r_1, const mpres_t a_0,
3168 : const mpres_t a_1, mpmod_t modulus)
3169 : {
3170 : ASSERT (a_0 != r_1); /* a_0 is read after r_1 is written */
3171 :
3172 23998 : if (pari)
3173 0 : gmp_printf ("/* gfp_ext_sqr_norm1 */ (%Zd + %Zd * w)^2 %% N == ", a_0, a_1);
3174 :
3175 23998 : mpres_mul (r_1, a_0, a_1, modulus);
3176 23998 : mpres_add (r_1, r_1, r_1, modulus); /* r_1 = 2*a_0*a_1 */
3177 :
3178 23998 : mpres_sqr (r_0, a_0, modulus);
3179 23998 : mpres_add (r_0, r_0, r_0, modulus);
3180 23998 : mpres_sub_ui (r_0, r_0, 1UL, modulus); /* r_0 = 2*a_0^2 - 1 */
3181 :
3182 23998 : if (pari)
3183 0 : gmp_printf ("(%Zd + %Zd * w) %% N /* PARI C */\n", r_0, r_1);
3184 23998 : }
3185 :
3186 :
3187 : /* Raise (a0 + a1*sqrt(Delta)) to the power e which is a signed long int.
3188 : (a0 + a1*sqrt(Delta)) is assumed to have norm 1, i.e.
3189 : a0^2 - a1^2*Delta == 1. The result is (r0 * r1*sqrt(Delta)).
3190 : a0, a1, r0 and r1 must not overlap */
3191 :
3192 : static void
3193 2682 : gfp_ext_pow_norm1_sl (mpres_t r0, mpres_t r1, const mpres_t a0,
3194 : const mpres_t a1, const long e, const mpres_t Delta,
3195 : mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
3196 : {
3197 2682 : const unsigned long abs_e = labs (e);
3198 2682 : unsigned long mask = ~0UL - (~0UL >> 1);
3199 :
3200 : ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
3201 :
3202 2682 : if (e == 0)
3203 : {
3204 468 : mpres_set_ui (r0, 1UL, modulus);
3205 468 : mpres_set_ui (r1, 0UL, modulus);
3206 468 : return;
3207 : }
3208 :
3209 : /* If e < 0, we want 1/(a0 + a1*sqrt(Delta)). By extending with
3210 : a0 - a1*sqrt(Delta), we get
3211 : (a0 - a1*sqrt(Delta)) / (a0^2 - a1^2 * Delta), but that denomiator
3212 : is the norm which is known to be 1, so the result is
3213 : a0 - a1*sqrt(Delta). */
3214 :
3215 122902 : while ((abs_e & mask) == 0UL)
3216 120688 : mask >>= 1;
3217 :
3218 2214 : mpres_set (r0, a0, modulus);
3219 2214 : mpres_set (r1, a1, modulus);
3220 :
3221 21008 : while (mask > 1UL)
3222 : {
3223 18794 : gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
3224 18794 : mask >>= 1;
3225 18794 : if (abs_e & mask)
3226 9070 : gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
3227 : }
3228 :
3229 2214 : if (e < 0)
3230 0 : mpres_neg (r1, r1, modulus);
3231 :
3232 : if (0 && test_verbose (OUTPUT_TRACE))
3233 : {
3234 : mpz_t t;
3235 : mpz_init (t);
3236 : mpres_get_z (t, Delta, modulus);
3237 : outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ w = quadgen (4*%Zd); "
3238 : "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3239 : mpz_clear (t);
3240 : outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ (");
3241 : gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
3242 : outputf (OUTPUT_TRACE, ")^(%ld) == ", e);
3243 : gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
3244 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3245 : }
3246 : }
3247 :
3248 :
3249 : /* Same, but taking an mpz_t argument for the exponent */
3250 :
3251 : static void
3252 330 : gfp_ext_pow_norm1 (mpres_t r0, mpres_t r1, const mpres_t a0,
3253 : const mpres_t a1, mpz_t e, const mpres_t Delta,
3254 : mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
3255 : {
3256 : mpz_t abs_e;
3257 : size_t idx;
3258 :
3259 : ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
3260 :
3261 330 : if (mpz_sgn (e) == 0)
3262 : {
3263 0 : mpres_set_ui (r0, 1UL, modulus);
3264 0 : mpres_set_ui (r1, 0UL, modulus);
3265 0 : return;
3266 : }
3267 :
3268 330 : mpz_init (abs_e);
3269 330 : mpz_abs (abs_e, e);
3270 330 : idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus ecm_tstbit (abs_e, idx) == 1 */
3271 : ASSERT (ecm_tstbit (abs_e, idx) == 1);
3272 :
3273 330 : mpres_set (r0, a0, modulus);
3274 330 : mpres_set (r1, a1, modulus);
3275 :
3276 4736 : while (idx > 0UL)
3277 : {
3278 4406 : gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
3279 4406 : idx--;
3280 4406 : if (ecm_tstbit (abs_e, idx))
3281 2410 : gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
3282 : }
3283 :
3284 330 : if (mpz_sgn (e) < 0)
3285 21 : mpres_neg (r1, r1, modulus);
3286 :
3287 330 : mpz_clear (abs_e);
3288 :
3289 330 : if (test_verbose (OUTPUT_TRACE))
3290 : {
3291 : mpz_t t;
3292 10 : mpz_init (t);
3293 10 : mpres_get_z (t, Delta, modulus);
3294 10 : outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ w = quadgen (4*%Zd); "
3295 10 : "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3296 10 : mpz_clear (t);
3297 10 : outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ (");
3298 10 : gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
3299 10 : outputf (OUTPUT_TRACE, ")^(%Zd) == ", e);
3300 10 : gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
3301 10 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3302 : }
3303 : }
3304 :
3305 : #if 0
3306 : /* Compute r[i] = a^((k+i)^2) for i = 0, 1, ..., l-1, where "a" is an
3307 : element of norm 1 in the quadratic extension ring */
3308 :
3309 : static void
3310 : gfp_ext_rn2 (mpres_t *r_x, mpres_t *r_y, const mpres_t a_x, const mpres_t a_y,
3311 : const long k, const unsigned long l, const mpres_t Delta,
3312 : mpmod_t modulus, const unsigned long origtmplen, mpres_t *origtmp)
3313 : {
3314 : mpres_t *r2_x = origtmp, *r2_y = origtmp + 2, *v = origtmp + 4,
3315 : *V2 = origtmp + 6;
3316 : const unsigned long newtmplen = origtmplen - 7;
3317 : mpres_t *newtmp = origtmp + 7;
3318 : unsigned long i;
3319 :
3320 : if (l == 0UL)
3321 : return;
3322 :
3323 : ASSERT (origtmplen >= 8UL);
3324 :
3325 : if (pari)
3326 : gmp_printf ("/* In gfp_ext_rn2 */ ; a = %Zd + %Zd * w; /* PARI */\n",
3327 : a_x, a_y, modulus->orig_modulus);
3328 :
3329 : /* Compute r[0] = a^(k^2). We do it by two exponentiations by k and use
3330 : v[0] and v[1] as temp storage */
3331 : gfp_ext_pow_norm1_sl (v[0], v[1], a_x, a_y, k, Delta, modulus, newtmplen,
3332 : newtmp);
3333 : gfp_ext_pow_norm1_sl (r_x[0], r_y[0], v[0], v[1], k, Delta, modulus,
3334 : newtmplen, newtmp);
3335 : if (pari)
3336 : gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
3337 : "/* PARI C */\n", k, r_x[0], r_y[0]);
3338 :
3339 : /* Compute r[1] = a^((k+1)^2) = a^(k^2 + 2k + 1)*/
3340 : if (l > 1)
3341 : {
3342 : /* v[0] + v[1]*sqrt(Delta) still contains a^k */
3343 : gfp_ext_sqr_norm1 (r_x[1], r_y[1], v[0], v[1], modulus);
3344 : /* Now r[1] = a^(2k) */
3345 : gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], r_x[0], r_y[0], Delta,
3346 : modulus, newtmplen, newtmp);
3347 : /* Now r[1] = a^(k^2 + 2k) */
3348 : gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], a_x, a_y, Delta, modulus,
3349 : newtmplen, newtmp);
3350 : /* Now r[1] = a^(k^2 + 2k + 1) = a^((k+1)^2) */
3351 : }
3352 : if (pari)
3353 : gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
3354 : "/* PARI C */\n", k + 1, r_x[1], r_y[1]);
3355 :
3356 : /* Compute r2[0] = a^(k^2+2) = a^(k^2) * a^2 */
3357 : gfp_ext_sqr_norm1 (v[0], v[1], a_x, a_y, modulus);
3358 : gfp_ext_mul (r2_x[0], r2_y[0], r_x[0], r_y[0], v[0], v[1], Delta, modulus,
3359 : newtmplen, newtmp);
3360 : if (pari)
3361 : gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
3362 : "/* PARI C */\n", k, r2_x[0], r2_y[0]);
3363 : /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
3364 : gfp_ext_mul (r2_x[1], r2_y[1], r_x[1], r_y[1], v[0], v[1], Delta, modulus,
3365 : newtmplen, newtmp);
3366 : if (pari)
3367 : gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
3368 : "/* PARI C */\n", k + 1, r2_x[1], r2_y[1]);
3369 :
3370 : /* Compute V_2(a + 1/a). Since 1/a = a_x - a_y, we have a+1/a = 2*a_x.
3371 : V_2(x) = x^2 - 2, so we want 4*a_x^2 - 2. */
3372 : mpres_add (*V2, a_x, a_x, modulus); /* V2 = a + 1/a = 2*a_x*/
3373 : V (v[0], *V2, 2 * k + 1, modulus); /* v[0] = V_{2k+1} (a + 1/a) */
3374 : V (v[1], *V2, 2 * k + 3, modulus); /* v[0] = V_{2k+3} (a + 1/a) */
3375 : mpres_sqr (*V2, *V2, modulus); /* V2 = 4*a_x^2 */
3376 : mpres_sub_ui (*V2, *V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
3377 : if (pari)
3378 : {
3379 : gmp_printf ("/* In gfp_ext_rn2 */ ((a + 1/a)^2 - 2) %% N == "
3380 : "%Zd %% N /* PARI C */\n", *V2);
3381 : gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3382 : "/* PARI C */\n", 2 * k + 1, v[0]);
3383 : gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3384 : "/* PARI C */\n", 2 * k + 3, v[1]);
3385 : }
3386 :
3387 : /* Compute the remaining a^((k+i)^2) values according to Peter's
3388 : recurrence */
3389 :
3390 : for (i = 2; i < l; i++)
3391 : {
3392 : /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i taken
3393 : modulo 2 */
3394 : mpres_mul (r_x[i], r2_x[1 - i % 2], v[i % 2], modulus);
3395 : mpres_sub (r_x[i], r_x[i], r2_x[i % 2], modulus);
3396 : mpres_mul (r_y[i], r2_y[1 - i % 2], v[i % 2], modulus);
3397 : mpres_sub (r_y[i], r_y[i], r2_y[i % 2], modulus);
3398 :
3399 : /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
3400 : mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
3401 : mpres_sub (r2_x[i % 2], r2_x[i % 2], r_x[i - 2], modulus);
3402 : mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
3403 : mpres_sub (r2_y[i % 2], r2_y[i % 2], r_y[i - 2], modulus);
3404 :
3405 : /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3406 : mpres_mul (newtmp[0], v[1 - i % 2], *V2, modulus);
3407 : mpres_sub (v[i % 2], newtmp[0], v[i % 2], modulus);
3408 : if (pari)
3409 : gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3410 : "/* PARI C */\n", 2 * (k + i) + 1, v[i % 2]);
3411 : }
3412 : }
3413 : #endif
3414 :
3415 : /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
3416 : x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P. */
3417 :
3418 : static void
3419 330 : pp1_sequence_g (listz_t g_x, listz_t g_y, mpzspv_t g_x_ntt, mpzspv_t g_y_ntt,
3420 : const mpres_t b1_x, const mpres_t b1_y, const unsigned long P,
3421 : const mpres_t Delta, const long M_param,
3422 : const unsigned long l_param, const mpz_t m_1, const long k_2,
3423 : const mpmod_t modulus_param, const mpzspm_t ntt_context)
3424 : {
3425 330 : const unsigned long tmplen = 3;
3426 330 : const int want_x = (g_x != NULL || g_x_ntt != NULL);
3427 330 : const int want_y = (g_y != NULL || g_y_ntt != NULL);
3428 : mpres_t r_x, r_y, x0_x, x0_y, v2,
3429 : r1_x[2], r1_y[2], r2_x[2], r2_y[2],
3430 : v[2], tmp[3];
3431 : mpz_t mt;
3432 : mpmod_t modulus; /* Thread-local copy of modulus_param */
3433 330 : unsigned long i, l = l_param, offset = 0;
3434 330 : long M = M_param;
3435 : long timestart, realstart;
3436 330 : int want_output = 1;
3437 :
3438 655 : outputf (OUTPUT_VERBOSE, "Computing %s%s%s",
3439 : (want_x) ? "g_x" : "",
3440 325 : (want_x && want_y) ? " and " : "",
3441 : (want_y) ? "g_y" : "");
3442 330 : timestart = cputime ();
3443 330 : realstart = realtime ();
3444 :
3445 : #ifdef _OPENMP
3446 : #pragma omp parallel if (l > 100) private(r_x, r_y, x0_x, x0_y, v2, r1_x, r1_y, r2_x, r2_y, v, tmp, mt, modulus, i, l, offset, M, want_output)
3447 : {
3448 : /* When multi-threading, we adjust the parameters for each thread */
3449 :
3450 : const int nr_chunks = omp_get_num_threads();
3451 : const int thread_nr = omp_get_thread_num();
3452 :
3453 : l = (l_param - 1) / nr_chunks + 1;
3454 : offset = thread_nr * l;
3455 : if (offset <= l_param)
3456 : l = MIN(l, l_param - offset);
3457 : else
3458 : l = 0;
3459 : M = M_param - (long) offset;
3460 :
3461 : want_output = (omp_get_thread_num() == 0);
3462 : if (want_output)
3463 : outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
3464 : #endif
3465 330 : mpmod_init_set (modulus, modulus_param);
3466 330 : mpres_init (r_x, modulus);
3467 330 : mpres_init (r_y, modulus);
3468 330 : mpres_init (x0_x, modulus);
3469 330 : mpres_init (x0_y, modulus);
3470 330 : mpres_init (v2, modulus);
3471 990 : for (i = 0; i < 2UL; i++)
3472 : {
3473 660 : mpres_init (r1_x[i], modulus);
3474 660 : mpres_init (r1_y[i], modulus);
3475 660 : mpres_init (r2_x[i], modulus);
3476 660 : mpres_init (r2_y[i], modulus);
3477 660 : mpres_init (v[i], modulus);
3478 : }
3479 1320 : for (i = 0; i < tmplen; i++)
3480 990 : mpres_init (tmp[i], modulus);
3481 330 : mpz_init (mt);
3482 :
3483 330 : if (want_output && test_verbose (OUTPUT_TRACE))
3484 : {
3485 10 : mpres_get_z (mt, Delta, modulus);
3486 10 : outputf (OUTPUT_TRACE,
3487 : "\n/* pp1_sequence_g */ w = quadgen (4*%Zd); P = %lu; "
3488 : "M = %ld; k_2 = %ld; m_1 = %Zd; N = %Zd; /* PARI */\n",
3489 : mt, P, M, k_2, m_1, modulus->orig_modulus);
3490 :
3491 10 : outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ b_1 = ");
3492 10 : gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
3493 10 : outputf (OUTPUT_TRACE, "; /* PARI */\n");
3494 10 : outputf (OUTPUT_TRACE,
3495 : "/* pp1_sequence_g */ r = b_1^P; /* PARI */\n");
3496 10 : outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
3497 : "x_0 = b_1^(2*k_2 + (2*m_1 + 1) * P); /* PARI */\n");
3498 10 : outputf (OUTPUT_TRACE,
3499 : "/* pp1_sequence_g */ addrec(x) = x + 1/x; /* PARI */\n");
3500 : }
3501 :
3502 : /* Compute r */
3503 330 : gfp_ext_pow_norm1_sl (r_x, r_y, b1_x, b1_y, P, Delta, modulus,
3504 : tmplen, tmp);
3505 330 : if (want_output && test_verbose (OUTPUT_TRACE))
3506 : {
3507 10 : outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ r == ");
3508 10 : gfp_ext_print (r_x, r_y, modulus, OUTPUT_TRACE);
3509 10 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3510 : }
3511 :
3512 : /* Compute x0 = x_0 */
3513 330 : mpz_mul_2exp (mt, m_1, 1UL);
3514 330 : mpz_add_ui (mt, mt, 1UL);
3515 330 : mpz_mul_ui (mt, mt, P);
3516 330 : mpz_add_si (mt, mt, k_2);
3517 330 : mpz_add_si (mt, mt, k_2); /* mt = 2*k_2 + (2*m_1 + 1) * P */
3518 330 : gfp_ext_pow_norm1 (x0_x, x0_y, b1_x, b1_y, mt, Delta, modulus,
3519 : tmplen, tmp);
3520 330 : if (want_output && test_verbose (OUTPUT_TRACE))
3521 : {
3522 10 : outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ x_0 == ");
3523 10 : gfp_ext_print (x0_x, x0_y, modulus, OUTPUT_TRACE);
3524 10 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3525 : }
3526 :
3527 :
3528 : /* Compute g[1] = r1[0] = x0^M * r^(M^2) = (x0 * r^M)^M.
3529 : We use v[0,1] as temporary storage */
3530 330 : gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M, Delta, modulus,
3531 : tmplen, tmp); /* v[0,1] = r^M */
3532 330 : gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
3533 : tmplen, tmp); /* v[0,1] = r^M * x_0 */
3534 330 : gfp_ext_pow_norm1_sl (r1_x[0], r1_y[0], v[0], v[1], M, Delta, modulus,
3535 : tmplen, tmp); /* r1[0] = (r^M * x_0)^M */
3536 330 : if (l > 0)
3537 : {
3538 330 : if (g_x != NULL)
3539 54 : mpres_get_z (g_x[offset], r1_x[0], modulus);
3540 330 : if (g_y != NULL)
3541 54 : mpres_get_z (g_y[offset], r1_y[0], modulus);
3542 330 : if (g_x_ntt != NULL)
3543 : {
3544 271 : mpres_get_z (mt, r1_x[0], modulus);
3545 271 : mpzspv_from_mpzv (g_x_ntt, offset, &mt, 1UL, ntt_context);
3546 : }
3547 330 : if (g_y_ntt != NULL)
3548 : {
3549 271 : mpres_get_z (mt, r1_y[0], modulus);
3550 271 : mpzspv_from_mpzv (g_y_ntt, offset, &mt, 1UL, ntt_context);
3551 : }
3552 : }
3553 :
3554 :
3555 : /* Compute g[1] = r1[1] = x0^(M-1) * r^((M-1)^2) = (x0 * r^(M-1))^(M-1).
3556 : We use v[0,1] as temporary storage. FIXME: simplify, reusing g_0 */
3557 330 : gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M - 1, Delta, modulus,
3558 : tmplen, tmp);
3559 330 : gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
3560 : tmplen, tmp);
3561 330 : gfp_ext_pow_norm1_sl (r1_x[1], r1_y[1], v[0], v[1], M - 1, Delta,
3562 : modulus, tmplen, tmp);
3563 330 : if (l > 1)
3564 : {
3565 330 : if (g_x != NULL)
3566 54 : mpres_get_z (g_x[offset + 1], r1_x[1], modulus);
3567 330 : if (g_y != NULL)
3568 54 : mpres_get_z (g_y[offset + 1], r1_y[1], modulus);
3569 330 : if (g_x_ntt != NULL)
3570 : {
3571 271 : mpres_get_z (mt, r1_x[1], modulus);
3572 271 : mpzspv_from_mpzv (g_x_ntt, offset + 1, &mt, 1UL, ntt_context);
3573 : }
3574 330 : if (g_y_ntt != NULL)
3575 : {
3576 271 : mpres_get_z (mt, r1_y[1], modulus);
3577 271 : mpzspv_from_mpzv (g_y_ntt, offset + 1, &mt, 1UL, ntt_context);
3578 : }
3579 : }
3580 :
3581 :
3582 : /* x0 := $x_0 * r^{2M - 3}$ */
3583 : /* We don't need x0 after this so we overwrite it. We use v[0,1] as
3584 : temp storage for $r^{2M - 3}$. */
3585 330 : gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, 2UL*M - 3UL, Delta, modulus,
3586 : tmplen, tmp);
3587 330 : gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, v[0], v[1], Delta, modulus,
3588 : tmplen, tmp);
3589 :
3590 : /* Compute r2[0] = r1[0] * r^2 and r2[1] = r1[1] * r^2. */
3591 : /* We only need $r^2$ from here on, so we set r = $r^2$ */
3592 330 : gfp_ext_sqr_norm1 (r_x, r_y, r_x, r_y, modulus);
3593 330 : gfp_ext_mul (r2_x[0], r2_y[0], r1_x[0], r1_y[0], r_x, r_y, Delta,
3594 : modulus, tmplen, tmp);
3595 330 : gfp_ext_mul (r2_x[1], r2_y[1], r1_x[1], r1_y[1], r_x, r_y, Delta,
3596 : modulus, tmplen, tmp);
3597 :
3598 : /* v[1] := $x_0 * r^{2*M - 3} + 1/(x_0 * r^{2M - 3}) */
3599 330 : mpres_add (v[1], x0_x, x0_x, modulus);
3600 : /* x0 := x0 * r = $x_0 * r^{2M - 1}$ */
3601 330 : gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, r_x, r_y, Delta, modulus,
3602 : tmplen, tmp);
3603 : /* v[0] := $x_0 * r^{2M - 1} + 1/(x_0 * r^{2M - 1}) */
3604 330 : mpres_add (v[0], x0_x, x0_x, modulus);
3605 :
3606 : /* v2 = V_2 (r + 1/r) = r^2 + 1/r^2 */
3607 330 : mpres_add (v2, r_x, r_x, modulus);
3608 :
3609 : /* We don't need the contents of r any more and use it as a temp var */
3610 :
3611 6685430 : for (i = 2; i < l; i++)
3612 : {
3613 6685100 : if (want_x)
3614 : {
3615 : /* r1[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
3616 : taken modulo 2. We store the new r1_x[i] in r_x for now */
3617 6357430 : mpres_mul (r_x, r2_x[1 - i % 2], v[i % 2], modulus);
3618 6357430 : mpres_sub (r_x, r_x, r2_x[i % 2], modulus);
3619 : /* r2[i] = r2[i-1] * v[i-1] - r1[i-2] */
3620 6357430 : mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
3621 6357430 : mpres_sub (r2_x[i % 2], r2_x[i % 2], r1_x[i % 2], modulus);
3622 6357430 : mpres_set (r1_x[i % 2], r_x, modulus); /* FIXME, avoid this copy */
3623 6357430 : if (g_x != NULL)
3624 86388 : mpres_get_z (g_x[offset + i], r_x, modulus); /* FIXME, avoid these REDC */
3625 6357430 : if (g_x_ntt != NULL)
3626 : {
3627 6271042 : mpres_get_z (mt, r_x, modulus);
3628 6271042 : mpzspv_from_mpzv (g_x_ntt, offset + i, &mt, 1UL, ntt_context);
3629 : }
3630 : }
3631 :
3632 6685100 : if (want_y)
3633 : {
3634 : /* Same for y coordinate */
3635 6357430 : mpres_mul (r_y, r2_y[1 - i % 2], v[i % 2], modulus);
3636 6357430 : mpres_sub (r_y, r_y, r2_y[i % 2], modulus);
3637 6357430 : mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
3638 6357430 : mpres_sub (r2_y[i % 2], r2_y[i % 2], r1_y[i % 2], modulus);
3639 6357430 : mpres_set (r1_y[i % 2], r_y, modulus);
3640 6357430 : if (g_y != NULL)
3641 86388 : mpres_get_z (g_y[offset + i], r_y, modulus); /* Keep r1, r2 in mpz_t ? */
3642 6357430 : if (g_y_ntt != NULL)
3643 : {
3644 6271042 : mpres_get_z (mt, r_y, modulus);
3645 6271042 : mpzspv_from_mpzv (g_y_ntt, offset + i, &mt, 1UL, ntt_context);
3646 : }
3647 : }
3648 :
3649 : /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3650 6685100 : mpres_mul (r_x, v[1 - i % 2], v2, modulus);
3651 6685100 : mpres_sub (v[i % 2], r_x, v[i % 2], modulus);
3652 6685100 : if (want_output && test_verbose (OUTPUT_TRACE))
3653 : {
3654 : mpz_t t;
3655 39944 : mpz_init (t);
3656 39944 : mpres_get_z (t, v[i % 2], modulus);
3657 39944 : outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
3658 : "addrec(x_0 * r^(2*(M-%lu) - 1)) == %Zd /* PARI C */\n",
3659 : i, t);
3660 39944 : mpz_clear (t);
3661 : }
3662 : }
3663 :
3664 330 : mpres_clear (r_x, modulus);
3665 330 : mpres_clear (r_y, modulus);
3666 330 : mpres_clear (x0_x, modulus);
3667 330 : mpres_clear (x0_y, modulus);
3668 330 : mpres_clear (v2, modulus);
3669 990 : for (i = 0; i < 2; i++)
3670 : {
3671 660 : mpres_clear (r1_x[i], modulus);
3672 660 : mpres_clear (r1_y[i], modulus);
3673 660 : mpres_clear (r2_x[i], modulus);
3674 660 : mpres_clear (r2_y[i], modulus);
3675 660 : mpres_clear (v[i], modulus);
3676 : }
3677 1320 : for (i = 0; i < tmplen; i++)
3678 990 : mpres_clear (tmp[i], modulus);
3679 330 : mpz_clear (mt);
3680 330 : mpmod_clear (modulus);
3681 : #ifdef _OPENMP
3682 : }
3683 : #endif
3684 :
3685 330 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3686 :
3687 330 : if (g_x != NULL && g_y != NULL && test_verbose(OUTPUT_TRACE))
3688 : {
3689 7198 : for (i = 0; i < l_param; i++)
3690 : {
3691 7196 : outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu = "
3692 : "x_0^(M-%lu) * r^((M-%lu)^2); /* PARI */", i, i, i);
3693 7196 : outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu == "
3694 7196 : "%Zd + %Zd*w /* PARI C */\n", i, g_x[i], g_y[i]);
3695 : }
3696 : }
3697 330 : }
3698 :
3699 :
3700 : /* Compute r[i] = b1^(-P*(k+i)^2) * f_i for i = 0, 1, ..., l-1, where "b1" is
3701 : an element of norm 1 in the quadratic extension ring */
3702 :
3703 : static void
3704 234 : pp1_sequence_h (listz_t h_x, listz_t h_y, mpzspv_t h_x_ntt, mpzspv_t h_y_ntt,
3705 : const listz_t f, const mpres_t b1_x, const mpres_t b1_y,
3706 : const long k_param, const unsigned long l_param,
3707 : const unsigned long P, const mpres_t Delta,
3708 : mpmod_t modulus_param, const mpzspm_t ntt_context)
3709 : {
3710 : unsigned long i;
3711 : long timestart, realstart;
3712 :
3713 234 : if (l_param == 0UL)
3714 0 : return;
3715 :
3716 : ASSERT (f != h_x);
3717 : ASSERT (f != h_y);
3718 :
3719 234 : outputf (OUTPUT_VERBOSE, "Computing h_x and h_y");
3720 234 : timestart = cputime ();
3721 234 : realstart = realtime ();
3722 :
3723 234 : if (test_verbose (OUTPUT_TRACE))
3724 : {
3725 : mpz_t t;
3726 5 : mpz_init (t);
3727 5 : mpres_get_z (t, Delta, modulus_param);
3728 5 : outputf (OUTPUT_TRACE, "\n/* pp1_sequence_h */ N = %Zd; "
3729 : "Delta = %Zd; w = quadgen (4*Delta); k = %ld; P = %lu; "
3730 5 : "/* PARI */\n", modulus_param->orig_modulus, t, k_param, P);
3731 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ b_1 = ");
3732 5 : gfp_ext_print (b1_x, b1_y, modulus_param, OUTPUT_TRACE);
3733 5 : outputf (OUTPUT_TRACE, "; r = b_1^P; rn = b_1^(-P); /* PARI */\n");
3734 4882 : for (i = 0; i < l_param; i++)
3735 4877 : outputf (OUTPUT_TRACE,
3736 4877 : "/* pp1_sequence_h */ f_%lu = %Zd; /* PARI */\n", i, f[i]);
3737 5 : mpz_clear (t);
3738 : }
3739 :
3740 : #ifdef _OPENMP
3741 : #pragma omp parallel if (l_param > 100) private(i)
3742 : #endif
3743 : {
3744 234 : const size_t tmplen = 2;
3745 : mpres_t s_x[3], s_y[3], s2_x[2], s2_y[2], v[2], V2, rn_x, rn_y,
3746 : tmp[2];
3747 : mpmod_t modulus; /* Thread-local copy of modulus_param */
3748 : mpz_t mt;
3749 234 : unsigned long l = l_param, offset = 0;
3750 234 : long k = k_param;
3751 :
3752 : #ifdef _OPENMP
3753 : /* When multi-threading, we adjust the parameters for each thread */
3754 :
3755 : const int nr_chunks = omp_get_num_threads();
3756 : const int thread_nr = omp_get_thread_num();
3757 :
3758 : l = (l_param - 1) / nr_chunks + 1;
3759 : offset = thread_nr * l;
3760 : if (offset <= l_param)
3761 : l = MIN(l, l_param - offset);
3762 : else
3763 : l = 0;
3764 :
3765 : if (thread_nr == 0)
3766 : outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
3767 : outputf (OUTPUT_TRACE, "\n");
3768 : #endif
3769 :
3770 : /* Each thread computes r[i + offset] = b1^(-P*(k+i+offset)^2) * f_i
3771 : for i = 0, 1, ..., l-1, where l is the adjusted length of each thread */
3772 :
3773 : /* Test that k+offset does not overflow */
3774 234 : ASSERT_ALWAYS (offset <= (unsigned long) LONG_MAX &&
3775 : k <= LONG_MAX - (long) offset);
3776 234 : k += (long) offset;
3777 :
3778 234 : mpz_init (mt);
3779 : /* Make thread-local copy of modulus */
3780 234 : mpmod_init_set (modulus, modulus_param);
3781 :
3782 : /* Init the local mpres_t variables */
3783 702 : for (i = 0; i < 2; i++)
3784 : {
3785 468 : mpres_init (s_x[i], modulus);
3786 468 : mpres_init (s_y[i], modulus);
3787 468 : mpres_init (s2_x[i], modulus);
3788 468 : mpres_init (s2_y[i], modulus);
3789 468 : mpres_init (v[i], modulus);
3790 : }
3791 234 : mpres_init (s_x[2], modulus);
3792 234 : mpres_init (s_y[2], modulus);
3793 234 : mpres_init (V2, modulus);
3794 234 : mpres_init (rn_x, modulus);
3795 234 : mpres_init (rn_y, modulus);
3796 702 : for (i = 0; i < (unsigned long) tmplen; i++)
3797 468 : mpres_init (tmp[i], modulus);
3798 :
3799 : /* Compute rn = b_1^{-P}. It has the same value for all threads,
3800 : but we make thread local copies anyway. */
3801 234 : gfp_ext_pow_norm1_sl (rn_x, rn_y, b1_x, b1_y, P, Delta, modulus, tmplen,
3802 : tmp);
3803 234 : mpres_neg (rn_y, rn_y, modulus);
3804 :
3805 : /* Compute s[0] = rn^(k^2) = r^(-k^2). We do it by two
3806 : exponentiations by k and use v[0] and v[1] as temp storage */
3807 234 : gfp_ext_pow_norm1_sl (v[0], v[1], rn_x, rn_y, k, Delta, modulus,
3808 : tmplen, tmp);
3809 234 : gfp_ext_pow_norm1_sl (s_x[0], s_y[0], v[0], v[1], k, Delta, modulus,
3810 : tmplen, tmp);
3811 234 : if (test_verbose (OUTPUT_TRACE))
3812 : {
3813 : #ifdef _OPENMP
3814 : #pragma omp critical
3815 : #endif
3816 : {
3817 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ", k);
3818 5 : gfp_ext_print (s_x[0], s_y[0], modulus, OUTPUT_TRACE);
3819 5 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3820 : }
3821 : }
3822 :
3823 : /* Compute s[1] = r^(-(k+1)^2) = r^(-(k^2 + 2k + 1))*/
3824 234 : if (l > 1)
3825 : {
3826 : /* v[0] + v[1]*sqrt(Delta) still contains rn^k */
3827 234 : gfp_ext_sqr_norm1 (s_x[1], s_y[1], v[0], v[1], modulus);
3828 : /* Now s[1] = r^(-2k) */
3829 234 : gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], s_x[0], s_y[0], Delta,
3830 : modulus, tmplen, tmp);
3831 : /* Now s[1] = r^(-(k^2 + 2k)) */
3832 234 : gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], rn_x, rn_y, Delta,
3833 : modulus, tmplen, tmp);
3834 : /* Now s[1] = r^(-(k^2 + 2k + 1)) = r^(-(k+1)^2) */
3835 234 : if (test_verbose (OUTPUT_TRACE))
3836 : {
3837 : #ifdef _OPENMP
3838 : #pragma omp critical
3839 : #endif
3840 : {
3841 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ",
3842 : k + 1);
3843 5 : gfp_ext_print (s_x[1], s_y[1], modulus, OUTPUT_TRACE);
3844 5 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3845 : }
3846 : }
3847 : }
3848 :
3849 : /* Compute s2[0] = r^(k^2+2) = r^(k^2) * r^2 */
3850 234 : gfp_ext_sqr_norm1 (v[0], v[1], rn_x, rn_y, modulus);
3851 234 : gfp_ext_mul (s2_x[0], s2_y[0], s_x[0], s_y[0], v[0], v[1], Delta, modulus,
3852 : tmplen, tmp);
3853 234 : if (test_verbose (OUTPUT_TRACE))
3854 : {
3855 : #ifdef _OPENMP
3856 : #pragma omp critical
3857 : #endif
3858 : {
3859 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ", k);
3860 5 : gfp_ext_print (s2_x[0], s2_y[0], modulus, OUTPUT_TRACE);
3861 5 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3862 : }
3863 : }
3864 :
3865 : /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
3866 234 : gfp_ext_mul (s2_x[1], s2_y[1], s_x[1], s_y[1], v[0], v[1], Delta, modulus,
3867 : tmplen, tmp);
3868 234 : if (test_verbose (OUTPUT_TRACE))
3869 : {
3870 : #ifdef _OPENMP
3871 : #pragma omp critical
3872 : #endif
3873 : {
3874 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ",
3875 : k + 1);
3876 5 : gfp_ext_print (s2_x[1], s2_y[1], modulus, OUTPUT_TRACE);
3877 5 : outputf (OUTPUT_TRACE, " /* PARI C */\n");
3878 : }
3879 : }
3880 :
3881 : /* Compute V_2(r + 1/r). Since 1/r = rn_x - rn_y, we have r+1/r = 2*rn_x.
3882 : V_2(x) = x^2 - 2, so we want 4*rn_x^2 - 2. */
3883 234 : mpres_add (V2, rn_x, rn_x, modulus); /* V2 = r + 1/r = 2*rn_x */
3884 234 : V (v[0], V2, 2 * k + 1, modulus); /* v[0] = V_{2k+1} (r + 1/r) */
3885 234 : V (v[1], V2, 2 * k + 3, modulus); /* v[1] = V_{2k+3} (r + 1/r) */
3886 234 : mpres_sqr (V2, V2, modulus); /* V2 = 4*a_x^2 */
3887 234 : mpres_sub_ui (V2, V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
3888 234 : if (test_verbose (OUTPUT_TRACE))
3889 : {
3890 : #ifdef _OPENMP
3891 : #pragma omp critical
3892 : #endif
3893 : {
3894 5 : mpres_get_z (mt, V2, modulus);
3895 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^2 + 1/r^2 == %Zd "
3896 : "/* PARI C */\n", mt);
3897 5 : mpres_get_z (mt, v[0], modulus);
3898 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+1) + "
3899 : "1/r^(2*%ld+1) == %Zd /* PARI C */\n", k, k, mt);
3900 5 : mpres_get_z (mt, v[1], modulus);
3901 5 : outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+3) + "
3902 : "1/r^(2*%ld+3) == %Zd /* PARI C */\n", k, k, mt);
3903 : }
3904 : }
3905 :
3906 702 : for (i = 0; i < 2UL && i < l; i++)
3907 : {
3908 : /* Multiply the 2nd coordinate by Delta, so that after the polynomial
3909 : multipoint evaluation we get x1 + Delta*x2 */
3910 468 : mpres_mul (s_y[i], s_y[i], Delta, modulus);
3911 468 : mpres_mul (s2_y[i], s2_y[i], Delta, modulus);
3912 :
3913 468 : if (h_x != NULL)
3914 94 : mpres_mul_z_to_z (h_x[i + offset], s_x[i], f[i + offset], modulus);
3915 468 : if (h_y != NULL)
3916 94 : mpres_mul_z_to_z (h_y[i + offset], s_y[i], f[i + offset], modulus);
3917 468 : if (h_x_ntt != NULL)
3918 : {
3919 374 : mpres_mul_z_to_z (mt, s_x[i], f[i + offset], modulus);
3920 374 : mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
3921 : }
3922 468 : if (h_y_ntt != NULL)
3923 : {
3924 374 : mpres_mul_z_to_z (mt, s_y[i], f[i + offset], modulus);
3925 374 : mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
3926 : }
3927 : }
3928 :
3929 : /* Compute the remaining r^((k+i)^2) values according to Peter's
3930 : recurrence */
3931 :
3932 1396085 : for (i = 2; i < l; i++)
3933 : {
3934 1395851 : if (h_x != NULL || h_x_ntt != NULL)
3935 : {
3936 : /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
3937 : taken modulo 2 */
3938 1395851 : mpres_mul (s_x[i % 3], s2_x[1 - i % 2], v[i % 2], modulus);
3939 1395851 : mpres_sub (s_x[i % 3], s_x[i % 3], s2_x[i % 2], modulus);
3940 :
3941 : /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
3942 1395851 : mpres_mul (s2_x[i % 2], s2_x[1 - i % 2], v[1 - i % 2], modulus);
3943 1395851 : mpres_sub (s2_x[i % 2], s2_x[i % 2], s_x[(i - 2) % 3], modulus);
3944 1395851 : if (h_x != NULL)
3945 17850 : mpres_mul_z_to_z (h_x[i + offset], s_x[i % 3], f[i + offset],
3946 : modulus);
3947 1395851 : if (h_x_ntt != NULL)
3948 : {
3949 1378001 : mpres_mul_z_to_z (mt, s_x[i % 3], f[i + offset], modulus);
3950 1378001 : mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
3951 : }
3952 : }
3953 :
3954 1395851 : if (h_y != NULL || h_y_ntt != NULL)
3955 : {
3956 : /* Same for y coordinate */
3957 1395851 : mpres_mul (s_y[i % 3], s2_y[1 - i % 2], v[i % 2], modulus);
3958 1395851 : mpres_sub (s_y[i % 3], s_y[i % 3], s2_y[i % 2], modulus);
3959 1395851 : mpres_mul (s2_y[i % 2], s2_y[1 - i % 2], v[1 - i % 2], modulus);
3960 1395851 : mpres_sub (s2_y[i % 2], s2_y[i % 2], s_y[(i - 2) % 3], modulus);
3961 1395851 : if (h_y != NULL)
3962 17850 : mpres_mul_z_to_z (h_y[i + offset], s_y[i % 3], f[i + offset],
3963 : modulus);
3964 1395851 : if (h_y_ntt != NULL)
3965 : {
3966 1378001 : mpres_mul_z_to_z (mt, s_y[i % 3], f[i + offset], modulus);
3967 1378001 : mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
3968 : }
3969 : }
3970 :
3971 : /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3972 1395851 : mpres_mul (tmp[0], v[1 - i % 2], V2, modulus);
3973 1395851 : mpres_sub (v[i % 2], tmp[0], v[i % 2], modulus);
3974 : }
3975 :
3976 : /* Clear the local mpres_t variables */
3977 702 : for (i = 0; i < 2; i++)
3978 : {
3979 468 : mpres_clear (s_x[i], modulus);
3980 468 : mpres_clear (s_y[i], modulus);
3981 468 : mpres_clear (s2_x[i], modulus);
3982 468 : mpres_clear (s2_y[i], modulus);
3983 468 : mpres_clear (v[i], modulus);
3984 : }
3985 234 : mpres_clear (s_x[2], modulus);
3986 234 : mpres_clear (s_y[2], modulus);
3987 234 : mpres_clear (V2, modulus);
3988 234 : mpres_clear (rn_x, modulus);
3989 234 : mpres_clear (rn_y, modulus);
3990 702 : for (i = 0; i < tmplen; i++)
3991 468 : mpres_clear (tmp[i], modulus);
3992 :
3993 : /* Clear the thread-local copy of modulus */
3994 234 : mpmod_clear (modulus);
3995 :
3996 234 : mpz_clear (mt);
3997 : }
3998 :
3999 234 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4000 :
4001 234 : if (h_x != NULL && h_y != NULL && test_verbose (OUTPUT_TRACE))
4002 : {
4003 842 : for (i = 0; i < l_param; i++)
4004 841 : gmp_printf ("/* pp1_sequence_h */ (rn^((k+%lu)^2) * f_%lu) == "
4005 : "(%Zd + Mod(%Zd / Delta, N) * w) /* PARI C */\n",
4006 841 : i, i, h_x[i], h_y[i]);
4007 : }
4008 : }
4009 :
4010 :
4011 : int
4012 47 : pp1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
4013 : const faststage2_param_t *params)
4014 : {
4015 : unsigned long nr;
4016 : unsigned long i, l, lenF, lenH, lenG, lenR, tmplen;
4017 : sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
4018 : progressions of prime length */
4019 : set_long_t *S_2; /* This is stored as a regular set */
4020 : listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
4021 : degree s_1. It is symmetric, so has only s_1 / 2 + 1
4022 : distinct coefficients. The sequence h_j will be stored in
4023 : the same memory and won't be a monic polynomial, so the
4024 : leading 1 monomial of F will be stored explicitly. Hence we
4025 : need s_1 / 2 + 1 entries. */
4026 :
4027 : listz_t g_x, g_y, fh_x, fh_y, h_x, h_y, tmp, R_x, R_y;
4028 47 : const unsigned long tmpreslen = 2UL;
4029 : mpres_t b1_x, b1_y, Delta, tmpres[2];
4030 : mpz_t mt; /* All-purpose temp mpz_t */
4031 47 : int youpi = ECM_NO_FACTOR_FOUND;
4032 : long timetotalstart, realtotalstart, timestart;
4033 :
4034 47 : timetotalstart = cputime ();
4035 47 : realtotalstart = realtime ();
4036 :
4037 47 : ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
4038 47 : ASSERT_ALWAYS (params->s_1 < params->l);
4039 47 : nr = params->l - params->s_1; /* Number of points we evaluate */
4040 :
4041 47 : if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
4042 0 : return ECM_ERROR;
4043 :
4044 : /* Allocate all the memory we'll need */
4045 : /* Allocate the correct amount of space for each mpz_t or the
4046 : reallocations will up to double the time for stage 2! */
4047 47 : mpz_init (mt);
4048 47 : mpres_init (b1_x, modulus);
4049 47 : mpres_init (b1_y, modulus);
4050 47 : mpres_init (Delta, modulus);
4051 141 : for (i = 0; i < tmpreslen; i++)
4052 94 : mpres_init (tmpres[i], modulus);
4053 47 : lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
4054 : the leading 1 monomial for each factor */
4055 47 : lenH = params->s_1 + 1;
4056 47 : lenG = params->l;
4057 47 : lenR = nr;
4058 47 : F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4059 47 : fh_x = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4060 47 : fh_y = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4061 47 : h_x = malloc (lenH * sizeof (mpz_t));
4062 47 : h_y = malloc (lenH * sizeof (mpz_t));
4063 47 : if (h_x == NULL || h_y == NULL)
4064 : {
4065 0 : fprintf (stderr, "Cannot allocate memory in pp1fs2\n");
4066 0 : exit (1);
4067 : }
4068 47 : g_x = init_list2 (lenG, (unsigned int) abs (modulus->bits));
4069 47 : g_y = init_list2 (lenG, (unsigned int) abs (modulus->bits));
4070 47 : R_x = init_list2 (lenR, (unsigned int) abs (modulus->bits));
4071 47 : R_y = init_list2 (lenR, (unsigned int) abs (modulus->bits));
4072 47 : tmplen = 3UL * params->l + list_mul_mem (params->l / 2) + 20;
4073 47 : outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
4074 47 : if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
4075 : {
4076 0 : tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
4077 : /* FIXME: It appears TMulGen_space() returns a too small value! */
4078 0 : outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
4079 : tmplen);
4080 : }
4081 :
4082 47 : tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
4083 :
4084 47 : if (test_verbose (OUTPUT_TRACE))
4085 : {
4086 1 : mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
4087 1 : outputf (OUTPUT_TRACE,
4088 : "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
4089 1 : modulus->orig_modulus, mt);
4090 : }
4091 :
4092 : /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2 k_1}) */
4093 47 : outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
4094 :
4095 47 : timestart = cputime ();
4096 47 : i = poly_from_sets_V (F, X, S_1, tmp, tmplen, modulus, NULL, NULL);
4097 47 : ASSERT_ALWAYS(2 * i == params->s_1);
4098 : ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
4099 47 : free (S_1);
4100 47 : S_1 = NULL;
4101 :
4102 47 : outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4103 47 : if (test_verbose (OUTPUT_TRACE))
4104 : {
4105 842 : for (i = 0; i < params->s_1 / 2 + 1; i++)
4106 841 : outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
4107 1 : outputf (OUTPUT_TRACE, "f(x) = f_0");
4108 841 : for (i = 1; i < params->s_1 / 2 + 1; i++)
4109 840 : outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
4110 1 : outputf (OUTPUT_TRACE, "/* PARI */ \n");
4111 : }
4112 :
4113 : /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
4114 47 : mpres_sqr (Delta, X, modulus);
4115 47 : mpres_sub_ui (Delta, Delta, 4UL, modulus);
4116 47 : mpres_div_2exp (b1_x, X, 1, modulus);
4117 47 : mpres_set_ui (b1_y, 1UL, modulus);
4118 47 : mpres_div_2exp (b1_y, b1_y, 1, modulus);
4119 47 : if (test_verbose (OUTPUT_TRACE))
4120 : {
4121 1 : mpres_get_z (mt, Delta, modulus);
4122 1 : outputf (OUTPUT_TRACE,
4123 : "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
4124 1 : gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
4125 1 : outputf (OUTPUT_TRACE, "; /* PARI */\n");
4126 1 : outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
4127 : }
4128 :
4129 : /* Compute the h sequence h_j = b1^(P*-j^2) * f_j for 0 <= j <= s_1 */
4130 47 : pp1_sequence_h (fh_x, fh_y, NULL, NULL, F, b1_x, b1_y, 0L,
4131 47 : params->s_1 / 2 + 1, params->P, Delta, modulus, NULL);
4132 : /* We don't need F(x) any more */
4133 47 : clear_list (F, lenF);
4134 :
4135 : /* Make a symmetric copy of fh in h. */
4136 17991 : for (i = 0; i < params->s_1 / 2 + 1; i++)
4137 : {
4138 17944 : *(h_x[i]) = *(fh_x[params->s_1 / 2 - i]); /* Clone the mpz_t */
4139 17944 : *(h_y[i]) = *(fh_y[params->s_1 / 2 - i]);
4140 : }
4141 17944 : for (i = 0; i < params->s_1 / 2; i++)
4142 : {
4143 17897 : *(h_x[i + params->s_1 / 2 + 1]) = *(fh_x[i + 1]);
4144 17897 : *(h_y[i + params->s_1 / 2 + 1]) = *(fh_y[i + 1]);
4145 : }
4146 47 : if (test_verbose (OUTPUT_TRACE))
4147 : {
4148 1682 : for (i = 0; i < params->s_1 + 1; i++)
4149 1681 : outputf (OUTPUT_VERBOSE, "h_%lu = %Zd + %Zd * w; /* PARI */\n",
4150 1681 : i, h_x[i], h_y[i]);
4151 : }
4152 :
4153 68 : for (l = 0; l < params->s_2; l++)
4154 : {
4155 54 : const long M = params->l - 1 - params->s_1 / 2;
4156 54 : outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
4157 54 : l + 1, params->s_2);
4158 54 : pp1_sequence_g (g_x, g_y, NULL, NULL, b1_x, b1_y, params->P,
4159 54 : Delta, M, params->l, params->m_1, S_2->elem[l],
4160 : modulus, NULL);
4161 :
4162 : /* Do the two convolution products */
4163 54 : outputf (OUTPUT_VERBOSE, "TMulGen of g_x and h_x");
4164 54 : timestart = cputime ();
4165 54 : if (TMulGen (R_x, nr - 1, h_x, params->s_1, g_x, params->l - 1, tmp,
4166 54 : modulus->orig_modulus) < 0)
4167 : {
4168 0 : outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
4169 : "of memory)\n");
4170 0 : youpi = ECM_ERROR;
4171 0 : break;
4172 : }
4173 54 : outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4174 54 : outputf (OUTPUT_VERBOSE, "TMulGen of g_y and h_y");
4175 54 : timestart = cputime ();
4176 54 : if (TMulGen (R_y, nr - 1, h_y, params->s_1, g_y, params->l - 1, tmp,
4177 54 : modulus->orig_modulus) < 0)
4178 : {
4179 0 : outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
4180 : "of memory)\n");
4181 0 : youpi = ECM_ERROR;
4182 0 : break;
4183 : }
4184 54 : outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4185 43948 : for (i = 0; i < nr; i++)
4186 43894 : mpz_add (R_x[i], R_x[i], R_y[i]);
4187 :
4188 54 : timestart = cputime ();
4189 54 : mpres_set_ui (tmpres[1], 1UL, modulus); /* Accumulate product in
4190 : tmpres[1] */
4191 43948 : for (i = 0; i < nr; i++)
4192 : {
4193 43894 : mpres_set_z_for_gcd (tmpres[0], R_x[i], modulus);
4194 : #define TEST_ZERO_RESULT
4195 : #ifdef TEST_ZERO_RESULT
4196 43894 : if (mpres_is_zero (tmpres[0], modulus))
4197 28 : outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
4198 : #endif
4199 43894 : mpres_mul (tmpres[1], tmpres[1], tmpres[0], modulus);
4200 : }
4201 54 : outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)^(1) took %lums\n",
4202 54 : cputime () - timestart);
4203 54 : if (test_verbose(OUTPUT_RESVERBOSE))
4204 : {
4205 6 : mpres_get_z (mt, tmpres[1], modulus);
4206 6 : outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", mt);
4207 : }
4208 :
4209 54 : mpres_gcd (mt, tmpres[1], modulus);
4210 54 : if (mpz_cmp_ui (mt, 1UL) > 0)
4211 : {
4212 33 : mpz_set (f, mt);
4213 33 : youpi = ECM_FACTOR_FOUND_STEP2;
4214 33 : break;
4215 : }
4216 : }
4217 :
4218 47 : mpz_clear (mt);
4219 47 : mpres_clear (b1_x, modulus);
4220 47 : mpres_clear (b1_y, modulus);
4221 47 : mpres_clear (Delta, modulus);
4222 141 : for (i = 0; i < tmpreslen; i++)
4223 94 : mpres_clear (tmpres[i], modulus);
4224 47 : clear_list (fh_x, lenF);
4225 47 : clear_list (fh_y, lenF);
4226 47 : free (h_x);
4227 47 : free (h_y);
4228 47 : clear_list (g_x, lenG);
4229 47 : clear_list (g_y, lenG);
4230 47 : clear_list (R_x, lenR);
4231 47 : clear_list (R_y, lenR);
4232 47 : clear_list (tmp, tmplen);
4233 47 : free (S_2);
4234 :
4235 47 : outputf (OUTPUT_NORMAL, "Step 2");
4236 : /* In normal output mode, print only cpu time as we always have.
4237 : In verbose mode, print real time as well if we used multi-threading */
4238 47 : if (test_verbose (OUTPUT_VERBOSE))
4239 9 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
4240 : else
4241 38 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
4242 :
4243 47 : return youpi;
4244 : }
4245 :
4246 :
4247 : int
4248 187 : pp1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
4249 : const faststage2_param_t *params, const int twopass)
4250 : {
4251 : unsigned long nr;
4252 : unsigned long l, lenF;
4253 : sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
4254 : progressions of prime length */
4255 : set_long_t *S_2; /* This is stored as a regular set */
4256 : listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
4257 : degree s_1. It is symmetric, so has only s_1 / 2 + 1
4258 : distinct coefficients. The sequence h_j will be stored in
4259 : the same memory and won't be a monic polynomial, so the
4260 : leading 1 monomial of F will be stored explicitly. Hence we
4261 : need s_1 / 2 + 1 entries. */
4262 187 : listz_t R = NULL; /* Is used only for two-pass convolution, has nr
4263 : entries. R is only ever referenced if twopass == 1,
4264 : but gcc does not realize that and complains about
4265 : uninitialized value, so we set it to NULL. */
4266 : mpzspm_t ntt_context;
4267 : mpzspv_t g_x_ntt, g_y_ntt, h_x_ntt, h_y_ntt;
4268 : mpres_t b1_x, b1_y, Delta;
4269 : mpz_t mt; /* All-purpose temp mpz_t */
4270 : mpz_t product;
4271 187 : mpz_t *product_ptr = NULL;
4272 187 : int youpi = ECM_NO_FACTOR_FOUND;
4273 : long timetotalstart, realtotalstart, timestart, realstart;
4274 :
4275 187 : timetotalstart = cputime ();
4276 187 : realtotalstart = realtime ();
4277 :
4278 187 : ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
4279 187 : ASSERT_ALWAYS (params->s_1 < params->l);
4280 187 : nr = params->l - params->s_1; /* Number of points we evaluate */
4281 :
4282 187 : if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
4283 0 : return ECM_ERROR;
4284 :
4285 187 : mpz_init (mt);
4286 :
4287 : /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
4288 : with g. We need NTT of transform length l here. If we want to add
4289 : transformed vectors, we need to double the modulus. */
4290 :
4291 187 : if (twopass)
4292 1 : mpz_set (mt, modulus->orig_modulus);
4293 : else
4294 186 : mpz_mul_2exp (mt, modulus->orig_modulus, 1UL);
4295 :
4296 187 : ntt_context = mpzspm_init (params->l, mt);
4297 :
4298 187 : if (ntt_context == NULL)
4299 : {
4300 0 : outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
4301 : "presumably out of memory\n");
4302 0 : mpz_clear (mt);
4303 0 : free (S_1);
4304 0 : S_1 = NULL;
4305 0 : free (S_2);
4306 0 : S_2 = NULL;
4307 0 : return ECM_ERROR;
4308 : }
4309 :
4310 187 : print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
4311 : ntt_context);
4312 :
4313 : /* Allocate memory for F with correct amount of space for each mpz_t */
4314 187 : lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
4315 : the leading 1 monomial for each factor */
4316 : {
4317 : /* in some cases the above value of lenF is not enough, for example with
4318 : s_1 = 10, which gives lenF = 7, but 9 entries are needed */
4319 187 : unsigned long mem = mem_poly_from_sets_V (S_1);
4320 187 : if (mem > lenF)
4321 28 : lenF = mem;
4322 : }
4323 :
4324 187 : F = init_list2 (lenF, (unsigned int) abs (modulus->bits) + GMP_NUMB_BITS);
4325 :
4326 : /* Build F */
4327 187 : if (build_F_ntt (F, X, S_1, params, modulus) == ECM_ERROR)
4328 : {
4329 0 : free (S_1);
4330 0 : free (S_2);
4331 0 : mpz_clear (mt);
4332 0 : mpzspm_clear (ntt_context);
4333 0 : clear_list (F, lenF);
4334 0 : return ECM_ERROR;
4335 : }
4336 :
4337 187 : free (S_1);
4338 187 : S_1 = NULL;
4339 :
4340 187 : mpres_init (b1_x, modulus);
4341 187 : mpres_init (b1_y, modulus);
4342 187 : mpres_init (Delta, modulus);
4343 :
4344 : /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
4345 187 : mpres_sqr (Delta, X, modulus);
4346 187 : mpres_sub_ui (Delta, Delta, 4UL, modulus);
4347 187 : mpres_div_2exp (b1_x, X, 1, modulus);
4348 187 : mpres_set_ui (b1_y, 1UL, modulus);
4349 187 : mpres_div_2exp (b1_y, b1_y, 1, modulus);
4350 187 : if (test_verbose (OUTPUT_TRACE))
4351 : {
4352 4 : mpres_get_z (mt, Delta, modulus);
4353 4 : outputf (OUTPUT_TRACE,
4354 : "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
4355 4 : gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
4356 4 : outputf (OUTPUT_TRACE, "; /* PARI */\n");
4357 4 : outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
4358 : }
4359 :
4360 : /* Allocate remaining memory for h_ntt */
4361 187 : h_x_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
4362 187 : h_y_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
4363 : /* Compute the h_j sequence */
4364 187 : pp1_sequence_h (NULL, NULL, h_x_ntt, h_y_ntt, F, b1_x, b1_y, 0L,
4365 187 : params->s_1 / 2 + 1, params->P, Delta, modulus,
4366 : ntt_context);
4367 : /* We don't need F(x) any more */
4368 187 : clear_list (F, lenF);
4369 :
4370 : /* compute the forward transform of h and store the distinct coefficients
4371 : in h_ntt */
4372 187 : g_x_ntt = mpzspv_init (params->l, ntt_context);
4373 187 : if (twopass)
4374 : {
4375 1 : g_y_ntt = g_x_ntt;
4376 1 : R = init_list2 (nr, (mpz_size (modulus->orig_modulus) + 2) *
4377 : GMP_NUMB_BITS);
4378 : }
4379 : else
4380 186 : g_y_ntt = mpzspv_init (params->l, ntt_context);
4381 :
4382 : /* Compute DCT-I of h_x and h_y */
4383 187 : outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_x");
4384 : #ifdef _OPENMP
4385 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4386 : #endif
4387 187 : timestart = cputime ();
4388 187 : realstart = realtime ();
4389 187 : mpzspv_to_dct1 (h_x_ntt, h_x_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
4390 : g_x_ntt, ntt_context);
4391 187 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4392 :
4393 187 : outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_y");
4394 : #ifdef _OPENMP
4395 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4396 : #endif
4397 187 : timestart = cputime ();
4398 187 : realstart = realtime ();
4399 187 : mpzspv_to_dct1 (h_y_ntt, h_y_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
4400 : g_x_ntt, ntt_context);
4401 187 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4402 :
4403 187 : if (test_verbose (OUTPUT_RESVERBOSE))
4404 : {
4405 12 : mpz_init (product);
4406 12 : product_ptr = &product;
4407 : }
4408 :
4409 297 : for (l = 0; l < params->s_2; l++)
4410 : {
4411 271 : const long M = params->l - 1 - params->s_1 / 2;
4412 :
4413 271 : outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
4414 271 : l + 1, params->s_2);
4415 271 : if (twopass)
4416 : {
4417 : /* Two-pass variant. Two separate convolutions,
4418 : then addition in Z/NZ */
4419 5 : pp1_sequence_g (NULL, NULL, g_x_ntt, NULL, b1_x, b1_y, params->P,
4420 5 : Delta, M, params->l, params->m_1, S_2->elem[l],
4421 : modulus, ntt_context);
4422 :
4423 : /* Do the convolution product of g_x * h_x */
4424 5 : outputf (OUTPUT_VERBOSE, "Computing g_x*h_x");
4425 : #ifdef _OPENMP
4426 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4427 : #endif
4428 5 : timestart = cputime ();
4429 5 : realstart = realtime ();
4430 5 : mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
4431 : NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
4432 : /* Store the product coefficients we want in R */
4433 5 : mpzspv_to_mpzv (g_x_ntt, params->s_1 / 2, R, nr, ntt_context);
4434 5 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4435 :
4436 : /* Compute g_y sequence */
4437 5 : pp1_sequence_g (NULL, NULL, NULL, g_y_ntt, b1_x, b1_y, params->P,
4438 5 : Delta, M, params->l, params->m_1, S_2->elem[l],
4439 : modulus, ntt_context);
4440 :
4441 : /* Do the convolution product of g_y * (Delta * h_y) */
4442 5 : outputf (OUTPUT_VERBOSE, "Computing g_y*h_y");
4443 : #ifdef _OPENMP
4444 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4445 : #endif
4446 5 : timestart = cputime ();
4447 5 : realstart = realtime ();
4448 5 : mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
4449 : NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
4450 5 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4451 :
4452 : /* Compute product of sum of coefficients and gcd with N */
4453 5 : ntt_gcd (mt, product_ptr, g_y_ntt, params->s_1 / 2, R, nr,
4454 : ntt_context, modulus);
4455 : }
4456 : else
4457 : {
4458 : /* One-pass variant. Two forward transforms and point-wise products,
4459 : then addition and single inverse transform */
4460 266 : pp1_sequence_g (NULL, NULL, g_x_ntt, g_y_ntt, b1_x, b1_y, params->P,
4461 266 : Delta, M, params->l, params->m_1, S_2->elem[l],
4462 : modulus, ntt_context);
4463 :
4464 266 : outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_x");
4465 : #ifdef _OPENMP
4466 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4467 : #endif
4468 266 : timestart = cputime ();
4469 266 : realstart = realtime ();
4470 266 : mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
4471 : NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
4472 266 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4473 :
4474 266 : outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_y");
4475 : #ifdef _OPENMP
4476 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4477 : #endif
4478 266 : timestart = cputime ();
4479 266 : realstart = realtime ();
4480 266 : mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
4481 : NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
4482 266 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4483 :
4484 266 : outputf (OUTPUT_VERBOSE, "Adding and computing inverse NTT of sum");
4485 : #ifdef _OPENMP
4486 : outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4487 : #endif
4488 266 : timestart = cputime ();
4489 266 : realstart = realtime ();
4490 266 : mpzspv_add (g_x_ntt, (spv_size_t) 0, g_x_ntt, (spv_size_t) 0,
4491 266 : g_y_ntt, (spv_size_t) 0, params->l, ntt_context);
4492 266 : mpzspv_mul_by_dct (g_x_ntt, NULL, params->l, ntt_context,
4493 : NTT_MUL_STEP_IFFT);
4494 266 : print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4495 :
4496 266 : ntt_gcd (mt, product_ptr, g_x_ntt, params->s_1 / 2, NULL, nr,
4497 : ntt_context, modulus);
4498 : }
4499 :
4500 271 : outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
4501 :
4502 271 : if (mpz_cmp_ui (mt, 1UL) > 0)
4503 : {
4504 161 : mpz_set (f, mt);
4505 161 : youpi = ECM_FACTOR_FOUND_STEP2;
4506 161 : break;
4507 : }
4508 : }
4509 :
4510 187 : if (test_verbose (OUTPUT_RESVERBOSE))
4511 : {
4512 12 : product_ptr = NULL;
4513 12 : mpz_clear (product);
4514 : }
4515 187 : mpzspv_clear (g_x_ntt, ntt_context);
4516 187 : if (twopass)
4517 1 : clear_list (R, nr);
4518 : else
4519 186 : mpzspv_clear (g_y_ntt, ntt_context);
4520 187 : mpzspv_clear (h_x_ntt, ntt_context);
4521 187 : mpzspv_clear (h_y_ntt, ntt_context);
4522 187 : mpzspm_clear (ntt_context);
4523 187 : mpz_clear (mt);
4524 187 : mpres_clear (b1_x, modulus);
4525 187 : mpres_clear (b1_y, modulus);
4526 187 : mpres_clear (Delta, modulus);
4527 187 : free (S_2);
4528 :
4529 187 : outputf (OUTPUT_NORMAL, "Step 2");
4530 : /* In normal output mode, print only cpu time as we always have.
4531 : In verbose mode, print real time as well if we used multi-threading */
4532 187 : if (test_verbose (OUTPUT_VERBOSE))
4533 21 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
4534 : else
4535 166 : print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
4536 :
4537 187 : return youpi;
4538 : }
|