Line data Source code
1 : /* Polynomial multiplication using GMP's integer multiplication code
2 :
3 : Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Dave Newman,
4 : Paul Zimmermann, Alexander Kruppa.
5 :
6 : This file is part of the ECM Library.
7 :
8 : The ECM Library is free software; you can redistribute it and/or modify
9 : it under the terms of the GNU Lesser General Public License as published by
10 : the Free Software Foundation; either version 3 of the License, or (at your
11 : option) any later version.
12 :
13 : The ECM Library is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 : or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 : License for more details.
17 :
18 : You should have received a copy of the GNU Lesser General Public License
19 : along with the ECM Library; see the file COPYING.LIB. If not, see
20 : http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 : 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22 :
23 : #include <stdlib.h>
24 : #include "ecm-gmp.h" /* for MPZ_REALLOC and MPN_COPY */
25 : #include "ecm-impl.h"
26 :
27 : #if defined(HAVE___GMPN_MULMOD_BNM1) && defined(HAVE___GMPN_MULMOD_BNM1_NEXT_SIZE)
28 : #define FFT_WRAP /* use the wrap-around trick */
29 : #endif
30 :
31 : /* Copy at r+i*s the content of A[i*stride] for 0 <= i < l
32 : Assume all A[i*stride] are non-negative, and their size is <= s.
33 : */
34 : static void
35 13378606 : pack (mp_ptr r, mpz_t *A, mp_size_t l, mp_size_t stride, mp_size_t s)
36 : {
37 : mp_size_t i, j, m;
38 :
39 83959952 : for (i = 0, j = 0; i < l; i++, j += stride, r += s)
40 : {
41 70581346 : m = SIZ(A[j]);
42 : ASSERT((0 <= m) && (m <= s));
43 70581346 : if (m)
44 70564129 : MPN_COPY (r, PTR(A[j]), m);
45 70581346 : if (m < s)
46 788761600 : MPN_ZERO (r + m, s - m);
47 : }
48 13378606 : }
49 :
50 : /* put in R[i*stride] for 0 <= i < l the content of {t+i*s, s} */
51 : void
52 6689303 : unpack (mpz_t *R, mp_size_t stride, mp_ptr t, mp_size_t l, mp_size_t s)
53 : {
54 : mp_size_t i, j, size_tmp;
55 : mp_ptr r_ptr;
56 :
57 72553853 : for (i = 0, j = 0; i < l; i++, t += s, j += stride)
58 : {
59 65864550 : size_tmp = s;
60 108211472 : MPN_NORMALIZE(t, size_tmp); /* compute the actual size */
61 65864550 : r_ptr = MPZ_REALLOC (R[j], size_tmp);
62 65864550 : if (size_tmp)
63 65847333 : MPN_COPY (r_ptr, t, size_tmp);
64 65864550 : SIZ(R[j]) = size_tmp;
65 : }
66 6689303 : }
67 :
68 : /* R <- A * B where A = A[0] + A[1]*x + ... + A[n-1]*x^(n-1), idem for B */
69 : void
70 14996192 : list_mul_n_basecase (listz_t R, listz_t A, listz_t B, unsigned int n)
71 : {
72 : unsigned int i, j;
73 :
74 14996192 : if (n == 1)
75 : {
76 10150554 : mpz_mul (R[0], A[0], B[0]);
77 10150554 : return;
78 : }
79 :
80 28554736 : for (i = 0; i < n; i++)
81 243644756 : for (j = 0; j < n; j++)
82 : {
83 219935658 : if (i == 0 || j == n - 1)
84 42572558 : mpz_mul (R[i+j], A[i], B[j]);
85 : else
86 177363100 : mpz_addmul (R[i+j], A[i], B[j]);
87 : }
88 : }
89 :
90 : static void
91 23379158 : list_mul_n_kara2 (listz_t R, listz_t A, listz_t B)
92 : {
93 23379158 : mpz_add (R[0], A[0], A[1]);
94 23379158 : mpz_add (R[2], B[0], B[1]);
95 23379158 : mpz_mul (R[1], R[0], R[2]);
96 23379158 : mpz_mul (R[0], A[0], B[0]);
97 23379158 : mpz_mul (R[2], A[1], B[1]);
98 23379158 : mpz_sub (R[1], R[1], R[0]);
99 23379158 : mpz_sub (R[1], R[1], R[2]);
100 23379158 : }
101 :
102 : /* R[0..4] <- A[0..2] * B[0..2] in 7 multiplies */
103 : static void
104 7503613 : list_mul_n_kara3 (listz_t R, listz_t A, listz_t B, listz_t T)
105 : {
106 7503613 : mpz_add (T[0], A[0], A[2]);
107 7503613 : mpz_add (R[0], B[0], B[2]);
108 7503613 : mpz_mul (R[2], T[0], R[0]); /* (A0+A2)*(B0+B2) */
109 7503613 : mpz_mul (R[3], T[0], B[1]); /* (A0+A2)*B1 */
110 7503613 : mpz_mul (R[4], A[1], R[0]); /* A1*(B0+B2) */
111 7503613 : mpz_add (R[3], R[3], R[4]); /* (A0+A2)*B1+A1*(B0+B2) */
112 7503613 : list_mul_n_kara2 (T, A, B);
113 7503613 : mpz_sub (R[2], R[2], T[0]); /* A0*A2+A2*B0+A2*B2 */
114 7503613 : mpz_sub (R[3], R[3], T[1]); /* A2*B1+A1*B2 */
115 7503613 : mpz_add (R[2], R[2], T[2]); /* A0*A2+A2*B0+A2*B2+A1*B1 */
116 7503613 : mpz_swap (R[0], T[0]); /* A0*B0 */
117 7503613 : mpz_swap (R[1], T[1]); /* A0*B1+A1*B0 */
118 7503613 : mpz_mul (R[4], A[2], B[2]); /* A2*B2 */
119 7503613 : mpz_sub (R[2], R[2], R[4]); /* A0*A2+A2*B0+A1*B1 */
120 7503613 : }
121 :
122 : /* Assume n >= 2. T is a scratch space of enough entries. */
123 : static void
124 37550927 : list_mul_n_karatsuba_aux (listz_t R, listz_t A, listz_t B, unsigned int n,
125 : listz_t T)
126 : {
127 : unsigned int h, l;
128 :
129 37550927 : if (n == 1)
130 : {
131 4194303 : list_mul_n_basecase (R, A, B, n);
132 4194303 : return;
133 : }
134 :
135 33356624 : if (n == 2)
136 : {
137 15875545 : list_mul_n_kara2 (R, A, B);
138 15875545 : return;
139 : }
140 :
141 17481079 : if (n == 3)
142 : {
143 7503613 : list_mul_n_kara3 (R, A, B, T);
144 7503613 : return;
145 : }
146 :
147 9977466 : h = n / 2;
148 9977466 : l = n - h;
149 9977466 : list_add (R, A, A + l, h);
150 9977466 : list_add (R + l, B, B + l, h);
151 9977466 : if (h < l)
152 : {
153 3145606 : mpz_set (R[h], A[h]);
154 3145606 : mpz_set (R[l + h], B[h]);
155 : }
156 9977466 : list_mul_n_karatsuba_aux (T, R, R + l, l, T + 2 * l - 1);
157 9977466 : list_mul_n_karatsuba_aux (R, A, B, l, T + 2 * l - 1);
158 : /* {R,2l-1} = Al * Bl */
159 9977466 : list_mul_n_karatsuba_aux (R + 2 * l, A + l, B + l, h, T + 2 * l - 1);
160 : /* {R+2l,2h-1} = Ah * Bh */
161 : /* T will contain Al*Bh+Ah*Bl, it thus suffices to compute its low n-1
162 : coefficients */
163 9977466 : list_sub (T, T, R, n - 1);
164 9977466 : list_sub (T, T, R + 2 * l, 2 * h - 1);
165 9977466 : mpz_set_ui (R[2 * l - 1], 0);
166 9977466 : list_add (R + l, R + l, T, n - 1);
167 : }
168 :
169 : static unsigned int
170 15695717 : list_mul_n_mem (unsigned int n)
171 : {
172 15695717 : if (n == 1)
173 7618529 : return 0;
174 : else
175 : {
176 8077188 : unsigned int k = (n + 1) / 2;
177 8077188 : return 2 * k - 1 + list_mul_n_mem (k);
178 : }
179 : }
180 :
181 : void
182 7618529 : list_mul_n_karatsuba (listz_t R, listz_t A, listz_t B, unsigned int n)
183 : {
184 : listz_t T;
185 : unsigned int s;
186 :
187 7618529 : s = list_mul_n_mem (n);
188 7618529 : T = init_list (s);
189 7618529 : list_mul_n_karatsuba_aux (R, A, B, n, T);
190 7618529 : clear_list (T, s);
191 7618529 : }
192 :
193 : /* Classical one-point Kronecker-Schoenhage substitution.
194 : Notes:
195 : - this code aligns the coeffs at limb boundaries - if instead we aligned
196 : at byte boundaries then we could save up to 3*n bytes,
197 : but tests have shown this doesn't give any significant speed increase,
198 : even for large degree polynomials.
199 : - this code requires that all coefficients A[] and B[] are nonnegative. */
200 : void
201 2744289 : list_mul_n_KS1 (listz_t R, listz_t A, listz_t B, unsigned int l)
202 : {
203 : unsigned long i;
204 2744289 : mp_size_t s, t = 0, size_t0;
205 : mp_ptr t0_ptr, t1_ptr, t2_ptr;
206 :
207 : /* compute the largest bit-size t of the A[i] and B[i] */
208 15539697 : for (i = 0; i < l; i++)
209 : {
210 12795408 : if ((s = mpz_sizeinbase (A[i], 2)) > t)
211 3915716 : t = s;
212 12795408 : if ((s = mpz_sizeinbase (B[i], 2)) > t)
213 2744289 : t = s;
214 : }
215 : /* For n > 0, s = sizeinbase (n, 2) ==> n < 2^s.
216 : For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
217 : Hence all A[i], B[i] < 2^t */
218 :
219 : /* Each coeff of A(x)*B(x) < l * 2^(2*t), so max number of bits in a
220 : coeff of the product will be 2 * t + ceil(log_2(l)) */
221 2744289 : s = 2 * t;
222 7028581 : for (i = l; i > 1; s++, i = (i + 1) >> 1);
223 :
224 : /* work out the corresponding number of limbs */
225 2744289 : s = 1 + (s - 1) / GMP_NUMB_BITS;
226 :
227 2744289 : size_t0 = s * l;
228 :
229 : /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
230 2744289 : t0_ptr = (mp_ptr) malloc (4 * size_t0 * sizeof (mp_limb_t));
231 2744289 : if (t0_ptr == NULL)
232 : {
233 0 : outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
234 0 : exit (1);
235 : }
236 2744289 : t1_ptr = t0_ptr + size_t0;
237 2744289 : t2_ptr = t1_ptr + size_t0;
238 :
239 2744289 : pack (t0_ptr, A, l, 1, s);
240 2744289 : pack (t1_ptr, B, l, 1, s);
241 :
242 2744289 : mpn_mul_n (t2_ptr, t0_ptr, t1_ptr, size_t0);
243 :
244 2744289 : unpack (R, 1, t2_ptr, 2 * l - 1, s);
245 :
246 2744289 : free (t0_ptr);
247 2744289 : }
248 :
249 : /* Two-point Kronecker substitition.
250 : Reference: Algorithm 2 from "Faster polynomial multiplication via multipoint
251 : Kronecker substitution", David Harvey, Journal of Symbolic Computation,
252 : number 44 (2009), pages 1502-1510.
253 : Assume n >= 2.
254 : Notes:
255 : - this code aligns the coeffs at limb boundaries - if instead we aligned
256 : at byte boundaries then we could save up to 3*n bytes,
257 : but tests have shown this doesn't give any significant speed increase,
258 : even for large degree polynomials.
259 : - this code requires that all coefficients A[] and B[] are nonnegative.
260 : */
261 : void
262 1972507 : list_mul_n_KS2 (listz_t R, listz_t A, listz_t B, unsigned int n)
263 : {
264 : unsigned long i;
265 1972507 : mp_size_t s, s2, t = 0, l, h, ns2;
266 : mp_ptr tmp, A0, A1, B0, B1, C0, C1;
267 : int sA, sB;
268 :
269 1972507 : ASSERT_ALWAYS (n >= 2);
270 :
271 : /* compute the largest bit-size t of the A[i] and B[i] */
272 24467772 : for (i = 0; i < n; i++)
273 : {
274 22495265 : if ((s = mpz_sizeinbase (A[i], 2)) > t)
275 3349528 : t = s;
276 22495265 : if ((s = mpz_sizeinbase (B[i], 2)) > t)
277 1926605 : t = s;
278 : }
279 : /* For n > 0, s = sizeinbase (n, 2) ==> n < 2^s.
280 : For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
281 : Hence all A[i], B[i] < 2^t */
282 :
283 : /* Each coeff of A(x)*B(x) < n * 2^(2*t), so max number of bits in a
284 : coeff of the product will be 2 * t + ceil(log_2(n)) */
285 1972507 : s = 2 * t;
286 7520683 : for (i = n; i > 1; s++, i = (i + 1) >> 1);
287 :
288 : /* work out the corresponding number of limbs */
289 1972507 : s = 1 + (s - 1) / GMP_NUMB_BITS;
290 :
291 : /* ensure s is even */
292 1972507 : s = s + (s & 1);
293 1972507 : s2 = s >> 1;
294 1972507 : ns2 = n * s2;
295 :
296 1972507 : l = n / 2;
297 1972507 : h = n - l;
298 :
299 : /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
300 1972507 : tmp = (mp_ptr) malloc (8 * ns2 * sizeof (mp_limb_t));
301 1972507 : if (tmp == NULL)
302 : {
303 0 : outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
304 0 : exit (1);
305 : }
306 :
307 1972507 : A0 = tmp;
308 1972507 : A1 = A0 + ns2;
309 1972507 : B0 = A1 + ns2;
310 1972507 : B1 = B0 + ns2;
311 1972507 : C0 = B1 + ns2;
312 1972507 : C1 = C0 + 2 * ns2;
313 :
314 1972507 : pack (A0, A, h, 2, s); /* A0 = Aeven(S) where S = 2^(s*GMP_NUMB_BITS) */
315 : /* A0 has in fact only n * s2 significant limbs:
316 : if n=2h, h*s = n*s2
317 : if n=2h-1, the last chunk from A0 has at most s2 limbs */
318 21696481 : MPN_ZERO(B0, s2);
319 1972507 : pack (B0 + s2, A + 1, l, 2, s);
320 : /* for the same reason as above, we have at most l*s-s2 significant limbs
321 : at B0+s2, thus at most l*s <= n*s2 at B0 */
322 1972507 : if ((sA = mpn_cmp (A0, B0, ns2)) >= 0)
323 753928 : mpn_sub_n (A1, A0, B0, ns2);
324 : else
325 1218579 : mpn_sub_n (A1, B0, A0, ns2);
326 1972507 : mpn_add_n (A0, A0, B0, ns2);
327 : /* now A0 is X+ with the notations of Algorithm, A1 is sA*X- */
328 :
329 1972507 : pack (B0, B, h, 2, s);
330 21696481 : MPN_ZERO(C0, s2);
331 1972507 : pack (C0 + s2, B + 1, l, 2, s);
332 1972507 : if ((sB = mpn_cmp (B0, C0, ns2)) >= 0)
333 753267 : mpn_sub_n (B1, B0, C0, ns2);
334 : else
335 1219240 : mpn_sub_n (B1, C0, B0, ns2);
336 1972507 : mpn_add_n (B0, B0, C0, ns2);
337 : /* B0 is Y+, B1 is sB*Y- with the notations of Algorithm 2 */
338 :
339 1972507 : mpn_mul_n (C0, A0, B0, ns2); /* C0 is Z+ = X+ * Y+ */
340 1972507 : mpn_mul_n (C1, A1, B1, ns2); /* C1 is sA * sB * Z- */
341 :
342 1972507 : if (sA * sB >= 0)
343 : {
344 1971698 : mpn_add_n (A0, C0, C1, 2 * ns2);
345 1971698 : mpn_sub_n (B0, C0, C1, 2 * ns2);
346 : }
347 : else
348 : {
349 809 : mpn_sub_n (A0, C0, C1, 2 * ns2);
350 809 : mpn_add_n (B0, C0, C1, 2 * ns2);
351 : }
352 1972507 : mpn_rshift (A0, A0, 4 * ns2, 1); /* global division by 2 */
353 :
354 : /* If A[] and B[] have n coefficients, the product has 2n-1 coefficients.
355 : The even part has n coefficients and the odd part n-1 coefficients */
356 1972507 : unpack (R, 2, A0, n, s);
357 1972507 : unpack (R + 1, 2, B0 + s2, n - 1, s);
358 :
359 1972507 : free (tmp);
360 1972507 : }
361 :
362 : /* Puts in R[0..2n-2] the product of A[0..n-1] and B[0..n-1], seen as
363 : polynomials.
364 : */
365 : void
366 3845177 : list_mult_n (listz_t R, listz_t A, listz_t B, unsigned int n)
367 : {
368 3845177 : int T[TUNE_LIST_MUL_N_MAX_SIZE] = LIST_MUL_TABLE, best;
369 :
370 : /* See tune_list_mul_n() in tune.c:
371 : 0 : list_mul_n_basecase
372 : 2 : list_mul_n_KS1
373 : 3 : list_mul_n_KS2 */
374 3845177 : best = (n < TUNE_LIST_MUL_N_MAX_SIZE) ? T[n] : 3;
375 :
376 3845177 : if (best == 0)
377 3732224 : list_mul_n_basecase (R, A, B, n);
378 112953 : else if (best == 1)
379 0 : list_mul_n_karatsuba (R, A, B, n);
380 112953 : else if (best == 2)
381 0 : list_mul_n_KS1 (R, A, B, n);
382 : else
383 112953 : list_mul_n_KS2 (R, A, B, n);
384 3845177 : }
385 :
386 : /* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients
387 : of degree m to n+m of rev(a)*c, i.e.
388 : b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l)
389 : ...
390 : b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k)
391 : ...
392 : b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n].
393 :
394 : If rev=0, consider a instead of rev(a).
395 :
396 : Assumes n <= l.
397 :
398 : Return non-zero if an error occurred.
399 :
400 : low(b) is the coefficients of degree 0 to m-1 of a*c (or rev(a)*c)
401 : mid(b) is the coefficients of degree m to m+n of a*c
402 : high(b) is the coefficients of degree m+n+1 to m+l+1 of a*c
403 : */
404 :
405 : int
406 8092 : TMulKS (listz_t b, unsigned int n, listz_t a, unsigned int m,
407 : listz_t c, unsigned int l, mpz_t modulus, int rev)
408 : {
409 8092 : unsigned long i, s = 0, t;
410 : mp_ptr ap, bp, cp;
411 : mp_size_t an, bn, cn;
412 8092 : int ret = 0; /* default return value */
413 : #ifdef DEBUG
414 : long st = cputime ();
415 : fprintf (ECM_STDOUT, "n=%u m=%u l=%u bits=%u n*bits=%u: ", n, m, l,
416 : mpz_sizeinbase (modulus, 2), n * mpz_sizeinbase (modulus, 2));
417 : #endif
418 :
419 : ASSERT (n <= l); /* otherwise the upper coefficients of b are 0 */
420 8092 : if (l > n + m)
421 8028 : l = n + m; /* otherwise, c has too many coeffs */
422 :
423 : /* make coefficients a[] and c[] non-negative and compute max #bits */
424 2606501 : for (i = 0; i <= m; i++)
425 : {
426 2598409 : if (mpz_sgn (a[i]) < 0)
427 0 : mpz_mod (a[i], a[i], modulus);
428 2598409 : if ((t = mpz_sizeinbase (a[i], 2)) > s)
429 12761 : s = t;
430 : }
431 5540945 : for (i = 0; i <= l; i++)
432 : {
433 5532853 : if (mpz_sgn (c[i]) < 0)
434 0 : mpz_mod (c[i], c[i], modulus);
435 5532853 : if ((t = mpz_sizeinbase (c[i], 2)) > s)
436 4099 : s = t;
437 : }
438 :
439 : #ifdef FFT_WRAP
440 8092 : s ++; /* need one extra bit to prevent carry of low(b) + high(b) */
441 : #endif
442 :
443 : /* max coeff has 2*s+ceil(log2(min(m+1,l+1))) bits,
444 : i.e. 2*s + 1 + floor(log2(min(m,l))) */
445 39860 : for (s = 2 * s, i = (m < l) ? m : l; i; s++, i >>= 1);
446 :
447 : /* corresponding number of limbs */
448 8092 : s = 1 + (s - 1) / GMP_NUMB_BITS;
449 :
450 8092 : an = (m + 1) * s;
451 8092 : cn = (l + 1) * s;
452 8092 : bn = an + cn;
453 :
454 : /* a[0..m] needs (m+1) * s limbs */
455 8092 : ap = (mp_ptr) malloc (an * sizeof (mp_limb_t));
456 8092 : if (ap == NULL)
457 : {
458 0 : ret = 1;
459 0 : goto TMulKS_end;
460 : }
461 8092 : cp = (mp_ptr) malloc (cn * sizeof (mp_limb_t));
462 8092 : if (cp == NULL)
463 : {
464 0 : ret = 1;
465 0 : goto TMulKS_free_ap;
466 : }
467 :
468 99209111 : MPN_ZERO (ap, an);
469 199916421 : MPN_ZERO (cp, cn);
470 :
471 : /* a is reverted */
472 2606501 : for (i = 0; i <= m; i++)
473 2598409 : if (SIZ(a[i]))
474 2598397 : MPN_COPY (ap + ((rev) ? (m - i) : i) * s, PTR(a[i]), SIZ(a[i]));
475 5540945 : for (i = 0; i <= l; i++)
476 5532853 : if (SIZ(c[i]))
477 5532837 : MPN_COPY (cp + i * s, PTR(c[i]), SIZ(c[i]));
478 :
479 : #ifdef FFT_WRAP
480 : /* the product rev(a) * c has m+l+1 coefficients.
481 : We throw away the first m and the last l-n <= m.
482 : If we compute mod (m+n+1) * s limbs, we are ok */
483 8092 : bn = mpn_mulmod_bnm1_next_size ((m + n + 1) * s);
484 :
485 8092 : bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
486 8092 : if (bp == NULL)
487 : {
488 0 : ret = 1;
489 0 : goto TMulKS_free_cp;
490 : }
491 : {
492 : mp_ptr tp;
493 8092 : tp = (mp_ptr) malloc ((2 * bn + 4) * sizeof (mp_limb_t));
494 8092 : if (tp == NULL)
495 : {
496 0 : ret = 1;
497 0 : goto TMulKS_free_cp;
498 : }
499 : /* mpn_mulmod_bnm1 requires that the first operand is larger */
500 8092 : if (an >= cn)
501 1158 : mpn_mulmod_bnm1 (bp, bn, ap, an, cp, cn, tp);
502 : else
503 6934 : mpn_mulmod_bnm1 (bp, bn, cp, cn, ap, an, tp);
504 8092 : free (tp);
505 : }
506 : #else /* FFT_WRAP is not defined */
507 : bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
508 : if (bp == NULL)
509 : {
510 : ret = 1;
511 : goto TMulKS_free_cp;
512 : }
513 : if (an >= cn)
514 : mpn_mul (bp, ap, an, cp, cn);
515 : else
516 : mpn_mul (bp, cp, cn, ap, an);
517 : #endif
518 :
519 : /* recover coefficients of degree m to n+m of product in b[0..n] */
520 8092 : bp += m * s;
521 2950628 : for (i = 0; i <= n; i++)
522 : {
523 2942536 : t = s;
524 3012678 : MPN_NORMALIZE(bp, t);
525 2942536 : MPZ_REALLOC (b[i], (mp_size_t) t);
526 2942536 : if (t)
527 2942536 : MPN_COPY (PTR(b[i]), bp, t);
528 2942536 : SIZ(b[i]) = t;
529 2942536 : bp += s;
530 : }
531 8092 : bp -= (m + n + 1) * s;
532 :
533 8092 : free (bp);
534 8092 : TMulKS_free_cp:
535 8092 : free (cp);
536 8092 : TMulKS_free_ap:
537 8092 : free (ap);
538 :
539 : #ifdef DEBUG
540 : fprintf (ECM_STDOUT, "%ldms\n", elltime (st, cputime ()));
541 : #endif
542 :
543 8092 : TMulKS_end:
544 8092 : return ret;
545 : }
546 :
547 : unsigned int
548 75877 : ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n)
549 : {
550 : #ifdef FFT_WRAP
551 : mp_size_t t, s;
552 : unsigned long i, m;
553 :
554 75877 : t = mpz_sizeinbase (n, 2);
555 75877 : s = t * 2 + 1;
556 327724 : for (i = k - 1; i; s++, i >>= 1);
557 75877 : s = 1 + (s - 1) / GMP_NUMB_BITS;
558 75877 : i = mpn_mulmod_bnm1_next_size (m0 * s);
559 919780 : while (i % s)
560 843903 : i = mpn_mulmod_bnm1_next_size (i + 1);
561 75877 : m = i / s;
562 75877 : return m;
563 : #else
564 : return ~ (unsigned int) 0;
565 : #endif
566 : }
567 :
568 : /* multiply in R[] A[0]+A[1]*x+...+A[k-1]*x^(k-1)
569 : by B[0]+B[1]*x+...+B[l-1]*x^(l-1) modulo n,
570 : wrapping around coefficients of the product up from degree m >= m0.
571 : Assumes k >= l.
572 : R is assumed to have 2*m0-3+list_mul_mem(m0-1) allocated cells.
573 : Return m (or 0 if an error occurred).
574 : */
575 : unsigned int
576 75829 : ks_wrapmul (listz_t R, unsigned int m0,
577 : listz_t A, unsigned int k,
578 : listz_t B, unsigned int l,
579 : mpz_t n)
580 : {
581 : #ifndef FFT_WRAP
582 : ASSERT_ALWAYS(0); /* ks_wrapmul should not be called in that case */
583 : return 0;
584 : #else
585 : unsigned long i, m, t;
586 : mp_size_t s, size_t0, size_t1, size_tmp;
587 : mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr, tp;
588 :
589 : ASSERT(k >= l);
590 :
591 75829 : t = mpz_sizeinbase (n, 2);
592 1061485 : for (i = 0; i < k; i++)
593 985656 : if (mpz_sgn (A[i]) < 0 || mpz_sizeinbase (A[i], 2) > t)
594 0 : mpz_mod (A[i], A[i], n);
595 909827 : for (i = 0; i < l; i++)
596 833998 : if (mpz_sgn (B[i]) < 0 || mpz_sizeinbase (B[i], 2) > t)
597 0 : mpz_mod (B[i], B[i], n);
598 :
599 75829 : s = t * 2 + 1; /* one extra sign bit */
600 327203 : for (i = k - 1; i; s++, i >>= 1);
601 :
602 75829 : s = 1 + (s - 1) / GMP_NUMB_BITS;
603 :
604 75829 : size_t0 = s * k;
605 75829 : size_t1 = s * l;
606 :
607 : /* allocate one double-buffer to save malloc/MPN_ZERO/free calls */
608 75829 : t0_ptr = (mp_ptr) malloc (size_t0 * sizeof (mp_limb_t));
609 75829 : if (t0_ptr == NULL)
610 0 : return 0;
611 75829 : t1_ptr = (mp_ptr) malloc (size_t1 * sizeof (mp_limb_t));
612 75829 : if (t1_ptr == NULL)
613 : {
614 0 : free (t0_ptr);
615 0 : return 0;
616 : }
617 :
618 26128645 : MPN_ZERO (t0_ptr, size_t0);
619 22961289 : MPN_ZERO (t1_ptr, size_t1);
620 :
621 1061485 : for (i = 0; i < k; i++)
622 985656 : if (SIZ(A[i]))
623 985656 : MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i]));
624 909827 : for (i = 0; i < l; i++)
625 833998 : if (SIZ(B[i]))
626 833998 : MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i]));
627 :
628 75829 : i = mpn_mulmod_bnm1_next_size (m0 * s);
629 : /* the following loop ensures we don't cut in the middle of a
630 : coefficient */
631 901205 : while (i % s)
632 825376 : i = mpn_mulmod_bnm1_next_size (i + 1);
633 : ASSERT(i % s == 0);
634 75829 : m = i / s;
635 : ASSERT(m <= 2 * m0 - 3 + list_mul_mem (m0 - 1));
636 :
637 75829 : t2_ptr = (mp_ptr) malloc ((i + 1) * sizeof (mp_limb_t));
638 75829 : if (t2_ptr == NULL)
639 : {
640 0 : free (t0_ptr);
641 0 : free (t1_ptr);
642 0 : return 0;
643 : }
644 :
645 : {
646 75829 : mp_ptr tp = malloc ((2 * i + 4) * sizeof (mp_limb_t));
647 75829 : if (tp == NULL)
648 : {
649 0 : free (t0_ptr);
650 0 : free (t1_ptr);
651 0 : return 0;
652 : }
653 75829 : mpn_mulmod_bnm1 (t2_ptr, i, t0_ptr, size_t0, t1_ptr, size_t1, tp);
654 75829 : if ((mp_size_t) i > size_t0 + size_t1)
655 1204192 : MPN_ZERO(t2_ptr + size_t0 + size_t1, i - (size_t0 + size_t1));
656 75829 : free (tp);
657 : }
658 :
659 1528068 : for (t = 0, tp = t2_ptr; t < m; t++, tp += s)
660 : {
661 1452239 : size_tmp = s;
662 4209389 : MPN_NORMALIZE(tp, size_tmp);
663 1452239 : r_ptr = MPZ_REALLOC (R[t], size_tmp);
664 1452239 : if (size_tmp)
665 1381462 : MPN_COPY (r_ptr, tp, size_tmp);
666 1452239 : SIZ(R[t]) = size_tmp;
667 : }
668 :
669 75829 : free (t0_ptr);
670 75829 : free (t1_ptr);
671 75829 : free (t2_ptr);
672 :
673 75829 : return m;
674 : #endif /* FFT_WRAP */
675 : }
|