1/*
2 * Copyright (c) 2008-2020 Stefan Krah. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28
29#ifndef LIBMPDEC_UMODARITH_H_
30#define LIBMPDEC_UMODARITH_H_
31
32
33#include "mpdecimal.h"
34
35#include "constants.h"
36#include "typearith.h"
37
38
39/* Bignum: Low level routines for unsigned modular arithmetic. These are
40 used in the fast convolution functions for very large coefficients. */
41
42
43/**************************************************************************/
44/* ANSI modular arithmetic */
45/**************************************************************************/
46
47
48/*
49 * Restrictions: a < m and b < m
50 * ACL2 proof: umodarith.lisp: addmod-correct
51 */
52static inline mpd_uint_t
53addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
54{
55 mpd_uint_t s;
56
57 s = a + b;
58 s = (s < a) ? s - m : s;
59 s = (s >= m) ? s - m : s;
60
61 return s;
62}
63
64/*
65 * Restrictions: a < m and b < m
66 * ACL2 proof: umodarith.lisp: submod-2-correct
67 */
68static inline mpd_uint_t
69submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
70{
71 mpd_uint_t d;
72
73 d = a - b;
74 d = (a < b) ? d + m : d;
75
76 return d;
77}
78
79/*
80 * Restrictions: a < 2m and b < 2m
81 * ACL2 proof: umodarith.lisp: section ext-submod
82 */
83static inline mpd_uint_t
84ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
85{
86 mpd_uint_t d;
87
88 a = (a >= m) ? a - m : a;
89 b = (b >= m) ? b - m : b;
90
91 d = a - b;
92 d = (a < b) ? d + m : d;
93
94 return d;
95}
96
97/*
98 * Reduce double word modulo m.
99 * Restrictions: m != 0
100 * ACL2 proof: umodarith.lisp: section dw-reduce
101 */
102static inline mpd_uint_t
103dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
104{
105 mpd_uint_t r1, r2, w;
106
107 _mpd_div_word(&w, &r1, hi, m);
108 _mpd_div_words(&w, &r2, r1, lo, m);
109
110 return r2;
111}
112
113/*
114 * Subtract double word from a.
115 * Restrictions: a < m
116 * ACL2 proof: umodarith.lisp: section dw-submod
117 */
118static inline mpd_uint_t
119dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
120{
121 mpd_uint_t d, r;
122
123 r = dw_reduce(hi, lo, m);
124 d = a - r;
125 d = (a < r) ? d + m : d;
126
127 return d;
128}
129
130#ifdef CONFIG_64
131
132/**************************************************************************/
133/* 64-bit modular arithmetic */
134/**************************************************************************/
135
136/*
137 * A proof of the algorithm is in literature/mulmod-64.txt. An ACL2
138 * proof is in umodarith.lisp: section "Fast modular reduction".
139 *
140 * Algorithm: calculate (a * b) % p:
141 *
142 * a) hi, lo <- a * b # Calculate a * b.
143 *
144 * b) hi, lo <- R(hi, lo) # Reduce modulo p.
145 *
146 * c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.
147 *
148 * d) If the result is less than p, return lo. Otherwise return lo - p.
149 */
150
151static inline mpd_uint_t
152x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
153{
154 mpd_uint_t hi, lo, x, y;
155
156
157 _mpd_mul_words(&hi, &lo, a, b);
158
159 if (m & (1ULL<<32)) { /* P1 */
160
161 /* first reduction */
162 x = y = hi;
163 hi >>= 32;
164
165 x = lo - x;
166 if (x > lo) hi--;
167
168 y <<= 32;
169 lo = y + x;
170 if (lo < y) hi++;
171
172 /* second reduction */
173 x = y = hi;
174 hi >>= 32;
175
176 x = lo - x;
177 if (x > lo) hi--;
178
179 y <<= 32;
180 lo = y + x;
181 if (lo < y) hi++;
182
183 return (hi || lo >= m ? lo - m : lo);
184 }
185 else if (m & (1ULL<<34)) { /* P2 */
186
187 /* first reduction */
188 x = y = hi;
189 hi >>= 30;
190
191 x = lo - x;
192 if (x > lo) hi--;
193
194 y <<= 34;
195 lo = y + x;
196 if (lo < y) hi++;
197
198 /* second reduction */
199 x = y = hi;
200 hi >>= 30;
201
202 x = lo - x;
203 if (x > lo) hi--;
204
205 y <<= 34;
206 lo = y + x;
207 if (lo < y) hi++;
208
209 /* third reduction */
210 x = y = hi;
211 hi >>= 30;
212
213 x = lo - x;
214 if (x > lo) hi--;
215
216 y <<= 34;
217 lo = y + x;
218 if (lo < y) hi++;
219
220 return (hi || lo >= m ? lo - m : lo);
221 }
222 else { /* P3 */
223
224 /* first reduction */
225 x = y = hi;
226 hi >>= 24;
227
228 x = lo - x;
229 if (x > lo) hi--;
230
231 y <<= 40;
232 lo = y + x;
233 if (lo < y) hi++;
234
235 /* second reduction */
236 x = y = hi;
237 hi >>= 24;
238
239 x = lo - x;
240 if (x > lo) hi--;
241
242 y <<= 40;
243 lo = y + x;
244 if (lo < y) hi++;
245
246 /* third reduction */
247 x = y = hi;
248 hi >>= 24;
249
250 x = lo - x;
251 if (x > lo) hi--;
252
253 y <<= 40;
254 lo = y + x;
255 if (lo < y) hi++;
256
257 return (hi || lo >= m ? lo - m : lo);
258 }
259}
260
261static inline void
262x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
263{
264 *a = x64_mulmod(*a, w, m);
265 *b = x64_mulmod(*b, w, m);
266}
267
268static inline void
269x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
270 mpd_uint_t m)
271{
272 *a0 = x64_mulmod(*a0, b0, m);
273 *a1 = x64_mulmod(*a1, b1, m);
274}
275
276static inline mpd_uint_t
277x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
278{
279 mpd_uint_t r = 1;
280
281 while (exp > 0) {
282 if (exp & 1)
283 r = x64_mulmod(r, base, umod);
284 base = x64_mulmod(base, base, umod);
285 exp >>= 1;
286 }
287
288 return r;
289}
290
291/* END CONFIG_64 */
292#else /* CONFIG_32 */
293
294
295/**************************************************************************/
296/* 32-bit modular arithmetic */
297/**************************************************************************/
298
299#if defined(ANSI)
300#if !defined(LEGACY_COMPILER)
301/* HAVE_UINT64_T */
302static inline mpd_uint_t
303std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
304{
305 return ((mpd_uuint_t) a * b) % m;
306}
307
308static inline void
309std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
310{
311 *a = ((mpd_uuint_t) *a * w) % m;
312 *b = ((mpd_uuint_t) *b * w) % m;
313}
314
315static inline void
316std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
317 mpd_uint_t m)
318{
319 *a0 = ((mpd_uuint_t) *a0 * b0) % m;
320 *a1 = ((mpd_uuint_t) *a1 * b1) % m;
321}
322/* END HAVE_UINT64_T */
323#else
324/* LEGACY_COMPILER */
325static inline mpd_uint_t
326std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
327{
328 mpd_uint_t hi, lo, q, r;
329 _mpd_mul_words(&hi, &lo, a, b);
330 _mpd_div_words(&q, &r, hi, lo, m);
331 return r;
332}
333
334static inline void
335std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
336{
337 *a = std_mulmod(*a, w, m);
338 *b = std_mulmod(*b, w, m);
339}
340
341static inline void
342std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
343 mpd_uint_t m)
344{
345 *a0 = std_mulmod(*a0, b0, m);
346 *a1 = std_mulmod(*a1, b1, m);
347}
348/* END LEGACY_COMPILER */
349#endif
350
351static inline mpd_uint_t
352std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
353{
354 mpd_uint_t r = 1;
355
356 while (exp > 0) {
357 if (exp & 1)
358 r = std_mulmod(r, base, umod);
359 base = std_mulmod(base, base, umod);
360 exp >>= 1;
361 }
362
363 return r;
364}
365#endif /* ANSI CONFIG_32 */
366
367
368/**************************************************************************/
369/* Pentium Pro modular arithmetic */
370/**************************************************************************/
371
372/*
373 * A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU
374 * control word must be set to 64-bit precision and truncation mode
375 * prior to using these functions.
376 *
377 * Algorithm: calculate (a * b) % p:
378 *
379 * p := prime < 2**31
380 * pinv := (long double)1.0 / p (precalculated)
381 *
382 * a) n = a * b # Calculate exact product.
383 * b) qest = n * pinv # Calculate estimate for q = n / p.
384 * c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
385 * d) r = n - q * p # Calculate remainder.
386 *
387 * Remarks:
388 *
389 * - p = dmod and pinv = dinvmod.
390 * - dinvmod points to an array of three uint32_t, which is interpreted
391 * as an 80 bit long double by fldt.
392 * - Intel compilers prior to version 11 do not seem to handle the
393 * __GNUC__ inline assembly correctly.
394 * - random tests are provided in tests/extended/ppro_mulmod.c
395 */
396
397#if defined(PPRO)
398#if defined(ASM)
399
400/* Return (a * b) % dmod */
401static inline mpd_uint_t
402ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
403{
404 mpd_uint_t retval;
405
406 __asm__ (
407 "fildl %2\n\t"
408 "fildl %1\n\t"
409 "fmulp %%st, %%st(1)\n\t"
410 "fldt (%4)\n\t"
411 "fmul %%st(1), %%st\n\t"
412 "flds %5\n\t"
413 "fadd %%st, %%st(1)\n\t"
414 "fsubrp %%st, %%st(1)\n\t"
415 "fldl (%3)\n\t"
416 "fmulp %%st, %%st(1)\n\t"
417 "fsubrp %%st, %%st(1)\n\t"
418 "fistpl %0\n\t"
419 : "=m" (retval)
420 : "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)
421 : "st", "memory"
422 );
423
424 return retval;
425}
426
427/*
428 * Two modular multiplications in parallel:
429 * *a0 = (*a0 * w) % dmod
430 * *a1 = (*a1 * w) % dmod
431 */
432static inline void
433ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
434 double *dmod, uint32_t *dinvmod)
435{
436 __asm__ (
437 "fildl %2\n\t"
438 "fildl (%1)\n\t"
439 "fmul %%st(1), %%st\n\t"
440 "fxch %%st(1)\n\t"
441 "fildl (%0)\n\t"
442 "fmulp %%st, %%st(1) \n\t"
443 "fldt (%4)\n\t"
444 "flds %5\n\t"
445 "fld %%st(2)\n\t"
446 "fmul %%st(2)\n\t"
447 "fadd %%st(1)\n\t"
448 "fsub %%st(1)\n\t"
449 "fmull (%3)\n\t"
450 "fsubrp %%st, %%st(3)\n\t"
451 "fxch %%st(2)\n\t"
452 "fistpl (%0)\n\t"
453 "fmul %%st(2)\n\t"
454 "fadd %%st(1)\n\t"
455 "fsubp %%st, %%st(1)\n\t"
456 "fmull (%3)\n\t"
457 "fsubrp %%st, %%st(1)\n\t"
458 "fistpl (%1)\n\t"
459 : : "r" (a0), "r" (a1), "m" (w),
460 "r" (dmod), "r" (dinvmod),
461 "m" (MPD_TWO63)
462 : "st", "memory"
463 );
464}
465
466/*
467 * Two modular multiplications in parallel:
468 * *a0 = (*a0 * b0) % dmod
469 * *a1 = (*a1 * b1) % dmod
470 */
471static inline void
472ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
473 double *dmod, uint32_t *dinvmod)
474{
475 __asm__ (
476 "fildl %3\n\t"
477 "fildl (%2)\n\t"
478 "fmulp %%st, %%st(1)\n\t"
479 "fildl %1\n\t"
480 "fildl (%0)\n\t"
481 "fmulp %%st, %%st(1)\n\t"
482 "fldt (%5)\n\t"
483 "fld %%st(2)\n\t"
484 "fmul %%st(1), %%st\n\t"
485 "fxch %%st(1)\n\t"
486 "fmul %%st(2), %%st\n\t"
487 "flds %6\n\t"
488 "fldl (%4)\n\t"
489 "fxch %%st(3)\n\t"
490 "fadd %%st(1), %%st\n\t"
491 "fxch %%st(2)\n\t"
492 "fadd %%st(1), %%st\n\t"
493 "fxch %%st(2)\n\t"
494 "fsub %%st(1), %%st\n\t"
495 "fxch %%st(2)\n\t"
496 "fsubp %%st, %%st(1)\n\t"
497 "fxch %%st(1)\n\t"
498 "fmul %%st(2), %%st\n\t"
499 "fxch %%st(1)\n\t"
500 "fmulp %%st, %%st(2)\n\t"
501 "fsubrp %%st, %%st(3)\n\t"
502 "fsubrp %%st, %%st(1)\n\t"
503 "fxch %%st(1)\n\t"
504 "fistpl (%2)\n\t"
505 "fistpl (%0)\n\t"
506 : : "r" (a0), "m" (b0), "r" (a1), "m" (b1),
507 "r" (dmod), "r" (dinvmod),
508 "m" (MPD_TWO63)
509 : "st", "memory"
510 );
511}
512/* END PPRO GCC ASM */
513#elif defined(MASM)
514
515/* Return (a * b) % dmod */
516static inline mpd_uint_t __cdecl
517ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
518{
519 mpd_uint_t retval;
520
521 __asm {
522 mov eax, dinvmod
523 mov edx, dmod
524 fild b
525 fild a
526 fmulp st(1), st
527 fld TBYTE PTR [eax]
528 fmul st, st(1)
529 fld MPD_TWO63
530 fadd st(1), st
531 fsubp st(1), st
532 fld QWORD PTR [edx]
533 fmulp st(1), st
534 fsubp st(1), st
535 fistp retval
536 }
537
538 return retval;
539}
540
541/*
542 * Two modular multiplications in parallel:
543 * *a0 = (*a0 * w) % dmod
544 * *a1 = (*a1 * w) % dmod
545 */
546static inline mpd_uint_t __cdecl
547ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
548 double *dmod, uint32_t *dinvmod)
549{
550 __asm {
551 mov ecx, dmod
552 mov edx, a1
553 mov ebx, dinvmod
554 mov eax, a0
555 fild w
556 fild DWORD PTR [edx]
557 fmul st, st(1)
558 fxch st(1)
559 fild DWORD PTR [eax]
560 fmulp st(1), st
561 fld TBYTE PTR [ebx]
562 fld MPD_TWO63
563 fld st(2)
564 fmul st, st(2)
565 fadd st, st(1)
566 fsub st, st(1)
567 fmul QWORD PTR [ecx]
568 fsubp st(3), st
569 fxch st(2)
570 fistp DWORD PTR [eax]
571 fmul st, st(2)
572 fadd st, st(1)
573 fsubrp st(1), st
574 fmul QWORD PTR [ecx]
575 fsubp st(1), st
576 fistp DWORD PTR [edx]
577 }
578}
579
580/*
581 * Two modular multiplications in parallel:
582 * *a0 = (*a0 * b0) % dmod
583 * *a1 = (*a1 * b1) % dmod
584 */
585static inline void __cdecl
586ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
587 double *dmod, uint32_t *dinvmod)
588{
589 __asm {
590 mov ecx, dmod
591 mov edx, a1
592 mov ebx, dinvmod
593 mov eax, a0
594 fild b1
595 fild DWORD PTR [edx]
596 fmulp st(1), st
597 fild b0
598 fild DWORD PTR [eax]
599 fmulp st(1), st
600 fld TBYTE PTR [ebx]
601 fld st(2)
602 fmul st, st(1)
603 fxch st(1)
604 fmul st, st(2)
605 fld DWORD PTR MPD_TWO63
606 fld QWORD PTR [ecx]
607 fxch st(3)
608 fadd st, st(1)
609 fxch st(2)
610 fadd st, st(1)
611 fxch st(2)
612 fsub st, st(1)
613 fxch st(2)
614 fsubrp st(1), st
615 fxch st(1)
616 fmul st, st(2)
617 fxch st(1)
618 fmulp st(2), st
619 fsubp st(3), st
620 fsubp st(1), st
621 fxch st(1)
622 fistp DWORD PTR [edx]
623 fistp DWORD PTR [eax]
624 }
625}
626#endif /* PPRO MASM (_MSC_VER) */
627
628
629/* Return (base ** exp) % dmod */
630static inline mpd_uint_t
631ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)
632{
633 mpd_uint_t r = 1;
634
635 while (exp > 0) {
636 if (exp & 1)
637 r = ppro_mulmod(r, base, dmod, dinvmod);
638 base = ppro_mulmod(base, base, dmod, dinvmod);
639 exp >>= 1;
640 }
641
642 return r;
643}
644#endif /* PPRO */
645#endif /* CONFIG_32 */
646
647
648#endif /* LIBMPDEC_UMODARITH_H_ */
649