1 | /* |
2 | * Copyright (c) 2008-2020 Stefan Krah. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * |
11 | * 2. Redistributions in binary form must reproduce the above copyright |
12 | * notice, this list of conditions and the following disclaimer in the |
13 | * documentation and/or other materials provided with the distribution. |
14 | * |
15 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND |
16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
19 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
20 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
21 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
22 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
23 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
24 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
25 | * SUCH DAMAGE. |
26 | */ |
27 | |
28 | |
29 | #include "mpdecimal.h" |
30 | |
31 | #include <assert.h> |
32 | |
33 | #include "constants.h" |
34 | #include "fourstep.h" |
35 | #include "numbertheory.h" |
36 | #include "sixstep.h" |
37 | #include "umodarith.h" |
38 | |
39 | |
40 | /* Bignum: Cache efficient Matrix Fourier Transform for arrays of the |
41 | form 3 * 2**n (See literature/matrix-transform.txt). */ |
42 | |
43 | |
44 | #ifndef PPRO |
45 | static inline void |
46 | std_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, |
47 | mpd_uint_t w3table[3], mpd_uint_t umod) |
48 | { |
49 | mpd_uint_t r1, r2; |
50 | mpd_uint_t w; |
51 | mpd_uint_t s, tmp; |
52 | |
53 | |
54 | /* k = 0 -> w = 1 */ |
55 | s = *x1; |
56 | s = addmod(s, *x2, umod); |
57 | s = addmod(s, *x3, umod); |
58 | |
59 | r1 = s; |
60 | |
61 | /* k = 1 */ |
62 | s = *x1; |
63 | |
64 | w = w3table[1]; |
65 | tmp = MULMOD(*x2, w); |
66 | s = addmod(s, tmp, umod); |
67 | |
68 | w = w3table[2]; |
69 | tmp = MULMOD(*x3, w); |
70 | s = addmod(s, tmp, umod); |
71 | |
72 | r2 = s; |
73 | |
74 | /* k = 2 */ |
75 | s = *x1; |
76 | |
77 | w = w3table[2]; |
78 | tmp = MULMOD(*x2, w); |
79 | s = addmod(s, tmp, umod); |
80 | |
81 | w = w3table[1]; |
82 | tmp = MULMOD(*x3, w); |
83 | s = addmod(s, tmp, umod); |
84 | |
85 | *x3 = s; |
86 | *x2 = r2; |
87 | *x1 = r1; |
88 | } |
89 | #else /* PPRO */ |
90 | static inline void |
91 | ppro_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, mpd_uint_t w3table[3], |
92 | mpd_uint_t umod, double *dmod, uint32_t dinvmod[3]) |
93 | { |
94 | mpd_uint_t r1, r2; |
95 | mpd_uint_t w; |
96 | mpd_uint_t s, tmp; |
97 | |
98 | |
99 | /* k = 0 -> w = 1 */ |
100 | s = *x1; |
101 | s = addmod(s, *x2, umod); |
102 | s = addmod(s, *x3, umod); |
103 | |
104 | r1 = s; |
105 | |
106 | /* k = 1 */ |
107 | s = *x1; |
108 | |
109 | w = w3table[1]; |
110 | tmp = ppro_mulmod(*x2, w, dmod, dinvmod); |
111 | s = addmod(s, tmp, umod); |
112 | |
113 | w = w3table[2]; |
114 | tmp = ppro_mulmod(*x3, w, dmod, dinvmod); |
115 | s = addmod(s, tmp, umod); |
116 | |
117 | r2 = s; |
118 | |
119 | /* k = 2 */ |
120 | s = *x1; |
121 | |
122 | w = w3table[2]; |
123 | tmp = ppro_mulmod(*x2, w, dmod, dinvmod); |
124 | s = addmod(s, tmp, umod); |
125 | |
126 | w = w3table[1]; |
127 | tmp = ppro_mulmod(*x3, w, dmod, dinvmod); |
128 | s = addmod(s, tmp, umod); |
129 | |
130 | *x3 = s; |
131 | *x2 = r2; |
132 | *x1 = r1; |
133 | } |
134 | #endif |
135 | |
136 | |
137 | /* forward transform, sign = -1; transform length = 3 * 2**n */ |
138 | int |
139 | four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum) |
140 | { |
141 | mpd_size_t R = 3; /* number of rows */ |
142 | mpd_size_t C = n / 3; /* number of columns */ |
143 | mpd_uint_t w3table[3]; |
144 | mpd_uint_t kernel, w0, w1, wstep; |
145 | mpd_uint_t *s, *p0, *p1, *p2; |
146 | mpd_uint_t umod; |
147 | #ifdef PPRO |
148 | double dmod; |
149 | uint32_t dinvmod[3]; |
150 | #endif |
151 | mpd_size_t i, k; |
152 | |
153 | |
154 | assert(n >= 48); |
155 | assert(n <= 3*MPD_MAXTRANSFORM_2N); |
156 | |
157 | |
158 | /* Length R transform on the columns. */ |
159 | SETMODULUS(modnum); |
160 | _mpd_init_w3table(w3table, -1, modnum); |
161 | for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) { |
162 | |
163 | SIZE3_NTT(p0, p1, p2, w3table); |
164 | } |
165 | |
166 | /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */ |
167 | kernel = _mpd_getkernel(n, -1, modnum); |
168 | for (i = 1; i < R; i++) { |
169 | w0 = 1; /* r**(i*0): initial value for k=0 */ |
170 | w1 = POWMOD(kernel, i); /* r**(i*1): initial value for k=1 */ |
171 | wstep = MULMOD(w1, w1); /* r**(2*i) */ |
172 | for (k = 0; k < C-1; k += 2) { |
173 | mpd_uint_t x0 = a[i*C+k]; |
174 | mpd_uint_t x1 = a[i*C+k+1]; |
175 | MULMOD2(&x0, w0, &x1, w1); |
176 | MULMOD2C(&w0, &w1, wstep); /* r**(i*(k+2)) = r**(i*k) * r**(2*i) */ |
177 | a[i*C+k] = x0; |
178 | a[i*C+k+1] = x1; |
179 | } |
180 | } |
181 | |
182 | /* Length C transform on the rows. */ |
183 | for (s = a; s < a+n; s += C) { |
184 | if (!six_step_fnt(s, C, modnum)) { |
185 | return 0; |
186 | } |
187 | } |
188 | |
189 | #if 0 |
190 | /* An unordered transform is sufficient for convolution. */ |
191 | /* Transpose the matrix. */ |
192 | #include "transpose.h" |
193 | transpose_3xpow2(a, R, C); |
194 | #endif |
195 | |
196 | return 1; |
197 | } |
198 | |
199 | /* backward transform, sign = 1; transform length = 3 * 2**n */ |
200 | int |
201 | inv_four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum) |
202 | { |
203 | mpd_size_t R = 3; /* number of rows */ |
204 | mpd_size_t C = n / 3; /* number of columns */ |
205 | mpd_uint_t w3table[3]; |
206 | mpd_uint_t kernel, w0, w1, wstep; |
207 | mpd_uint_t *s, *p0, *p1, *p2; |
208 | mpd_uint_t umod; |
209 | #ifdef PPRO |
210 | double dmod; |
211 | uint32_t dinvmod[3]; |
212 | #endif |
213 | mpd_size_t i, k; |
214 | |
215 | |
216 | assert(n >= 48); |
217 | assert(n <= 3*MPD_MAXTRANSFORM_2N); |
218 | |
219 | |
220 | #if 0 |
221 | /* An unordered transform is sufficient for convolution. */ |
222 | /* Transpose the matrix, producing an R*C matrix. */ |
223 | #include "transpose.h" |
224 | transpose_3xpow2(a, C, R); |
225 | #endif |
226 | |
227 | /* Length C transform on the rows. */ |
228 | for (s = a; s < a+n; s += C) { |
229 | if (!inv_six_step_fnt(s, C, modnum)) { |
230 | return 0; |
231 | } |
232 | } |
233 | |
234 | /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */ |
235 | SETMODULUS(modnum); |
236 | kernel = _mpd_getkernel(n, 1, modnum); |
237 | for (i = 1; i < R; i++) { |
238 | w0 = 1; |
239 | w1 = POWMOD(kernel, i); |
240 | wstep = MULMOD(w1, w1); |
241 | for (k = 0; k < C; k += 2) { |
242 | mpd_uint_t x0 = a[i*C+k]; |
243 | mpd_uint_t x1 = a[i*C+k+1]; |
244 | MULMOD2(&x0, w0, &x1, w1); |
245 | MULMOD2C(&w0, &w1, wstep); |
246 | a[i*C+k] = x0; |
247 | a[i*C+k+1] = x1; |
248 | } |
249 | } |
250 | |
251 | /* Length R transform on the columns. */ |
252 | _mpd_init_w3table(w3table, 1, modnum); |
253 | for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) { |
254 | |
255 | SIZE3_NTT(p0, p1, p2, w3table); |
256 | } |
257 | |
258 | return 1; |
259 | } |
260 | |