1 | /* |
2 | * Copyright (c) 2008-2020 Stefan Krah. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * |
11 | * 2. Redistributions in binary form must reproduce the above copyright |
12 | * notice, this list of conditions and the following disclaimer in the |
13 | * documentation and/or other materials provided with the distribution. |
14 | * |
15 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND |
16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
19 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
20 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
21 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
22 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
23 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
24 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
25 | * SUCH DAMAGE. |
26 | */ |
27 | |
28 | |
29 | #include "mpdecimal.h" |
30 | |
31 | #include <assert.h> |
32 | #include <limits.h> |
33 | #include <stdio.h> |
34 | #include <stdlib.h> |
35 | #include <string.h> |
36 | |
37 | #include "bits.h" |
38 | #include "constants.h" |
39 | #include "transpose.h" |
40 | #include "typearith.h" |
41 | |
42 | |
43 | #define BUFSIZE 4096 |
44 | #define SIDE 128 |
45 | |
46 | |
47 | /* Bignum: The transpose functions are used for very large transforms |
48 | in sixstep.c and fourstep.c. */ |
49 | |
50 | |
51 | /* Definition of the matrix transpose */ |
52 | void |
53 | std_trans(mpd_uint_t dest[], mpd_uint_t src[], mpd_size_t rows, mpd_size_t cols) |
54 | { |
55 | mpd_size_t idest, isrc; |
56 | mpd_size_t r, c; |
57 | |
58 | for (r = 0; r < rows; r++) { |
59 | isrc = r * cols; |
60 | idest = r; |
61 | for (c = 0; c < cols; c++) { |
62 | dest[idest] = src[isrc]; |
63 | isrc += 1; |
64 | idest += rows; |
65 | } |
66 | } |
67 | } |
68 | |
69 | /* |
70 | * Swap half-rows of 2^n * (2*2^n) matrix. |
71 | * FORWARD_CYCLE: even/odd permutation of the halfrows. |
72 | * BACKWARD_CYCLE: reverse the even/odd permutation. |
73 | */ |
74 | static int |
75 | swap_halfrows_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir) |
76 | { |
77 | mpd_uint_t buf1[BUFSIZE]; |
78 | mpd_uint_t buf2[BUFSIZE]; |
79 | mpd_uint_t *readbuf, *writebuf, *hp; |
80 | mpd_size_t *done, dbits; |
81 | mpd_size_t b = BUFSIZE, stride; |
82 | mpd_size_t hn, hmax; /* halfrow number */ |
83 | mpd_size_t m, r=0; |
84 | mpd_size_t offset; |
85 | mpd_size_t next; |
86 | |
87 | |
88 | assert(cols == mul_size_t(2, rows)); |
89 | |
90 | if (dir == FORWARD_CYCLE) { |
91 | r = rows; |
92 | } |
93 | else if (dir == BACKWARD_CYCLE) { |
94 | r = 2; |
95 | } |
96 | else { |
97 | abort(); /* GCOV_NOT_REACHED */ |
98 | } |
99 | |
100 | m = cols - 1; |
101 | hmax = rows; /* cycles start at odd halfrows */ |
102 | dbits = 8 * sizeof *done; |
103 | if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) { |
104 | return 0; |
105 | } |
106 | |
107 | for (hn = 1; hn <= hmax; hn += 2) { |
108 | |
109 | if (done[hn/dbits] & mpd_bits[hn%dbits]) { |
110 | continue; |
111 | } |
112 | |
113 | readbuf = buf1; writebuf = buf2; |
114 | |
115 | for (offset = 0; offset < cols/2; offset += b) { |
116 | |
117 | stride = (offset + b < cols/2) ? b : cols/2-offset; |
118 | |
119 | hp = matrix + hn*cols/2; |
120 | memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); |
121 | pointerswap(&readbuf, &writebuf); |
122 | |
123 | next = mulmod_size_t(hn, r, m); |
124 | hp = matrix + next*cols/2; |
125 | |
126 | while (next != hn) { |
127 | |
128 | memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); |
129 | memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); |
130 | pointerswap(&readbuf, &writebuf); |
131 | |
132 | done[next/dbits] |= mpd_bits[next%dbits]; |
133 | |
134 | next = mulmod_size_t(next, r, m); |
135 | hp = matrix + next*cols/2; |
136 | |
137 | } |
138 | |
139 | memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); |
140 | |
141 | done[hn/dbits] |= mpd_bits[hn%dbits]; |
142 | } |
143 | } |
144 | |
145 | mpd_free(done); |
146 | return 1; |
147 | } |
148 | |
149 | /* In-place transpose of a square matrix */ |
150 | static inline void |
151 | squaretrans(mpd_uint_t *buf, mpd_size_t cols) |
152 | { |
153 | mpd_uint_t tmp; |
154 | mpd_size_t idest, isrc; |
155 | mpd_size_t r, c; |
156 | |
157 | for (r = 0; r < cols; r++) { |
158 | c = r+1; |
159 | isrc = r*cols + c; |
160 | idest = c*cols + r; |
161 | for (c = r+1; c < cols; c++) { |
162 | tmp = buf[isrc]; |
163 | buf[isrc] = buf[idest]; |
164 | buf[idest] = tmp; |
165 | isrc += 1; |
166 | idest += cols; |
167 | } |
168 | } |
169 | } |
170 | |
171 | /* |
172 | * Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into |
173 | * square blocks with side length 'SIDE'. First, the blocks are transposed, |
174 | * then a square transposition is done on each individual block. |
175 | */ |
176 | static void |
177 | squaretrans_pow2(mpd_uint_t *matrix, mpd_size_t size) |
178 | { |
179 | mpd_uint_t buf1[SIDE*SIDE]; |
180 | mpd_uint_t buf2[SIDE*SIDE]; |
181 | mpd_uint_t *to, *from; |
182 | mpd_size_t b = size; |
183 | mpd_size_t r, c; |
184 | mpd_size_t i; |
185 | |
186 | while (b > SIDE) b >>= 1; |
187 | |
188 | for (r = 0; r < size; r += b) { |
189 | |
190 | for (c = r; c < size; c += b) { |
191 | |
192 | from = matrix + r*size + c; |
193 | to = buf1; |
194 | for (i = 0; i < b; i++) { |
195 | memcpy(to, from, b*(sizeof *to)); |
196 | from += size; |
197 | to += b; |
198 | } |
199 | squaretrans(buf1, b); |
200 | |
201 | if (r == c) { |
202 | to = matrix + r*size + c; |
203 | from = buf1; |
204 | for (i = 0; i < b; i++) { |
205 | memcpy(to, from, b*(sizeof *to)); |
206 | from += b; |
207 | to += size; |
208 | } |
209 | continue; |
210 | } |
211 | else { |
212 | from = matrix + c*size + r; |
213 | to = buf2; |
214 | for (i = 0; i < b; i++) { |
215 | memcpy(to, from, b*(sizeof *to)); |
216 | from += size; |
217 | to += b; |
218 | } |
219 | squaretrans(buf2, b); |
220 | |
221 | to = matrix + c*size + r; |
222 | from = buf1; |
223 | for (i = 0; i < b; i++) { |
224 | memcpy(to, from, b*(sizeof *to)); |
225 | from += b; |
226 | to += size; |
227 | } |
228 | |
229 | to = matrix + r*size + c; |
230 | from = buf2; |
231 | for (i = 0; i < b; i++) { |
232 | memcpy(to, from, b*(sizeof *to)); |
233 | from += b; |
234 | to += size; |
235 | } |
236 | } |
237 | } |
238 | } |
239 | |
240 | } |
241 | |
242 | /* |
243 | * In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n) |
244 | * or a (2*2^n) x 2^n matrix. |
245 | */ |
246 | int |
247 | transpose_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols) |
248 | { |
249 | mpd_size_t size = mul_size_t(rows, cols); |
250 | |
251 | assert(ispower2(rows)); |
252 | assert(ispower2(cols)); |
253 | |
254 | if (cols == rows) { |
255 | squaretrans_pow2(matrix, rows); |
256 | } |
257 | else if (cols == mul_size_t(2, rows)) { |
258 | if (!swap_halfrows_pow2(matrix, rows, cols, FORWARD_CYCLE)) { |
259 | return 0; |
260 | } |
261 | squaretrans_pow2(matrix, rows); |
262 | squaretrans_pow2(matrix+(size/2), rows); |
263 | } |
264 | else if (rows == mul_size_t(2, cols)) { |
265 | squaretrans_pow2(matrix, cols); |
266 | squaretrans_pow2(matrix+(size/2), cols); |
267 | if (!swap_halfrows_pow2(matrix, cols, rows, BACKWARD_CYCLE)) { |
268 | return 0; |
269 | } |
270 | } |
271 | else { |
272 | abort(); /* GCOV_NOT_REACHED */ |
273 | } |
274 | |
275 | return 1; |
276 | } |
277 | |