jidctfst.c source code [tensorflow/external/libjpeg_turbo/jidctfst.c]

1	/*
2	* jidctfst.c
3	*
4	* This file was part of the Independent JPEG Group's software:
5	* Copyright (C) 1994-1998, Thomas G. Lane.
6	* libjpeg-turbo Modifications:
7	* Copyright (C) 2015, D. R. Commander.
8	* For conditions of distribution and use, see the accompanying README.ijg
9	* file.
10	*
11	* This file contains a fast, not so accurate integer implementation of the
12	* inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
13	* must also perform dequantization of the input coefficients.
14	*
15	* A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
16	* on each row (or vice versa, but it's more convenient to emit a row at
17	* a time). Direct algorithms are also available, but they are much more
18	* complex and seem not to be any faster when reduced to code.
19	*
20	* This implementation is based on Arai, Agui, and Nakajima's algorithm for
21	* scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
22	* Japanese, but the algorithm is described in the Pennebaker & Mitchell
23	* JPEG textbook (see REFERENCES section in file README.ijg). The following
24	* code is based directly on figure 4-8 in P&M.
25	* While an 8-point DCT cannot be done in less than 11 multiplies, it is
26	* possible to arrange the computation so that many of the multiplies are
27	* simple scalings of the final outputs. These multiplies can then be
28	* folded into the multiplications or divisions by the JPEG quantization
29	* table entries. The AA&N method leaves only 5 multiplies and 29 adds
30	* to be done in the DCT itself.
31	* The primary disadvantage of this method is that with fixed-point math,
32	* accuracy is lost due to imprecise representation of the scaled
33	* quantization values. The smaller the quantization table entry, the less
34	* precise the scaled value, so this implementation does worse with high-
35	* quality-setting files than with low-quality ones.
36	*/
37
38	#define JPEG_INTERNALS
39	#include "jinclude.h"
40	#include "jpeglib.h"
41	#include "jdct.h" /* Private declarations for DCT subsystem */
42
43	#ifdef DCT_IFAST_SUPPORTED
44
45
46	/*
47	* This module is specialized to the case DCTSIZE = 8.
48	*/
49
50	#if DCTSIZE != 8
51	Sorry, this code only copes with `8x8` DCTs. / deliberate syntax err /
52	#endif
53
54
/* Scaling decisions are generally the same as in the LL&M algorithm;
 * see jidctint.c for more details.  However, we choose to descale
 * (right shift) multiplication products as soon as they are formed,
 * rather than carrying additional fractional bits into subsequent additions.
 * This compromises accuracy slightly, but it lets us save a few shifts.
 * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
 * everywhere except in the multiplications proper; this saves a good deal
 * of work on 16-bit-int machines.
 *
 * The dequantized coefficients are not integers because the AA&N scaling
 * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
 * so that the first and second IDCT rounds have the same input scaling.
 * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
 * avoid a descaling shift; this compromises accuracy rather drastically
 * for small quantization table entries, but it saves a lot of shifts.
 * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
 * so we use a much larger scaling factor to preserve accuracy.
 *
 * A final compromise is to represent the multiplicative constants to only
 * 8 fractional bits, rather than 13.  This saves some shifting work on some
 * machines, and may also reduce the cost of multiplication (since there
 * are fewer one-bits in the constants).
 */

#if BITS_IN_JSAMPLE == 8
#define CONST_BITS  8
#define PASS1_BITS  2
#else
#define CONST_BITS  8
#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
#endif
86
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */

#if CONST_BITS == 8
#define FIX_1_082392200  ((JLONG)277)           /* FIX(1.082392200) */
#define FIX_1_414213562  ((JLONG)362)           /* FIX(1.414213562) */
#define FIX_1_847759065  ((JLONG)473)           /* FIX(1.847759065) */
#define FIX_2_613125930  ((JLONG)669)           /* FIX(2.613125930) */
#else
#define FIX_1_082392200  FIX(1.082392200)
#define FIX_1_414213562  FIX(1.414213562)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_2_613125930  FIX(2.613125930)
#endif
105
106
/* We can gain a little more speed, with a further compromise in accuracy,
 * by omitting the addition in a descaling shift.  This yields an incorrectly
 * rounded result half the time...
 *
 * Unless USE_ACCURATE_ROUNDING is defined, DESCALE is redefined here as a
 * plain RIGHT_SHIFT.
 */

#ifndef USE_ACCURATE_ROUNDING
#undef DESCALE
#define DESCALE(x, n)  RIGHT_SHIFT(x, n)
#endif
116
117
/* Multiply a DCTELEM variable by a JLONG constant, and immediately
 * descale (by CONST_BITS) to yield a DCTELEM result.
 */

#define MULTIPLY(var, const)  ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
123
124
/* Dequantize a coefficient by multiplying it by the multiplier-table
 * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
 * multiplication will do.  For 12-bit data, the multiplier table is
 * declared JLONG, so a 32-bit multiply will be used.
 */

#if BITS_IN_JSAMPLE == 8
#define DEQUANTIZE(coef, quantval)  (((IFAST_MULT_TYPE)(coef)) * (quantval))
#else
#define DEQUANTIZE(coef, quantval) \
  DESCALE((coef) * (quantval), IFAST_SCALE_BITS - PASS1_BITS)
#endif
137
138
/* Like DESCALE, but applies to a DCTELEM and produces an int.
 * We assume that int right shift is unsigned if JLONG right shift is.
 *
 * When RIGHT_SHIFT_IS_UNSIGNED is defined, IRIGHT_SHIFT needs a scratch
 * variable (declared by ISHIFT_TEMPS) and ORs in the high-order one bits for
 * negative inputs; otherwise it is a plain >> and ISHIFT_TEMPS expands to
 * nothing.
 */

#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define ISHIFT_TEMPS    DCTELEM ishift_temp;
#if BITS_IN_JSAMPLE == 8
#define DCTELEMBITS  16         /* DCTELEM may be 16 or 32 bits */
#else
#define DCTELEMBITS  32         /* DCTELEM must be 32 bits */
#endif
#define IRIGHT_SHIFT(x, shft) \
  ((ishift_temp = (x)) < 0 ? \
   (ishift_temp >> (shft)) | ((~((DCTELEM)0)) << (DCTELEMBITS - (shft))) : \
   (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
#define IRIGHT_SHIFT(x, shft)   ((x) >> (shft))
#endif

/* With USE_ACCURATE_ROUNDING, add half of the divisor before shifting. */
#ifdef USE_ACCURATE_ROUNDING
#define IDESCALE(x, n)  ((int)IRIGHT_SHIFT((x) + (1 << ((n) - 1)), n))
#else
#define IDESCALE(x, n)  ((int)IRIGHT_SHIFT(x, n))
#endif
164
165
166	/*
167	* Perform dequantization and inverse DCT on one block of coefficients.
168	*/
169
170	GLOBAL(void)
171	jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
172	JCOEFPTR coef_block, JSAMPARRAY output_buf,
173	JDIMENSION output_col)
174	{
175	DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
176	DCTELEM tmp10, tmp11, tmp12, tmp13;
177	DCTELEM z5, z10, z11, z12, z13;
178	JCOEFPTR inptr;
179	IFAST_MULT_TYPE *quantptr;
180	int *wsptr;
181	JSAMPROW outptr;
182	JSAMPLE *range_limit = IDCT_range_limit(cinfo);
183	int ctr;
184	int workspace[DCTSIZE2]; / buffers data between passes /
185	SHIFT_TEMPS / for DESCALE /
186	ISHIFT_TEMPS / for IDESCALE /
187
188	/ Pass 1: process columns from input, store into work array. /
189
190	inptr = coef_block;
191	quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
192	wsptr = workspace;
193	for (ctr = DCTSIZE; ctr > `0`; ctr--) {
194	/ Due to quantization, we will usually find that many of the input*
195	* coefficients are zero, especially the AC terms. We can exploit this
196	* by short-circuiting the IDCT calculation for any column in which all
197	* the AC terms are zero. In that case each output is equal to the
198	* DC coefficient (with scale factor as needed).
199	* With typical images and quantization tables, half or more of the
200	* column DCT calculations can be simplified this way.
201	*/
202
203	if (inptr[DCTSIZE * `1`] == `0` && inptr[DCTSIZE * `2`] == `0` &&
204	inptr[DCTSIZE * `3`] == `0` && inptr[DCTSIZE * `4`] == `0` &&
205	inptr[DCTSIZE * `5`] == `0` && inptr[DCTSIZE * `6`] == `0` &&
206	inptr[DCTSIZE * `7`] == `0`) {
207	/ AC terms all zero /
208	int dcval = (int)DEQUANTIZE(inptr[DCTSIZE * `0`], quantptr[DCTSIZE * `0`]);
209
210	wsptr[DCTSIZE * `0`] = dcval;
211	wsptr[DCTSIZE * `1`] = dcval;
212	wsptr[DCTSIZE * `2`] = dcval;
213	wsptr[DCTSIZE * `3`] = dcval;
214	wsptr[DCTSIZE * `4`] = dcval;
215	wsptr[DCTSIZE * `5`] = dcval;
216	wsptr[DCTSIZE * `6`] = dcval;
217	wsptr[DCTSIZE * `7`] = dcval;
218
219	inptr++; / advance pointers to next column /
220	quantptr++;
221	wsptr++;
222	continue;
223	}
224
225	/ Even part /
226
227	tmp0 = DEQUANTIZE(inptr[DCTSIZE * `0`], quantptr[DCTSIZE * `0`]);
228	tmp1 = DEQUANTIZE(inptr[DCTSIZE * `2`], quantptr[DCTSIZE * `2`]);
229	tmp2 = DEQUANTIZE(inptr[DCTSIZE * `4`], quantptr[DCTSIZE * `4`]);
230	tmp3 = DEQUANTIZE(inptr[DCTSIZE * `6`], quantptr[DCTSIZE * `6`]);
231
232	tmp10 = tmp0 + tmp2; / phase 3 /
233	tmp11 = tmp0 - tmp2;
234
235	tmp13 = tmp1 + tmp3; / phases 5-3 /
236	tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; / 2c4 /*
237
238	tmp0 = tmp10 + tmp13; / phase 2 /
239	tmp3 = tmp10 - tmp13;
240	tmp1 = tmp11 + tmp12;
241	tmp2 = tmp11 - tmp12;
242
243	/ Odd part /
244
245	tmp4 = DEQUANTIZE(inptr[DCTSIZE * `1`], quantptr[DCTSIZE * `1`]);
246	tmp5 = DEQUANTIZE(inptr[DCTSIZE * `3`], quantptr[DCTSIZE * `3`]);
247	tmp6 = DEQUANTIZE(inptr[DCTSIZE * `5`], quantptr[DCTSIZE * `5`]);
248	tmp7 = DEQUANTIZE(inptr[DCTSIZE * `7`], quantptr[DCTSIZE * `7`]);
249
250	z13 = tmp6 + tmp5; / phase 6 /
251	z10 = tmp6 - tmp5;
252	z11 = tmp4 + tmp7;
253	z12 = tmp4 - tmp7;
254
255	tmp7 = z11 + z13; / phase 5 /
256	tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); / 2c4 /*
257
258	z5 = MULTIPLY(z10 + z12, FIX_1_847759065); / 2c2 /*
259	tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; / 2(c2-c6) /*
260	tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; / -2(c2+c6) /*
261
262	tmp6 = tmp12 - tmp7; / phase 2 /
263	tmp5 = tmp11 - tmp6;
264	tmp4 = tmp10 + tmp5;
265
266	wsptr[DCTSIZE * `0`] = (int)(tmp0 + tmp7);
267	wsptr[DCTSIZE * `7`] = (int)(tmp0 - tmp7);
268	wsptr[DCTSIZE * `1`] = (int)(tmp1 + tmp6);
269	wsptr[DCTSIZE * `6`] = (int)(tmp1 - tmp6);
270	wsptr[DCTSIZE * `2`] = (int)(tmp2 + tmp5);
271	wsptr[DCTSIZE * `5`] = (int)(tmp2 - tmp5);
272	wsptr[DCTSIZE * `4`] = (int)(tmp3 + tmp4);
273	wsptr[DCTSIZE * `3`] = (int)(tmp3 - tmp4);
274
275	inptr++; / advance pointers to next column /
276	quantptr++;
277	wsptr++;
278	}
279
280	/ Pass 2: process rows from work array, store into output array. /
281	/ Note that we must descale the results by a factor of 8 == 2*3, /*
282	/ and also undo the PASS1_BITS scaling. /
283
284	wsptr = workspace;
285	for (ctr = `0`; ctr < DCTSIZE; ctr++) {
286	outptr = output_buf[ctr] + output_col;
287	/ Rows of zeroes can be exploited in the same way as we did with columns.*
288	* However, the column calculation has created many nonzero AC terms, so
289	* the simplification applies less often (typically 5% to 10% of the time).
290	* On machines with very fast multiplication, it's possible that the
291	* test takes more time than it's worth. In that case this section
292	* may be commented out.
293	*/
294
295	#ifndef NO_ZERO_ROW_TEST
296	if (wsptr[`1`] == `0` && wsptr[`2`] == `0` && wsptr[`3`] == `0` && wsptr[`4`] == `0` &&
297	wsptr[`5`] == `0` && wsptr[`6`] == `0` && wsptr[`7`] == `0`) {
298	/ AC terms all zero /
299	JSAMPLE dcval =
300	range_limit[IDESCALE(wsptr[`0`], PASS1_BITS + `3`) & RANGE_MASK];
301
302	outptr[`0`] = dcval;
303	outptr[`1`] = dcval;
304	outptr[`2`] = dcval;
305	outptr[`3`] = dcval;
306	outptr[`4`] = dcval;
307	outptr[`5`] = dcval;
308	outptr[`6`] = dcval;
309	outptr[`7`] = dcval;
310
311	wsptr += DCTSIZE; / advance pointer to next row /
312	continue;
313	}
314	#endif
315
316	/ Even part /
317
318	tmp10 = ((DCTELEM)wsptr[`0`] + (DCTELEM)wsptr[`4`]);
319	tmp11 = ((DCTELEM)wsptr[`0`] - (DCTELEM)wsptr[`4`]);
320
321	tmp13 = ((DCTELEM)wsptr[`2`] + (DCTELEM)wsptr[`6`]);
322	tmp12 =
323	MULTIPLY((DCTELEM)wsptr[`2`] - (DCTELEM)wsptr[`6`], FIX_1_414213562) - tmp13;
324
325	tmp0 = tmp10 + tmp13;
326	tmp3 = tmp10 - tmp13;
327	tmp1 = tmp11 + tmp12;
328	tmp2 = tmp11 - tmp12;
329
330	/ Odd part /
331
332	z13 = (DCTELEM)wsptr[`5`] + (DCTELEM)wsptr[`3`];
333	z10 = (DCTELEM)wsptr[`5`] - (DCTELEM)wsptr[`3`];
334	z11 = (DCTELEM)wsptr[`1`] + (DCTELEM)wsptr[`7`];
335	z12 = (DCTELEM)wsptr[`1`] - (DCTELEM)wsptr[`7`];
336
337	tmp7 = z11 + z13; / phase 5 /
338	tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); / 2c4 /*
339
340	z5 = MULTIPLY(z10 + z12, FIX_1_847759065); / 2c2 /*
341	tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; / 2(c2-c6) /*
342	tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; / -2(c2+c6) /*
343
344	tmp6 = tmp12 - tmp7; / phase 2 /
345	tmp5 = tmp11 - tmp6;
346	tmp4 = tmp10 + tmp5;
347
348	/ Final output stage: scale down by a factor of 8 and range-limit /
349
350	outptr[`0`] =
351	range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS + `3`) & RANGE_MASK];
352	outptr[`7`] =
353	range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS + `3`) & RANGE_MASK];
354	outptr[`1`] =
355	range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS + `3`) & RANGE_MASK];
356	outptr[`6`] =
357	range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS + `3`) & RANGE_MASK];
358	outptr[`2`] =
359	range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS + `3`) & RANGE_MASK];
360	outptr[`5`] =
361	range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS + `3`) & RANGE_MASK];
362	outptr[`4`] =
363	range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS + `3`) & RANGE_MASK];
364	outptr[`3`] =
365	range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS + `3`) & RANGE_MASK];
366
367	wsptr += DCTSIZE; / advance pointer to next row /
368	}
369	}
370
371	#endif /* DCT_IFAST_SUPPORTED */
372

Browse the source code of tensorflow/external/libjpeg_turbo/jidctfst.c