2 * FreeRDP: A Remote Desktop Protocol client.
3 * RemoteFX Codec Library - SSE2 Optimizations
5 * Copyright 2011 Stephen Erisman
6 * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
24 #include <xmmintrin.h>
25 #include <emmintrin.h>
27 #include "rfx_types.h"
/* Compatibility shim: expand GCC-style __attribute__((...)) to nothing.
 * NOTE(review): presumably wrapped in a non-GCC compiler guard in the full
 * file (this excerpt omits surrounding preprocessor lines) — confirm. */
#define __attribute__(...)
/* x86 cache-line size in bytes; used as the stride for software prefetches. */
#define CACHE_LINE_BYTES 64
/* Clamp every signed 16-bit lane of _val into [_min, _max] (all __m128i). */
#define _mm_between_epi16(_val, _min, _max) \
do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
39 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40 _mm_prefetch_buffer(char * buffer, int num_bytes)
42 __m128i * buf = (__m128i*) buffer;
44 for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
46 _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
50 static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
52 __m128i zero = _mm_setzero_si128();
53 __m128i max = _mm_set1_epi16(255);
55 __m128i* y_r_buf = (__m128i*) y_r_buffer;
56 __m128i* cb_g_buf = (__m128i*) cb_g_buffer;
57 __m128i* cr_b_buf = (__m128i*) cr_b_buffer;
68 __m128i r_cr = _mm_set1_epi16(22986); // 1.403 << 14
69 __m128i g_cb = _mm_set1_epi16(-5636); // -0.344 << 14
70 __m128i g_cr = _mm_set1_epi16(-11698); // -0.714 << 14
71 __m128i b_cb = _mm_set1_epi16(28999); // 1.770 << 14
72 __m128i c4096 = _mm_set1_epi16(4096);
74 for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
76 _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
77 _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
78 _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
80 for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
83 In order to use SSE2 signed 16-bit integer multiplication we need to convert
84 the floating point factors to signed int without loosing information.
85 The result of this multiplication is 32 bit and we have two SSE instructions
86 that return either the hi or lo word.
87 Thus we will multiply the factors by the highest possible 2^n, take the
88 upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this
89 result by multiplying it by 2^(16-n).
90 For the given factors in the conversion matrix the best possible n is 14.
92 Example for calculating r:
93 r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
94 r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
95 r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
96 r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
99 /* y = (y_r_buf[i] + 4096) >> 2 */
100 y = _mm_load_si128(&y_r_buf[i]);
101 y = _mm_add_epi16(y, c4096);
102 y = _mm_srai_epi16(y, 2);
103 /* cb = cb_g_buf[i]; */
104 cb = _mm_load_si128(&cb_g_buf[i]);
105 /* cr = cr_b_buf[i]; */
106 cr = _mm_load_si128(&cr_b_buf[i]);
108 /* (y + HIWORD(cr*22986)) >> 3 */
109 r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
110 r = _mm_srai_epi16(r, 3);
111 /* y_r_buf[i] = MINMAX(r, 0, 255); */
112 _mm_between_epi16(r, zero, max);
113 _mm_store_si128(&y_r_buf[i], r);
115 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
116 g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
117 g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
118 g = _mm_srai_epi16(g, 3);
119 /* cb_g_buf[i] = MINMAX(g, 0, 255); */
120 _mm_between_epi16(g, zero, max);
121 _mm_store_si128(&cb_g_buf[i], g);
123 /* (y + HIWORD(cb*28999)) >> 3 */
124 b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
125 b = _mm_srai_epi16(b, 3);
126 /* cr_b_buf[i] = MINMAX(b, 0, 255); */
127 _mm_between_epi16(b, zero, max);
128 _mm_store_si128(&cr_b_buf[i], b);
132 /* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
133 static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
135 __m128i min = _mm_set1_epi16(-128 << 5);
136 __m128i max = _mm_set1_epi16(127 << 5);
138 __m128i* y_r_buf = (__m128i*) y_r_buffer;
139 __m128i* cb_g_buf = (__m128i*) cb_g_buffer;
140 __m128i* cr_b_buf = (__m128i*) cr_b_buffer;
149 __m128i y_r = _mm_set1_epi16(9798); // 0.299000 << 15
150 __m128i y_g = _mm_set1_epi16(19235); // 0.587000 << 15
151 __m128i y_b = _mm_set1_epi16(3735); // 0.114000 << 15
152 __m128i cb_r = _mm_set1_epi16(-5535); // -0.168935 << 15
153 __m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
154 __m128i cb_b = _mm_set1_epi16(16403); // 0.500590 << 15
155 __m128i cr_r = _mm_set1_epi16(16377); // 0.499813 << 15
156 __m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
157 __m128i cr_b = _mm_set1_epi16(-2663); // -0.081282 << 15
161 for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
163 _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
164 _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
165 _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
167 for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
170 In order to use SSE2 signed 16-bit integer multiplication we need to convert
171 the floating point factors to signed int without loosing information.
172 The result of this multiplication is 32 bit and using SSE2 we get either the
173 product's hi or lo word.
174 Thus we will multiply the factors by the highest possible 2^n and take the
175 upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
176 Since the final result needs to be scaled by << 5 and also in in order to keep
177 the precision within the upper 16 bits we will also have to scale the RGB
178 values used in the multiplication by << 5+(16-n).
181 /* r = y_r_buf[i]; */
182 r = _mm_load_si128(&y_r_buf[i]);
184 /* g = cb_g_buf[i]; */
185 g = _mm_load_si128(&cb_g_buf[i]);
187 /* b = cr_b_buf[i]; */
188 b = _mm_load_si128(&cr_b_buf[i]);
190 /* r<<6; g<<6; b<<6 */
191 r = _mm_slli_epi16(r, 6);
192 g = _mm_slli_epi16(g, 6);
193 b = _mm_slli_epi16(b, 6);
195 /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
196 y = _mm_mulhi_epi16(r, y_r);
197 y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
198 y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
199 y = _mm_add_epi16(y, min);
200 /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
201 _mm_between_epi16(y, min, max);
202 _mm_store_si128(&y_r_buf[i], y);
204 /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
205 cb = _mm_mulhi_epi16(r, cb_r);
206 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
207 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
208 /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
209 _mm_between_epi16(cb, min, max);
210 _mm_store_si128(&cb_g_buf[i], cb);
212 /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
213 cr = _mm_mulhi_epi16(r, cr_r);
214 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
215 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
216 /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
217 _mm_between_epi16(cr, min, max);
218 _mm_store_si128(&cr_b_buf[i], cr);
222 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223 rfx_quantization_decode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
226 __m128i * ptr = (__m128i*) buffer;
227 __m128i * buf_end = (__m128i*) (buffer + buffer_size);
234 a = _mm_load_si128(ptr);
235 a = _mm_slli_epi16(a, factor);
236 _mm_store_si128(ptr, a);
239 } while(ptr < buf_end);
242 static void rfx_quantization_decode_sse2(sint16* buffer, const uint32* quantization_values)
244 _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
246 rfx_quantization_decode_block_sse2(buffer, 4096, 5);
248 rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
249 rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
250 rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
251 rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
252 rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
253 rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
254 rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
255 rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
256 rfx_quantization_decode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
257 rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
260 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
261 rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
264 __m128i* ptr = (__m128i*) buffer;
265 __m128i* buf_end = (__m128i*) (buffer + buffer_size);
271 half = _mm_set1_epi16(1 << (factor - 1));
274 a = _mm_load_si128(ptr);
275 a = _mm_add_epi16(a, half);
276 a = _mm_srai_epi16(a, factor);
277 _mm_store_si128(ptr, a);
280 } while(ptr < buf_end);
283 static void rfx_quantization_encode_sse2(sint16* buffer, const uint32* quantization_values)
285 _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
287 rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
288 rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
289 rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
290 rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
291 rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
292 rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
293 rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
294 rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
295 rfx_quantization_encode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
296 rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
298 rfx_quantization_encode_block_sse2(buffer, 4096, 5);
/*
 * Inverse DWT along rows: reconstructs one row pair from the low-pass band
 * l and high-pass band h into 2*subband_width interleaved output samples in
 * dst, 8 coefficients per SSE2 step.
 * NOTE(review): this excerpt omits brace lines, local declarations
 * (l_ptr/h_ptr, loop indices, __m128i temporaries) and pointer-advance
 * statements present in the full file — confirm against upstream.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
sint16* dst_ptr = dst;
for (y = 0; y < subband_width; y++)
/* Even coefficients */
for (n = 0; n < subband_width; n+=8)
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
l_n = _mm_load_si128((__m128i*) l_ptr);
h_n = _mm_load_si128((__m128i*) h_ptr);
/* unaligned load of h shifted one sample left, i.e. h[n-1..n+6] */
h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
/* left boundary: no h[-1] exists, so mirror h[0] into lane 0
 * (presumably guarded by `if (n == 0)` in the full file — confirm) */
first = _mm_extract_epi16(h_n_m, 1);
h_n_m = _mm_insert_epi16(h_n_m, first, 0);
tmp_n = _mm_add_epi16(h_n, h_n_m);
tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1)); /* +1 for rounding */
tmp_n = _mm_srai_epi16(tmp_n, 1);
dst_n = _mm_sub_epi16(l_n, tmp_n);
/* even results are staged in the l buffer; interleaved into dst below */
_mm_store_si128((__m128i*) l_ptr, dst_n);
/* rewind to the start of the row for the odd pass */
l_ptr -= subband_width;
h_ptr -= subband_width;
/* Odd coefficients */
for (n = 0; n < subband_width; n+=8)
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
h_n = _mm_load_si128((__m128i*) h_ptr);
h_n = _mm_slli_epi16(h_n, 1);
/* dst[2n] values staged in l during the even pass */
dst_n = _mm_load_si128((__m128i*) (l_ptr));
dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
if (n == subband_width - 8)
/* right boundary: no dst[2n+2] beyond the row, duplicate the last value */
last = _mm_extract_epi16(dst_n_p, 6);
dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
tmp_n = _mm_add_epi16(dst_n_p, dst_n);
tmp_n = _mm_srai_epi16(tmp_n, 1);
tmp_n = _mm_add_epi16(tmp_n, h_n);
/* interleave even (dst_n) and odd (tmp_n) samples into output order */
dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
_mm_store_si128((__m128i*) dst_ptr, dst1);
_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
/*
 * Inverse DWT along columns: combines low-pass band l and high-pass band h
 * into dst (rows of total_width = 2*subband_width samples), processing 8
 * columns per SSE2 step.
 * NOTE(review): this excerpt omits brace lines, local declarations
 * (l_ptr/h_ptr, loop indices, __m128i temporaries), boundary-case branches
 * and pointer-advance statements present in the full file — confirm.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
sint16* dst_ptr = dst;
int total_width = subband_width + subband_width;
/* Even coefficients */
for (n = 0; n < subband_width; n++)
for (x = 0; x < total_width; x+=8)
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
l_n = _mm_load_si128((__m128i*) l_ptr);
h_n = _mm_load_si128((__m128i*) h_ptr);
/* seed tmp_n with h[n] + 1 (rounding term) */
tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
/* top boundary (n == 0, presumably): reuse h[n] in place of h[n-1] */
tmp_n = _mm_add_epi16(tmp_n, h_n);
/* interior rows: add h[n-1] from one row (total_width samples) above */
h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
tmp_n = _mm_add_epi16(tmp_n, h_n_m);
tmp_n = _mm_srai_epi16(tmp_n, 1);
dst_n = _mm_sub_epi16(l_n, tmp_n);
_mm_store_si128((__m128i*) dst_ptr, dst_n);
/* advance to the next even output row */
dst_ptr+=total_width;
/* odd rows start one row below the first even row */
dst_ptr = dst + total_width;
/* Odd coefficients */
for (n = 0; n < subband_width; n++)
for (x = 0; x < total_width; x+=8)
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
h_n = _mm_load_si128((__m128i*) h_ptr);
dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
h_n = _mm_slli_epi16(h_n, 1);
/* bottom boundary: no dst[2n+2] row, reuse dst[2n] instead */
if (n == subband_width - 1)
tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
/* interior rows: add dst[2n+2] from one row below */
dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
tmp_n = _mm_srai_epi16(tmp_n, 1);
dst_n = _mm_add_epi16(tmp_n, h_n);
_mm_store_si128((__m128i*) dst_ptr, dst_n);
/* advance to the next odd output row */
dst_ptr+=total_width;
/*
 * One inverse 2D DWT level: horizontal pass from the 4 sub-bands in buffer
 * into the temporary idwt buffer, then vertical pass back into buffer.
 * NOTE(review): this excerpt omits brace lines and the assignments
 * `hl = buffer;` and `l_dst = idwt;` present in the full file — confirm.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_sse2(sint16* buffer, sint16* idwt, int subband_width)
sint16 *hl, *lh, *hh, *ll;
sint16 *l_dst, *h_dst;
_mm_prefetch_buffer((char*) idwt, subband_width * 4 * sizeof(sint16));
/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
/* The lower part L uses LL(3) and HL(0). */
/* The higher part H uses LH(1) and HH(2). */
ll = buffer + subband_width * subband_width * 3;
rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
lh = buffer + subband_width * subband_width;
hh = buffer + subband_width * subband_width * 2;
h_dst = idwt + subband_width * subband_width * 2;
rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
/* Inverse DWT in vertical direction, results are stored in original buffer. */
rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
495 static void rfx_dwt_2d_decode_sse2(sint16* buffer, sint16* dwt_buffer)
497 _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
499 rfx_dwt_2d_decode_block_sse2(buffer + 3840, dwt_buffer, 8);
500 rfx_dwt_2d_decode_block_sse2(buffer + 3072, dwt_buffer, 16);
501 rfx_dwt_2d_decode_block_sse2(buffer, dwt_buffer, 32);
/*
 * Forward DWT along columns: splits src (rows of total_width samples) into
 * low-pass band l and high-pass band h, 8 columns per SSE2 step.
 * NOTE(review): this excerpt omits brace lines, local declarations, the
 * `else` branches for the boundary cases and pointer-advance statements
 * present in the full file — confirm against upstream.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_encode_block_vert_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
total_width = subband_width << 1;
for (n = 0; n < subband_width; n++)
for (x = 0; x < total_width; x += 8)
/* src[2n], src[2n+1] and src[2n+2] are one output row (total_width) apart */
src_2n = _mm_load_si128((__m128i*) src);
src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
/* bottom boundary presumably reuses src[2n] when n == subband_width - 1 */
if (n < subband_width - 1)
src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
h_n = _mm_add_epi16(src_2n, src_2n_2);
h_n = _mm_srai_epi16(h_n, 1);
h_n = _mm_sub_epi16(src_2n_1, h_n);
h_n = _mm_srai_epi16(h_n, 1);
_mm_store_si128((__m128i*) h, h_n);
/* h[n-1] is one band row above (top boundary handled in omitted lines) */
h_n_m = _mm_load_si128((__m128i*) (h - total_width));
/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
l_n = _mm_add_epi16(h_n_m, h_n);
l_n = _mm_srai_epi16(l_n, 1);
l_n = _mm_add_epi16(l_n, src_2n);
_mm_store_si128((__m128i*) l, l_n);
/*
 * Forward DWT along rows: splits src into low-pass band l and high-pass
 * band h, 8 output coefficients (16 input samples) per step. The
 * even/odd input samples are de-interleaved with _mm_set_epi16.
 * NOTE(review): this excerpt omits brace lines, local declarations, the
 * `if (n == 0)` guard around the boundary fix-up and pointer-advance
 * statements present in the full file — confirm against upstream.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_encode_block_horiz_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
for (y = 0; y < subband_width; y++)
for (n = 0; n < subband_width; n += 8)
/* The following 3 Set operations consumes more than half of the total DWT processing time! */
src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
/* right boundary: src[2n+2] past the row end is replaced by src[2n] */
src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
h_n = _mm_add_epi16(src_2n, src_2n_2);
h_n = _mm_srai_epi16(h_n, 1);
h_n = _mm_sub_epi16(src_2n_1, h_n);
h_n = _mm_srai_epi16(h_n, 1);
_mm_store_si128((__m128i*) h, h_n);
/* unaligned load of h shifted one sample left, i.e. h[n-1..n+6] */
h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
/* left boundary: no h[-1], mirror h[0] into lane 0 */
first = _mm_extract_epi16(h_n_m, 1);
h_n_m = _mm_insert_epi16(h_n_m, first, 0);
/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
l_n = _mm_add_epi16(h_n_m, h_n);
l_n = _mm_srai_epi16(l_n, 1);
l_n = _mm_add_epi16(l_n, src_2n);
_mm_store_si128((__m128i*) l, l_n);
/*
 * One forward 2D DWT level: vertical pass from buffer into the temporary
 * dwt buffer (L, H halves), then horizontal pass back into buffer as the
 * 4 sub-bands.
 * NOTE(review): this excerpt omits brace lines and the assignments
 * `l_src = dwt;` and `hl = buffer;` present in the full file — confirm.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_encode_block_sse2(sint16* buffer, sint16* dwt, int subband_width)
sint16 *hl, *lh, *hh, *ll;
sint16 *l_src, *h_src;
_mm_prefetch_buffer((char*) dwt, subband_width * 4 * sizeof(sint16));
/* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
h_src = dwt + subband_width * subband_width * 2;
rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
/* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
/* The lower part L generates LL(3) and HL(0). */
/* The higher part H generates LH(1) and HH(2). */
ll = buffer + subband_width * subband_width * 3;
lh = buffer + subband_width * subband_width;
hh = buffer + subband_width * subband_width * 2;
rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
643 static void rfx_dwt_2d_encode_sse2(sint16* buffer, sint16* dwt_buffer)
645 _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
647 rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
648 rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
649 rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
652 void rfx_init_sse2(RFX_CONTEXT* context)
654 DEBUG_RFX("Using SSE2 optimizations");
656 IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
657 IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
658 IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
659 IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
660 IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
661 IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");
663 context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;
664 context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;
665 context->quantization_decode = rfx_quantization_decode_sse2;
666 context->quantization_encode = rfx_quantization_encode_sse2;
667 context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
668 context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;