Initial commit - from Precise source
[freerdp-ubuntu-pcb-backport.git] / libfreerdp-codec / rfx_sse2.c
1 /**
2  * FreeRDP: A Remote Desktop Protocol client.
3  * RemoteFX Codec Library - SSE2 Optimizations
4  *
5  * Copyright 2011 Stephen Erisman
6  * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *     http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <xmmintrin.h>
25 #include <emmintrin.h>
26
27 #include "rfx_types.h"
28 #include "rfx_sse2.h"
29
#ifdef _MSC_VER
/* MSVC has no GCC-style attributes: make every __attribute__((...)) expand to nothing. */
#define __attribute__(...)
#endif

/* Distance in bytes between two consecutive prefetch hints. */
#define CACHE_LINE_BYTES        64

/*
 * Clamp every signed 16-bit lane of _val to the range [_min, _max] and write
 * the result back into _val.
 * _val must be a modifiable __m128i lvalue; it is evaluated twice, so it must
 * not have side effects.
 */
#define _mm_between_epi16(_val, _min, _max) \
        do { (_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); } while (0)
38
39 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40 _mm_prefetch_buffer(char * buffer, int num_bytes)
41 {
42         __m128i * buf = (__m128i*) buffer;
43         int i;
44         for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
45         {
46                 _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
47         }
48 }
49
50 static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
51 {       
52         __m128i zero = _mm_setzero_si128();
53         __m128i max = _mm_set1_epi16(255);
54
55         __m128i* y_r_buf = (__m128i*) y_r_buffer;
56         __m128i* cb_g_buf = (__m128i*) cb_g_buffer;
57         __m128i* cr_b_buf = (__m128i*) cr_b_buffer;
58
59         __m128i y;
60         __m128i cr;
61         __m128i cb;
62         __m128i r;
63         __m128i g;
64         __m128i b;
65
66         int i;
67
68         __m128i r_cr = _mm_set1_epi16(22986);   //  1.403 << 14
69         __m128i g_cb = _mm_set1_epi16(-5636);   // -0.344 << 14
70         __m128i g_cr = _mm_set1_epi16(-11698);  // -0.714 << 14
71         __m128i b_cb = _mm_set1_epi16(28999);   //  1.770 << 14
72         __m128i c4096 = _mm_set1_epi16(4096);
73
74         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
75         {
76                 _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
77                 _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
78                 _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
79         }
80         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
81         {
82                 /*
83                 In order to use SSE2 signed 16-bit integer multiplication we need to convert
84                 the floating point factors to signed int without loosing information.
85                 The result of this multiplication is 32 bit and we have two SSE instructions
86                 that return either the hi or lo word.
87                 Thus we will multiply the factors by the highest possible 2^n, take the 
88                 upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this
89                 result by multiplying it by 2^(16-n).
90                 For the given factors in the conversion matrix the best possible n is 14.
91
92                 Example for calculating r:
93                 r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
94                 r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
95                 r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
96                 r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
97                 */
98
99                 /* y = (y_r_buf[i] + 4096) >> 2 */
100                 y = _mm_load_si128(&y_r_buf[i]);
101                 y = _mm_add_epi16(y, c4096);
102                 y = _mm_srai_epi16(y, 2);
103                 /* cb = cb_g_buf[i]; */
104                 cb = _mm_load_si128(&cb_g_buf[i]);
105                 /* cr = cr_b_buf[i]; */
106                 cr = _mm_load_si128(&cr_b_buf[i]);
107
108                 /* (y + HIWORD(cr*22986)) >> 3 */
109                 r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
110                 r = _mm_srai_epi16(r, 3);
111                 /* y_r_buf[i] = MINMAX(r, 0, 255); */
112                 _mm_between_epi16(r, zero, max);
113                 _mm_store_si128(&y_r_buf[i], r);
114
115                 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
116                 g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
117                 g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
118                 g = _mm_srai_epi16(g, 3);
119                 /* cb_g_buf[i] = MINMAX(g, 0, 255); */
120                 _mm_between_epi16(g, zero, max);
121                 _mm_store_si128(&cb_g_buf[i], g);
122
123                 /* (y + HIWORD(cb*28999)) >> 3 */
124                 b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
125                 b = _mm_srai_epi16(b, 3);
126                 /* cr_b_buf[i] = MINMAX(b, 0, 255); */
127                 _mm_between_epi16(b, zero, max);
128                 _mm_store_si128(&cr_b_buf[i], b);
129         }
130 }
131
132 /* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
133 static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
134 {
135         __m128i min = _mm_set1_epi16(-128 << 5);
136         __m128i max = _mm_set1_epi16(127 << 5);
137
138         __m128i* y_r_buf = (__m128i*) y_r_buffer;
139         __m128i* cb_g_buf = (__m128i*) cb_g_buffer;
140         __m128i* cr_b_buf = (__m128i*) cr_b_buffer;
141
142         __m128i y;
143         __m128i cr;
144         __m128i cb;
145         __m128i r;
146         __m128i g;
147         __m128i b;
148
149         __m128i y_r  = _mm_set1_epi16(9798);   //  0.299000 << 15
150         __m128i y_g  = _mm_set1_epi16(19235);  //  0.587000 << 15
151         __m128i y_b  = _mm_set1_epi16(3735);   //  0.114000 << 15
152         __m128i cb_r = _mm_set1_epi16(-5535);  // -0.168935 << 15
153         __m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
154         __m128i cb_b = _mm_set1_epi16(16403);  //  0.500590 << 15
155         __m128i cr_r = _mm_set1_epi16(16377);  //  0.499813 << 15
156         __m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
157         __m128i cr_b = _mm_set1_epi16(-2663);  // -0.081282 << 15
158
159         int i;
160
161         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
162         {
163                 _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
164                 _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
165                 _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
166         }
167         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
168         {
169                 /*
170                 In order to use SSE2 signed 16-bit integer multiplication we need to convert
171                 the floating point factors to signed int without loosing information.
172                 The result of this multiplication is 32 bit and using SSE2 we get either the
173                 product's hi or lo word.
174                 Thus we will multiply the factors by the highest possible 2^n and take the
175                 upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
176                 Since the final result needs to be scaled by << 5 and also in in order to keep
177                 the precision within the upper 16 bits we will also have to scale the RGB
178                 values used in the multiplication by << 5+(16-n).
179                 */
180
181                 /* r = y_r_buf[i]; */
182                 r = _mm_load_si128(&y_r_buf[i]);
183
184                 /* g = cb_g_buf[i]; */
185                 g = _mm_load_si128(&cb_g_buf[i]);
186
187                 /* b = cr_b_buf[i]; */
188                 b = _mm_load_si128(&cr_b_buf[i]);
189
190                 /* r<<6; g<<6; b<<6 */
191                 r = _mm_slli_epi16(r, 6);
192                 g = _mm_slli_epi16(g, 6);
193                 b = _mm_slli_epi16(b, 6);
194
195                 /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
196                 y = _mm_mulhi_epi16(r, y_r);
197                 y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
198                 y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
199                 y = _mm_add_epi16(y, min);
200                 /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
201                 _mm_between_epi16(y, min, max);
202                 _mm_store_si128(&y_r_buf[i], y);
203
204                 /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
205                 cb = _mm_mulhi_epi16(r, cb_r);
206                 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
207                 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
208                 /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
209                 _mm_between_epi16(cb, min, max);
210                 _mm_store_si128(&cb_g_buf[i], cb);
211
212                 /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
213                 cr = _mm_mulhi_epi16(r, cr_r);
214                 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
215                 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
216                 /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
217                 _mm_between_epi16(cr, min, max);
218                 _mm_store_si128(&cr_b_buf[i], cr);
219         }
220 }
221
222 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223 rfx_quantization_decode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
224 {
225         __m128i a;
226         __m128i * ptr = (__m128i*) buffer;
227         __m128i * buf_end = (__m128i*) (buffer + buffer_size);
228
229         if (factor == 0)
230                 return;
231
232         do
233         {
234                 a = _mm_load_si128(ptr);
235                 a = _mm_slli_epi16(a, factor);
236                 _mm_store_si128(ptr, a);
237
238                 ptr++;
239         } while(ptr < buf_end);
240 }
241
242 static void rfx_quantization_decode_sse2(sint16* buffer, const uint32* quantization_values)
243 {
244         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
245
246         rfx_quantization_decode_block_sse2(buffer, 4096, 5);
247
248         rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
249         rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
250         rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
251         rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
252         rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
253         rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
254         rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
255         rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
256         rfx_quantization_decode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
257         rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
258 }
259
260 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
261 rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
262 {
263         __m128i a;
264         __m128i* ptr = (__m128i*) buffer;
265         __m128i* buf_end = (__m128i*) (buffer + buffer_size);
266         __m128i half;
267
268         if (factor == 0)
269                 return;
270
271         half = _mm_set1_epi16(1 << (factor - 1));
272         do
273         {
274                 a = _mm_load_si128(ptr);
275                 a = _mm_add_epi16(a, half);
276                 a = _mm_srai_epi16(a, factor);
277                 _mm_store_si128(ptr, a);
278
279                 ptr++;
280         } while(ptr < buf_end);
281 }
282
283 static void rfx_quantization_encode_sse2(sint16* buffer, const uint32* quantization_values)
284 {
285         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
286
287         rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
288         rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
289         rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
290         rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
291         rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
292         rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
293         rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
294         rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
295         rfx_quantization_encode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
296         rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
297
298         rfx_quantization_encode_block_sse2(buffer, 4096, 5);
299 }
300
301 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302 rfx_dwt_2d_decode_block_horiz_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
303 {
304         int y, n;
305         sint16* l_ptr = l;
306         sint16* h_ptr = h;
307         sint16* dst_ptr = dst;
308         int first;
309         int last;
310         __m128i l_n;
311         __m128i h_n;
312         __m128i h_n_m;
313         __m128i tmp_n;
314         __m128i dst_n;
315         __m128i dst_n_p;
316         __m128i dst1;
317         __m128i dst2;
318
319         for (y = 0; y < subband_width; y++)
320         {
321                 /* Even coefficients */
322                 for (n = 0; n < subband_width; n+=8)
323                 {
324                         /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
325                         
326                         l_n = _mm_load_si128((__m128i*) l_ptr);
327
328                         h_n = _mm_load_si128((__m128i*) h_ptr);
329                         h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
330                         if (n == 0)
331                         {
332                                 first = _mm_extract_epi16(h_n_m, 1);
333                                 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
334                         }
335                         
336                         tmp_n = _mm_add_epi16(h_n, h_n_m);
337                         tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
338                         tmp_n = _mm_srai_epi16(tmp_n, 1);
339                         
340                         dst_n = _mm_sub_epi16(l_n, tmp_n);
341                         
342                         _mm_store_si128((__m128i*) l_ptr, dst_n);
343                         
344                         l_ptr+=8;
345                         h_ptr+=8;
346                 }
347                 l_ptr -= subband_width;
348                 h_ptr -= subband_width;
349                 
350                 /* Odd coefficients */
351                 for (n = 0; n < subband_width; n+=8)
352                 {
353                         /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
354                         
355                         h_n = _mm_load_si128((__m128i*) h_ptr);
356                         
357                         h_n = _mm_slli_epi16(h_n, 1);
358                         
359                         dst_n = _mm_load_si128((__m128i*) (l_ptr));
360                         dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
361                         if (n == subband_width - 8)
362                         {
363                                 last = _mm_extract_epi16(dst_n_p, 6);
364                                 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
365                         }
366                         
367                         tmp_n = _mm_add_epi16(dst_n_p, dst_n);
368                         tmp_n = _mm_srai_epi16(tmp_n, 1);
369                         
370                         tmp_n = _mm_add_epi16(tmp_n, h_n);
371                         
372                         dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
373                         dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
374                         
375                         _mm_store_si128((__m128i*) dst_ptr, dst1);
376                         _mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
377                         
378                         l_ptr+=8;
379                         h_ptr+=8;
380                         dst_ptr+=16;
381                 }
382         }
383 }
384
385 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 rfx_dwt_2d_decode_block_vert_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
387 {
388         int x, n;
389         sint16* l_ptr = l;
390         sint16* h_ptr = h;
391         sint16* dst_ptr = dst;
392         __m128i l_n;
393         __m128i h_n;
394         __m128i tmp_n;
395         __m128i h_n_m;
396         __m128i dst_n;
397         __m128i dst_n_m;
398         __m128i dst_n_p;
399         
400         int total_width = subband_width + subband_width;
401
402         /* Even coefficients */
403         for (n = 0; n < subband_width; n++)
404         {
405                 for (x = 0; x < total_width; x+=8)
406                 {
407                         /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
408                         
409                         l_n = _mm_load_si128((__m128i*) l_ptr);
410                         h_n = _mm_load_si128((__m128i*) h_ptr);
411                         
412                         tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
413                         if (n == 0)
414                                 tmp_n = _mm_add_epi16(tmp_n, h_n);
415                         else
416                         {
417                                 h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
418                                 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
419                         }
420                         tmp_n = _mm_srai_epi16(tmp_n, 1);
421                         
422                         dst_n = _mm_sub_epi16(l_n, tmp_n);
423                         _mm_store_si128((__m128i*) dst_ptr, dst_n);
424                         
425                         l_ptr+=8;
426                         h_ptr+=8;
427                         dst_ptr+=8;
428                 }
429                 dst_ptr+=total_width;
430         }
431         
432         h_ptr = h;
433         dst_ptr = dst + total_width;
434         
435         /* Odd coefficients */
436         for (n = 0; n < subband_width; n++)
437         {
438                 for (x = 0; x < total_width; x+=8)
439                 {
440                         /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
441                         
442                         h_n = _mm_load_si128((__m128i*) h_ptr);
443                         dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
444                         h_n = _mm_slli_epi16(h_n, 1);
445                         
446                         tmp_n = dst_n_m;
447                         if (n == subband_width - 1)
448                                 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
449                         else
450                         {
451                                 dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
452                                 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
453                         }
454                         tmp_n = _mm_srai_epi16(tmp_n, 1);
455                         
456                         dst_n = _mm_add_epi16(tmp_n, h_n);
457                         _mm_store_si128((__m128i*) dst_ptr, dst_n);
458
459                         h_ptr+=8;
460                         dst_ptr+=8;
461                 }
462                 dst_ptr+=total_width;
463         }
464 }
465
466 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
467 rfx_dwt_2d_decode_block_sse2(sint16* buffer, sint16* idwt, int subband_width)
468 {
469         sint16 *hl, *lh, *hh, *ll;
470         sint16 *l_dst, *h_dst;
471
472         _mm_prefetch_buffer((char*) idwt, subband_width * 4 * sizeof(sint16));
473
474         /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
475         /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
476         /* The lower part L uses LL(3) and HL(0). */
477         /* The higher part H uses LH(1) and HH(2). */
478
479         ll = buffer + subband_width * subband_width * 3;
480         hl = buffer;
481         l_dst = idwt;
482
483         rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
484
485         lh = buffer + subband_width * subband_width;
486         hh = buffer + subband_width * subband_width * 2;
487         h_dst = idwt + subband_width * subband_width * 2;
488         
489         rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
490
491         /* Inverse DWT in vertical direction, results are stored in original buffer. */
492         rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
493 }
494
495 static void rfx_dwt_2d_decode_sse2(sint16* buffer, sint16* dwt_buffer)
496 {
497         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
498         
499         rfx_dwt_2d_decode_block_sse2(buffer + 3840, dwt_buffer, 8);
500         rfx_dwt_2d_decode_block_sse2(buffer + 3072, dwt_buffer, 16);
501         rfx_dwt_2d_decode_block_sse2(buffer, dwt_buffer, 32);
502 }
503
504 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505 rfx_dwt_2d_encode_block_vert_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
506 {
507         int total_width;
508         int x;
509         int n;
510         __m128i src_2n;
511         __m128i src_2n_1;
512         __m128i src_2n_2;
513         __m128i h_n;
514         __m128i h_n_m;
515         __m128i l_n;
516
517         total_width = subband_width << 1;
518
519         for (n = 0; n < subband_width; n++)
520         {
521                 for (x = 0; x < total_width; x += 8)
522                 {
523                         src_2n = _mm_load_si128((__m128i*) src);
524                         src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
525                         if (n < subband_width - 1)
526                                 src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
527                         else
528                                 src_2n_2 = src_2n;
529
530                         /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
531
532                         h_n = _mm_add_epi16(src_2n, src_2n_2);
533                         h_n = _mm_srai_epi16(h_n, 1);
534                         h_n = _mm_sub_epi16(src_2n_1, h_n);
535                         h_n = _mm_srai_epi16(h_n, 1);
536
537                         _mm_store_si128((__m128i*) h, h_n);
538
539                         if (n == 0)
540                                 h_n_m = h_n;
541                         else
542                                 h_n_m = _mm_load_si128((__m128i*) (h - total_width));
543
544                         /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
545
546                         l_n = _mm_add_epi16(h_n_m, h_n);
547                         l_n = _mm_srai_epi16(l_n, 1);
548                         l_n = _mm_add_epi16(l_n, src_2n);
549
550                         _mm_store_si128((__m128i*) l, l_n);
551
552                         src += 8;
553                         l += 8;
554                         h += 8;
555                 }
556                 src += total_width;
557         }
558 }
559
560 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
561 rfx_dwt_2d_encode_block_horiz_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
562 {
563         int y;
564         int n;
565         int first;
566         __m128i src_2n;
567         __m128i src_2n_1;
568         __m128i src_2n_2;
569         __m128i h_n;
570         __m128i h_n_m;
571         __m128i l_n;
572
573         for (y = 0; y < subband_width; y++)
574         {
575                 for (n = 0; n < subband_width; n += 8)
576                 {
577                         /* The following 3 Set operations consumes more than half of the total DWT processing time! */
578                         src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
579                         src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
580                         src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
581                                 src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
582
583                         /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
584
585                         h_n = _mm_add_epi16(src_2n, src_2n_2);
586                         h_n = _mm_srai_epi16(h_n, 1);
587                         h_n = _mm_sub_epi16(src_2n_1, h_n);
588                         h_n = _mm_srai_epi16(h_n, 1);
589
590                         _mm_store_si128((__m128i*) h, h_n);
591
592                         h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
593                         if (n == 0)
594                         {
595                                 first = _mm_extract_epi16(h_n_m, 1);
596                                 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
597                         }
598
599                         /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
600
601                         l_n = _mm_add_epi16(h_n_m, h_n);
602                         l_n = _mm_srai_epi16(l_n, 1);
603                         l_n = _mm_add_epi16(l_n, src_2n);
604
605                         _mm_store_si128((__m128i*) l, l_n);
606
607                         src += 16;
608                         l += 8;
609                         h += 8;
610                 }
611         }
612 }
613
614 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
615 rfx_dwt_2d_encode_block_sse2(sint16* buffer, sint16* dwt, int subband_width)
616 {
617         sint16 *hl, *lh, *hh, *ll;
618         sint16 *l_src, *h_src;
619
620         _mm_prefetch_buffer((char*) dwt, subband_width * 4 * sizeof(sint16));
621
622         /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
623
624         l_src = dwt;
625         h_src = dwt + subband_width * subband_width * 2;
626
627         rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
628
629         /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
630         /* The lower part L generates LL(3) and HL(0). */
631         /* The higher part H generates LH(1) and HH(2). */
632
633         ll = buffer + subband_width * subband_width * 3;
634         hl = buffer;
635
636         lh = buffer + subband_width * subband_width;
637         hh = buffer + subband_width * subband_width * 2;
638
639         rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
640         rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
641 }
642
643 static void rfx_dwt_2d_encode_sse2(sint16* buffer, sint16* dwt_buffer)
644 {
645         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
646         
647         rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
648         rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
649         rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
650 }
651
652 void rfx_init_sse2(RFX_CONTEXT* context)
653 {
654         DEBUG_RFX("Using SSE2 optimizations");
655
656         IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
657         IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
658         IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
659         IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
660         IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
661         IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");
662
663         context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;
664         context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;
665         context->quantization_decode = rfx_quantization_decode_sse2;
666         context->quantization_encode = rfx_quantization_encode_sse2;
667         context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
668         context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
669 }