libfreerdp-codec/rfx_sse2.c

   1 /**
   2  * FreeRDP: A Remote Desktop Protocol client.
   3  * RemoteFX Codec Library - SSE2 Optimizations
   4  *
   5  * Copyright 2011 Stephen Erisman
   6  * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20
  21 #include <stdio.h>
  22 #include <stdlib.h>
  23 #include <string.h>
  24 #include <xmmintrin.h>
  25 #include <emmintrin.h>
  26
  27 #include "rfx_types.h"
  28 #include "rfx_sse2.h"
  29
  30 #ifdef _MSC_VER
  31 #define __attribute__(...)
  32 #endif
  33
  34 #define CACHE_LINE_BYTES        64
  35
  36 #define _mm_between_epi16(_val, _min, _max) \
  37         do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
  38
  39 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  40 _mm_prefetch_buffer(char * buffer, int num_bytes)
  41 {
  42         __m128i * buf = (__m128i*) buffer;
  43         int i;
  44         for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
  45         {
  46                 _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
  47         }
  48 }
  49
  50 static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
  51 {
  52         __m128i zero = _mm_setzero_si128();
  53         __m128i max = _mm_set1_epi16(255);
  54
  55         __m128i* y_r_buf = (__m128i*) y_r_buffer;
  56         __m128i* cb_g_buf = (__m128i*) cb_g_buffer;
  57         __m128i* cr_b_buf = (__m128i*) cr_b_buffer;
  58
  59         __m128i y;
  60         __m128i cr;
  61         __m128i cb;
  62         __m128i r;
  63         __m128i g;
  64         __m128i b;
  65
  66         int i;
  67
  68         __m128i r_cr = _mm_set1_epi16(22986);   //  1.403 << 14
  69         __m128i g_cb = _mm_set1_epi16(-5636);   // -0.344 << 14
  70         __m128i g_cr = _mm_set1_epi16(-11698);  // -0.714 << 14
  71         __m128i b_cb = _mm_set1_epi16(28999);   //  1.770 << 14
  72         __m128i c4096 = _mm_set1_epi16(4096);
  73
  74         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
  75         {
  76                 _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
  77                 _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
  78                 _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
  79         }
  80         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
  81         {
  82                 /*
  83                 In order to use SSE2 signed 16-bit integer multiplication we need to convert
  84                 the floating point factors to signed int without loosing information.
  85                 The result of this multiplication is 32 bit and we have two SSE instructions
  86                 that return either the hi or lo word.
  87                 Thus we will multiply the factors by the highest possible 2^n, take the
  88                 upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this
  89                 result by multiplying it by 2^(16-n).
  90                 For the given factors in the conversion matrix the best possible n is 14.
  91
  92                 Example for calculating r:
  93                 r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
  94                 r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
  95                 r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
  96                 r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
  97                 */
  98
  99                 /* y = (y_r_buf[i] + 4096) >> 2 */
 100                 y = _mm_load_si128(&y_r_buf[i]);
 101                 y = _mm_add_epi16(y, c4096);
 102                 y = _mm_srai_epi16(y, 2);
 103                 /* cb = cb_g_buf[i]; */
 104                 cb = _mm_load_si128(&cb_g_buf[i]);
 105                 /* cr = cr_b_buf[i]; */
 106                 cr = _mm_load_si128(&cr_b_buf[i]);
 107
 108                 /* (y + HIWORD(cr*22986)) >> 3 */
 109                 r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
 110                 r = _mm_srai_epi16(r, 3);
 111                 /* y_r_buf[i] = MINMAX(r, 0, 255); */
 112                 _mm_between_epi16(r, zero, max);
 113                 _mm_store_si128(&y_r_buf[i], r);
 114
 115                 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
 116                 g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
 117                 g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
 118                 g = _mm_srai_epi16(g, 3);
 119                 /* cb_g_buf[i] = MINMAX(g, 0, 255); */
 120                 _mm_between_epi16(g, zero, max);
 121                 _mm_store_si128(&cb_g_buf[i], g);
 122
 123                 /* (y + HIWORD(cb*28999)) >> 3 */
 124                 b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
 125                 b = _mm_srai_epi16(b, 3);
 126                 /* cr_b_buf[i] = MINMAX(b, 0, 255); */
 127                 _mm_between_epi16(b, zero, max);
 128                 _mm_store_si128(&cr_b_buf[i], b);
 129         }
 130 }
 131
 132 /* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
 133 static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
 134 {
 135         __m128i min = _mm_set1_epi16(-128 << 5);
 136         __m128i max = _mm_set1_epi16(127 << 5);
 137
 138         __m128i* y_r_buf = (__m128i*) y_r_buffer;
 139         __m128i* cb_g_buf = (__m128i*) cb_g_buffer;
 140         __m128i* cr_b_buf = (__m128i*) cr_b_buffer;
 141
 142         __m128i y;
 143         __m128i cr;
 144         __m128i cb;
 145         __m128i r;
 146         __m128i g;
 147         __m128i b;
 148
 149         __m128i y_r  = _mm_set1_epi16(9798);   //  0.299000 << 15
 150         __m128i y_g  = _mm_set1_epi16(19235);  //  0.587000 << 15
 151         __m128i y_b  = _mm_set1_epi16(3735);   //  0.114000 << 15
 152         __m128i cb_r = _mm_set1_epi16(-5535);  // -0.168935 << 15
 153         __m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
 154         __m128i cb_b = _mm_set1_epi16(16403);  //  0.500590 << 15
 155         __m128i cr_r = _mm_set1_epi16(16377);  //  0.499813 << 15
 156         __m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
 157         __m128i cr_b = _mm_set1_epi16(-2663);  // -0.081282 << 15
 158
 159         int i;
 160
 161         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
 162         {
 163                 _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
 164                 _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
 165                 _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
 166         }
 167         for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
 168         {
 169                 /*
 170                 In order to use SSE2 signed 16-bit integer multiplication we need to convert
 171                 the floating point factors to signed int without loosing information.
 172                 The result of this multiplication is 32 bit and using SSE2 we get either the
 173                 product's hi or lo word.
 174                 Thus we will multiply the factors by the highest possible 2^n and take the
 175                 upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
 176                 Since the final result needs to be scaled by << 5 and also in in order to keep
 177                 the precision within the upper 16 bits we will also have to scale the RGB
 178                 values used in the multiplication by << 5+(16-n).
 179                 */
 180
 181                 /* r = y_r_buf[i]; */
 182                 r = _mm_load_si128(&y_r_buf[i]);
 183
 184                 /* g = cb_g_buf[i]; */
 185                 g = _mm_load_si128(&cb_g_buf[i]);
 186
 187                 /* b = cr_b_buf[i]; */
 188                 b = _mm_load_si128(&cr_b_buf[i]);
 189
 190                 /* r<<6; g<<6; b<<6 */
 191                 r = _mm_slli_epi16(r, 6);
 192                 g = _mm_slli_epi16(g, 6);
 193                 b = _mm_slli_epi16(b, 6);
 194
 195                 /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
 196                 y = _mm_mulhi_epi16(r, y_r);
 197                 y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
 198                 y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
 199                 y = _mm_add_epi16(y, min);
 200                 /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
 201                 _mm_between_epi16(y, min, max);
 202                 _mm_store_si128(&y_r_buf[i], y);
 203
 204                 /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
 205                 cb = _mm_mulhi_epi16(r, cb_r);
 206                 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
 207                 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
 208                 /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
 209                 _mm_between_epi16(cb, min, max);
 210                 _mm_store_si128(&cb_g_buf[i], cb);
 211
 212                 /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
 213                 cr = _mm_mulhi_epi16(r, cr_r);
 214                 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
 215                 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
 216                 /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
 217                 _mm_between_epi16(cr, min, max);
 218                 _mm_store_si128(&cr_b_buf[i], cr);
 219         }
 220 }
 221
 222 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 223 rfx_quantization_decode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
 224 {
 225         __m128i a;
 226         __m128i * ptr = (__m128i*) buffer;
 227         __m128i * buf_end = (__m128i*) (buffer + buffer_size);
 228
 229         if (factor == 0)
 230                 return;
 231
 232         do
 233         {
 234                 a = _mm_load_si128(ptr);
 235                 a = _mm_slli_epi16(a, factor);
 236                 _mm_store_si128(ptr, a);
 237
 238                 ptr++;
 239         } while(ptr < buf_end);
 240 }
 241
 242 static void rfx_quantization_decode_sse2(sint16* buffer, const uint32* quantization_values)
 243 {
 244         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
 245
 246         rfx_quantization_decode_block_sse2(buffer, 4096, 5);
 247
 248         rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
 249         rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
 250         rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
 251         rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
 252         rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
 253         rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
 254         rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
 255         rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
 256         rfx_quantization_decode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
 257         rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
 258 }
 259
 260 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 261 rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
 262 {
 263         __m128i a;
 264         __m128i* ptr = (__m128i*) buffer;
 265         __m128i* buf_end = (__m128i*) (buffer + buffer_size);
 266         __m128i half;
 267
 268         if (factor == 0)
 269                 return;
 270
 271         half = _mm_set1_epi16(1 << (factor - 1));
 272         do
 273         {
 274                 a = _mm_load_si128(ptr);
 275                 a = _mm_add_epi16(a, half);
 276                 a = _mm_srai_epi16(a, factor);
 277                 _mm_store_si128(ptr, a);
 278
 279                 ptr++;
 280         } while(ptr < buf_end);
 281 }
 282
 283 static void rfx_quantization_encode_sse2(sint16* buffer, const uint32* quantization_values)
 284 {
 285         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
 286
 287         rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
 288         rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
 289         rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
 290         rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
 291         rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
 292         rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
 293         rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
 294         rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
 295         rfx_quantization_encode_block_sse2(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
 296         rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
 297
 298         rfx_quantization_encode_block_sse2(buffer, 4096, 5);
 299 }
 300
 301 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 302 rfx_dwt_2d_decode_block_horiz_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
 303 {
 304         int y, n;
 305         sint16* l_ptr = l;
 306         sint16* h_ptr = h;
 307         sint16* dst_ptr = dst;
 308         int first;
 309         int last;
 310         __m128i l_n;
 311         __m128i h_n;
 312         __m128i h_n_m;
 313         __m128i tmp_n;
 314         __m128i dst_n;
 315         __m128i dst_n_p;
 316         __m128i dst1;
 317         __m128i dst2;
 318
 319         for (y = 0; y < subband_width; y++)
 320         {
 321                 /* Even coefficients */
 322                 for (n = 0; n < subband_width; n+=8)
 323                 {
 324                         /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
 325
 326                         l_n = _mm_load_si128((__m128i*) l_ptr);
 327
 328                         h_n = _mm_load_si128((__m128i*) h_ptr);
 329                         h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
 330                         if (n == 0)
 331                         {
 332                                 first = _mm_extract_epi16(h_n_m, 1);
 333                                 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
 334                         }
 335
 336                         tmp_n = _mm_add_epi16(h_n, h_n_m);
 337                         tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
 338                         tmp_n = _mm_srai_epi16(tmp_n, 1);
 339
 340                         dst_n = _mm_sub_epi16(l_n, tmp_n);
 341
 342                         _mm_store_si128((__m128i*) l_ptr, dst_n);
 343
 344                         l_ptr+=8;
 345                         h_ptr+=8;
 346                 }
 347                 l_ptr -= subband_width;
 348                 h_ptr -= subband_width;
 349
 350                 /* Odd coefficients */
 351                 for (n = 0; n < subband_width; n+=8)
 352                 {
 353                         /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
 354
 355                         h_n = _mm_load_si128((__m128i*) h_ptr);
 356
 357                         h_n = _mm_slli_epi16(h_n, 1);
 358
 359                         dst_n = _mm_load_si128((__m128i*) (l_ptr));
 360                         dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
 361                         if (n == subband_width - 8)
 362                         {
 363                                 last = _mm_extract_epi16(dst_n_p, 6);
 364                                 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
 365                         }
 366
 367                         tmp_n = _mm_add_epi16(dst_n_p, dst_n);
 368                         tmp_n = _mm_srai_epi16(tmp_n, 1);
 369
 370                         tmp_n = _mm_add_epi16(tmp_n, h_n);
 371
 372                         dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
 373                         dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
 374
 375                         _mm_store_si128((__m128i*) dst_ptr, dst1);
 376                         _mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
 377
 378                         l_ptr+=8;
 379                         h_ptr+=8;
 380                         dst_ptr+=16;
 381                 }
 382         }
 383 }
 384
 385 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 386 rfx_dwt_2d_decode_block_vert_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
 387 {
 388         int x, n;
 389         sint16* l_ptr = l;
 390         sint16* h_ptr = h;
 391         sint16* dst_ptr = dst;
 392         __m128i l_n;
 393         __m128i h_n;
 394         __m128i tmp_n;
 395         __m128i h_n_m;
 396         __m128i dst_n;
 397         __m128i dst_n_m;
 398         __m128i dst_n_p;
 399
 400         int total_width = subband_width + subband_width;
 401
 402         /* Even coefficients */
 403         for (n = 0; n < subband_width; n++)
 404         {
 405                 for (x = 0; x < total_width; x+=8)
 406                 {
 407                         /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
 408
 409                         l_n = _mm_load_si128((__m128i*) l_ptr);
 410                         h_n = _mm_load_si128((__m128i*) h_ptr);
 411
 412                         tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
 413                         if (n == 0)
 414                                 tmp_n = _mm_add_epi16(tmp_n, h_n);
 415                         else
 416                         {
 417                                 h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
 418                                 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
 419                         }
 420                         tmp_n = _mm_srai_epi16(tmp_n, 1);
 421
 422                         dst_n = _mm_sub_epi16(l_n, tmp_n);
 423                         _mm_store_si128((__m128i*) dst_ptr, dst_n);
 424
 425                         l_ptr+=8;
 426                         h_ptr+=8;
 427                         dst_ptr+=8;
 428                 }
 429                 dst_ptr+=total_width;
 430         }
 431
 432         h_ptr = h;
 433         dst_ptr = dst + total_width;
 434
 435         /* Odd coefficients */
 436         for (n = 0; n < subband_width; n++)
 437         {
 438                 for (x = 0; x < total_width; x+=8)
 439                 {
 440                         /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
 441
 442                         h_n = _mm_load_si128((__m128i*) h_ptr);
 443                         dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
 444                         h_n = _mm_slli_epi16(h_n, 1);
 445
 446                         tmp_n = dst_n_m;
 447                         if (n == subband_width - 1)
 448                                 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
 449                         else
 450                         {
 451                                 dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
 452                                 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
 453                         }
 454                         tmp_n = _mm_srai_epi16(tmp_n, 1);
 455
 456                         dst_n = _mm_add_epi16(tmp_n, h_n);
 457                         _mm_store_si128((__m128i*) dst_ptr, dst_n);
 458
 459                         h_ptr+=8;
 460                         dst_ptr+=8;
 461                 }
 462                 dst_ptr+=total_width;
 463         }
 464 }
 465
 466 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 467 rfx_dwt_2d_decode_block_sse2(sint16* buffer, sint16* idwt, int subband_width)
 468 {
 469         sint16 *hl, *lh, *hh, *ll;
 470         sint16 *l_dst, *h_dst;
 471
 472         _mm_prefetch_buffer((char*) idwt, subband_width * 4 * sizeof(sint16));
 473
 474         /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
 475         /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
 476         /* The lower part L uses LL(3) and HL(0). */
 477         /* The higher part H uses LH(1) and HH(2). */
 478
 479         ll = buffer + subband_width * subband_width * 3;
 480         hl = buffer;
 481         l_dst = idwt;
 482
 483         rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
 484
 485         lh = buffer + subband_width * subband_width;
 486         hh = buffer + subband_width * subband_width * 2;
 487         h_dst = idwt + subband_width * subband_width * 2;
 488
 489         rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
 490
 491         /* Inverse DWT in vertical direction, results are stored in original buffer. */
 492         rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
 493 }
 494
 495 static void rfx_dwt_2d_decode_sse2(sint16* buffer, sint16* dwt_buffer)
 496 {
 497         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
 498
 499         rfx_dwt_2d_decode_block_sse2(buffer + 3840, dwt_buffer, 8);
 500         rfx_dwt_2d_decode_block_sse2(buffer + 3072, dwt_buffer, 16);
 501         rfx_dwt_2d_decode_block_sse2(buffer, dwt_buffer, 32);
 502 }
 503
 504 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 505 rfx_dwt_2d_encode_block_vert_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
 506 {
 507         int total_width;
 508         int x;
 509         int n;
 510         __m128i src_2n;
 511         __m128i src_2n_1;
 512         __m128i src_2n_2;
 513         __m128i h_n;
 514         __m128i h_n_m;
 515         __m128i l_n;
 516
 517         total_width = subband_width << 1;
 518
 519         for (n = 0; n < subband_width; n++)
 520         {
 521                 for (x = 0; x < total_width; x += 8)
 522                 {
 523                         src_2n = _mm_load_si128((__m128i*) src);
 524                         src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
 525                         if (n < subband_width - 1)
 526                                 src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
 527                         else
 528                                 src_2n_2 = src_2n;
 529
 530                         /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
 531
 532                         h_n = _mm_add_epi16(src_2n, src_2n_2);
 533                         h_n = _mm_srai_epi16(h_n, 1);
 534                         h_n = _mm_sub_epi16(src_2n_1, h_n);
 535                         h_n = _mm_srai_epi16(h_n, 1);
 536
 537                         _mm_store_si128((__m128i*) h, h_n);
 538
 539                         if (n == 0)
 540                                 h_n_m = h_n;
 541                         else
 542                                 h_n_m = _mm_load_si128((__m128i*) (h - total_width));
 543
 544                         /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
 545
 546                         l_n = _mm_add_epi16(h_n_m, h_n);
 547                         l_n = _mm_srai_epi16(l_n, 1);
 548                         l_n = _mm_add_epi16(l_n, src_2n);
 549
 550                         _mm_store_si128((__m128i*) l, l_n);
 551
 552                         src += 8;
 553                         l += 8;
 554                         h += 8;
 555                 }
 556                 src += total_width;
 557         }
 558 }
 559
 560 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 561 rfx_dwt_2d_encode_block_horiz_sse2(sint16* src, sint16* l, sint16* h, int subband_width)
 562 {
 563         int y;
 564         int n;
 565         int first;
 566         __m128i src_2n;
 567         __m128i src_2n_1;
 568         __m128i src_2n_2;
 569         __m128i h_n;
 570         __m128i h_n_m;
 571         __m128i l_n;
 572
 573         for (y = 0; y < subband_width; y++)
 574         {
 575                 for (n = 0; n < subband_width; n += 8)
 576                 {
 577                         /* The following 3 Set operations consumes more than half of the total DWT processing time! */
 578                         src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
 579                         src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
 580                         src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
 581                                 src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
 582
 583                         /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
 584
 585                         h_n = _mm_add_epi16(src_2n, src_2n_2);
 586                         h_n = _mm_srai_epi16(h_n, 1);
 587                         h_n = _mm_sub_epi16(src_2n_1, h_n);
 588                         h_n = _mm_srai_epi16(h_n, 1);
 589
 590                         _mm_store_si128((__m128i*) h, h_n);
 591
 592                         h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
 593                         if (n == 0)
 594                         {
 595                                 first = _mm_extract_epi16(h_n_m, 1);
 596                                 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
 597                         }
 598
 599                         /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
 600
 601                         l_n = _mm_add_epi16(h_n_m, h_n);
 602                         l_n = _mm_srai_epi16(l_n, 1);
 603                         l_n = _mm_add_epi16(l_n, src_2n);
 604
 605                         _mm_store_si128((__m128i*) l, l_n);
 606
 607                         src += 16;
 608                         l += 8;
 609                         h += 8;
 610                 }
 611         }
 612 }
 613
 614 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 615 rfx_dwt_2d_encode_block_sse2(sint16* buffer, sint16* dwt, int subband_width)
 616 {
 617         sint16 *hl, *lh, *hh, *ll;
 618         sint16 *l_src, *h_src;
 619
 620         _mm_prefetch_buffer((char*) dwt, subband_width * 4 * sizeof(sint16));
 621
 622         /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
 623
 624         l_src = dwt;
 625         h_src = dwt + subband_width * subband_width * 2;
 626
 627         rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
 628
 629         /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
 630         /* The lower part L generates LL(3) and HL(0). */
 631         /* The higher part H generates LH(1) and HH(2). */
 632
 633         ll = buffer + subband_width * subband_width * 3;
 634         hl = buffer;
 635
 636         lh = buffer + subband_width * subband_width;
 637         hh = buffer + subband_width * subband_width * 2;
 638
 639         rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
 640         rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
 641 }
 642
 643 static void rfx_dwt_2d_encode_sse2(sint16* buffer, sint16* dwt_buffer)
 644 {
 645         _mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
 646
 647         rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
 648         rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
 649         rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
 650 }
 651
 652 void rfx_init_sse2(RFX_CONTEXT* context)
 653 {
 654         DEBUG_RFX("Using SSE2 optimizations");
 655
 656         IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
 657         IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
 658         IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
 659         IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
 660         IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
 661         IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");
 662
 663         context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;
 664         context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;
 665         context->quantization_decode = rfx_quantization_decode_sse2;
 666         context->quantization_encode = rfx_quantization_encode_sse2;
 667         context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
 668         context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
 669 }