libfreerdp-codec/rfx_neon.c

   1 /*
   2    FreeRDP: A Remote Desktop Protocol client.
   3    RemoteFX Codec Library - NEON Optimizations
   4
   5    Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
   6
   7    Licensed under the Apache License, Version 2.0 (the "License");
   8    you may not use this file except in compliance with the License.
   9    You may obtain a copy of the License at
  10
  11        http://www.apache.org/licenses/LICENSE-2.0
  12
  13    Unless required by applicable law or agreed to in writing, software
  14    distributed under the License is distributed on an "AS IS" BASIS,
  15    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16    See the License for the specific language governing permissions and
  17    limitations under the License.
  18 */
  19
  20 #if defined(__ARM_NEON__)
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <arm_neon.h>
  26
  27 #include "rfx_types.h"
  28 #include "rfx_neon.h"
  29
  30 #if defined(ANDROID)
  31 #include <cpu-features.h>
  32 #endif
  33
  34
  35 void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
  36 {
  37         int16x8_t zero = vdupq_n_s16(0);
  38         int16x8_t max = vdupq_n_s16(255);
  39         int16x8_t y_add = vdupq_n_s16(128);
  40
  41         int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer;
  42         int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
  43         int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;
  44
  45         int i;
  46         for (i = 0; i < 4096 / 8; i++)
  47         {
  48                 int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
  49                 y = vaddq_s16(y, y_add);
  50
  51                 int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]);
  52
  53                 // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
  54                 int16x8_t r = vaddq_s16(y, cr);
  55                 r = vaddq_s16(r, vshrq_n_s16(cr, 2));
  56                 r = vaddq_s16(r, vshrq_n_s16(cr, 3));
  57                 r = vaddq_s16(r, vshrq_n_s16(cr, 5));
  58                 r = vminq_s16(vmaxq_s16(r, zero), max);
  59                 vst1q_s16((sint16*)&y_r_buf[i], r);
  60
  61                 // cb = cb_g_buf[i];
  62                 int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]);
  63
  64                 // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
  65                 int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
  66                 g = vsubq_s16(g, vshrq_n_s16(cb, 4));
  67                 g = vsubq_s16(g, vshrq_n_s16(cb, 5));
  68                 g = vsubq_s16(g, vshrq_n_s16(cr, 1));
  69                 g = vsubq_s16(g, vshrq_n_s16(cr, 3));
  70                 g = vsubq_s16(g, vshrq_n_s16(cr, 4));
  71                 g = vsubq_s16(g, vshrq_n_s16(cr, 5));
  72                 g = vminq_s16(vmaxq_s16(g, zero), max);
  73                 vst1q_s16((sint16*)&cb_g_buf[i], g);
  74
  75                 // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
  76                 int16x8_t b = vaddq_s16(y, cb);
  77                 b = vaddq_s16(b, vshrq_n_s16(cb, 1));
  78                 b = vaddq_s16(b, vshrq_n_s16(cb, 2));
  79                 b = vaddq_s16(b, vshrq_n_s16(cb, 6));
  80                 b = vminq_s16(vmaxq_s16(b, zero), max);
  81                 vst1q_s16((sint16*)&cr_b_buf[i], b);
  82         }
  83
  84 }
  85
  86 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  87 rfx_quantization_decode_block_NEON(sint16 * buffer, const int buffer_size, const uint32 factor)
  88 {
  89         if (factor <= 6)
  90                 return;
  91         int16x8_t quantFactors = vdupq_n_s16(factor - 6);
  92         int16x8_t* buf = (int16x8_t*)buffer;
  93         int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
  94
  95         do
  96         {
  97                 int16x8_t val = vld1q_s16((sint16*)buf);
  98                 val = vshlq_s16(val, quantFactors);
  99                 vst1q_s16((sint16*)buf, val);
 100                 buf++;
 101         }
 102         while(buf < buf_end);
 103 }
 104
 105 void
 106 rfx_quantization_decode_NEON(sint16 * buffer, const uint32 * quantization_values)
 107 {
 108         rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
 109         rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
 110         rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
 111         rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5]); /* HL2 */
 112         rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4]); /* LH2 */
 113         rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6]); /* HH2 */
 114         rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2]); /* HL3 */
 115         rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1]); /* LH3 */
 116         rfx_quantization_decode_block_NEON(buffer + 3968, 64, quantization_values[3]); /* HH3 */
 117         rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0]); /* LL3 */
 118 }
 119
 120
 121
 122 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 123 rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
 124 {
 125         int y, n;
 126         sint16 * l_ptr = l;
 127         sint16 * h_ptr = h;
 128         sint16 * dst_ptr = dst;
 129
 130         for (y = 0; y < subband_width; y++)
 131         {
 132                 /* Even coefficients */
 133                 for (n = 0; n < subband_width; n+=8)
 134                 {
 135                         // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
 136                         int16x8_t l_n = vld1q_s16(l_ptr);
 137
 138                         int16x8_t h_n = vld1q_s16(h_ptr);
 139                         int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
 140
 141                         if (n == 0)
 142                         {
 143                                 int16_t first = vgetq_lane_s16(h_n_m, 1);
 144                                 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
 145                         }
 146
 147                         int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
 148                         tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
 149                         tmp_n = vshrq_n_s16(tmp_n, 1);
 150
 151                         int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
 152
 153                         vst1q_s16(l_ptr, dst_n);
 154
 155                         l_ptr+=8;
 156                         h_ptr+=8;
 157                 }
 158                 l_ptr -= subband_width;
 159                 h_ptr -= subband_width;
 160
 161                 /* Odd coefficients */
 162                 for (n = 0; n < subband_width; n+=8)
 163                 {
 164                         // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
 165
 166                         int16x8_t h_n = vld1q_s16(h_ptr);
 167
 168                         h_n = vshlq_n_s16(h_n, 1);
 169
 170                         int16x8x2_t dst_n;
 171                         dst_n.val[0] = vld1q_s16(l_ptr);
 172                         int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
 173                         if (n == subband_width - 8)
 174                         {
 175                                 int16_t last = vgetq_lane_s16(dst_n_p, 6);
 176                                 dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
 177                         }
 178
 179                         dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
 180                         dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
 181
 182                         dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
 183
 184                         vst2q_s16(dst_ptr, dst_n);
 185
 186                         l_ptr+=8;
 187                         h_ptr+=8;
 188                         dst_ptr+=16;
 189                 }
 190         }
 191 }
 192
 193 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 194 rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
 195 {
 196         int x, n;
 197         sint16 * l_ptr = l;
 198         sint16 * h_ptr = h;
 199         sint16 * dst_ptr = dst;
 200
 201         int total_width = subband_width + subband_width;
 202
 203         /* Even coefficients */
 204         for (n = 0; n < subband_width; n++)
 205         {
 206                 for (x = 0; x < total_width; x+=8)
 207                 {
 208                         // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
 209
 210                         int16x8_t l_n = vld1q_s16(l_ptr);
 211                         int16x8_t h_n = vld1q_s16(h_ptr);
 212
 213                         int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));;
 214                         if (n == 0)
 215                                 tmp_n = vaddq_s16(tmp_n, h_n);
 216                         else
 217                         {
 218                                 int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
 219                                 tmp_n = vaddq_s16(tmp_n, h_n_m);
 220                         }
 221                         tmp_n = vshrq_n_s16(tmp_n, 1);
 222
 223                         int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
 224                         vst1q_s16(dst_ptr, dst_n);
 225
 226                         l_ptr+=8;
 227                         h_ptr+=8;
 228                         dst_ptr+=8;
 229                 }
 230                 dst_ptr+=total_width;
 231         }
 232
 233         h_ptr = h;
 234         dst_ptr = dst + total_width;
 235
 236         /* Odd coefficients */
 237         for (n = 0; n < subband_width; n++)
 238         {
 239                 for (x = 0; x < total_width; x+=8)
 240                 {
 241                 // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
 242                 int16x8_t h_n = vld1q_s16(h_ptr);
 243                 int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
 244
 245                 h_n = vshlq_n_s16(h_n, 1);
 246
 247                 int16x8_t tmp_n = dst_n_m;
 248                 if (n == subband_width - 1)
 249                         tmp_n = vaddq_s16(tmp_n, dst_n_m);
 250                 else
 251                 {
 252                         int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
 253                         tmp_n = vaddq_s16(tmp_n, dst_n_p);
 254                 }
 255                 tmp_n = vshrq_n_s16(tmp_n, 1);
 256
 257                 int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
 258                 vst1q_s16(dst_ptr, dst_n);
 259
 260                 h_ptr+=8;
 261                 dst_ptr+=8;
 262         }
 263         dst_ptr+=total_width;
 264 }
 265 }
 266
 267 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 268 rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
 269 {
 270         sint16 * hl, * lh, * hh, * ll;
 271         sint16 * l_dst, * h_dst;
 272
 273         /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
 274         /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
 275         /* The lower part L uses LL(3) and HL(0). */
 276         /* The higher part H uses LH(1) and HH(2). */
 277
 278         ll = buffer + subband_width * subband_width * 3;
 279         hl = buffer;
 280         l_dst = idwt;
 281
 282         rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
 283
 284         lh = buffer + subband_width * subband_width;
 285         hh = buffer + subband_width * subband_width * 2;
 286         h_dst = idwt + subband_width * subband_width * 2;
 287
 288         rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
 289
 290         /* Inverse DWT in vertical direction, results are stored in original buffer. */
 291         rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
 292 }
 293
 294 void
 295 rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer)
 296 {
 297         rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
 298         rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
 299         rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
 300 }
 301
 302
 303
 304 int isNeonSupported()
 305 {
 306 #if defined(ANDROID)
 307         if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM)
 308         {
 309                 DEBUG_RFX("NEON optimization disabled - No ARM CPU found");
 310                 return 0;
 311         }
 312
 313         uint64_t features = android_getCpuFeatures();
 314         if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
 315         {
 316                 if (features & ANDROID_CPU_ARM_FEATURE_NEON)
 317                 {
 318                         DEBUG_RFX("NEON optimization enabled!");
 319                         return 1;
 320                 }
 321                 DEBUG_RFX("NEON optimization disabled - CPU not NEON capable");
 322         }
 323         else
 324                 DEBUG_RFX("NEON optimization disabled - No ARMv7 CPU found");
 325
 326         return 0;
 327 #else
 328         return 1;
 329 #endif
 330 }
 331
 332
 333 void rfx_init_neon(RFX_CONTEXT * context)
 334 {
 335
 336
 337         if(isNeonSupported())
 338         {
 339                 DEBUG_RFX("Using NEON optimizations");
 340
 341                 IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
 342                 IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON");
 343                 IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON");
 344
 345                 context->decode_ycbcr_to_rgb = rfx_decode_YCbCr_to_RGB_NEON;
 346                 context->quantization_decode = rfx_quantization_decode_NEON;
 347                 context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
 348         }
 349 }
 350
 351 #endif // __ARM_NEON__
 352