Initial commit - from Precise source
[freerdp-ubuntu-pcb-backport.git] / libfreerdp-codec / rfx_neon.c
1 /*
2    FreeRDP: A Remote Desktop Protocol client.
3    RemoteFX Codec Library - NEON Optimizations
4
5    Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
6
7    Licensed under the Apache License, Version 2.0 (the "License");
8    you may not use this file except in compliance with the License.
9    You may obtain a copy of the License at
10
11        http://www.apache.org/licenses/LICENSE-2.0
12
13    Unless required by applicable law or agreed to in writing, software
14    distributed under the License is distributed on an "AS IS" BASIS,
15    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16    See the License for the specific language governing permissions and
17    limitations under the License.
18 */
19
20 #if defined(__ARM_NEON__)
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <arm_neon.h>
26
27 #include "rfx_types.h"
28 #include "rfx_neon.h"
29
30 #if defined(ANDROID)
31 #include <cpu-features.h>
32 #endif
33
34
/*
 * Convert three planes of 4096 samples from YCbCr to RGB, in place.
 *
 * On entry the buffers hold Y, Cb and Cr; on return the same buffers hold
 * R (y_r_buffer), G (cb_g_buffer) and B (cr_b_buffer), each clamped to the
 * range 0..255. Eight samples are processed per iteration. All additions
 * and subtractions are wrapping 16-bit lane operations.
 */
void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
{
	const int16x8_t floor_val = vdupq_n_s16(0);
	const int16x8_t ceil_val = vdupq_n_s16(255);
	const int16x8_t luma_offset = vdupq_n_s16(128);

	sint16 * y_r = y_r_buffer;
	sint16 * cb_g = cb_g_buffer;
	sint16 * cr_b = cr_b_buffer;
	int remaining;

	for (remaining = 4096 / 8; remaining > 0; remaining--)
	{
		/* Luma is re-centred by +128 before the chroma terms are applied. */
		int16x8_t luma = vaddq_s16(vld1q_s16(y_r), luma_offset);
		int16x8_t cr = vld1q_s16(cr_b);

		/* Shifted Cr terms that are shared between the R and G formulas. */
		int16x8_t cr_3 = vshrq_n_s16(cr, 3);
		int16x8_t cr_5 = vshrq_n_s16(cr, 5);

		/* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255); */
		int16x8_t red = vaddq_s16(luma, cr);
		red = vaddq_s16(red, vshrq_n_s16(cr, 2));
		red = vaddq_s16(red, cr_3);
		red = vaddq_s16(red, cr_5);
		vst1q_s16(y_r, vminq_s16(vmaxq_s16(red, floor_val), ceil_val));

		/* Cb is fetched only after R has been written back, as before. */
		int16x8_t cb = vld1q_s16(cb_g);
		int16x8_t cb_2 = vshrq_n_s16(cb, 2);

		/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */
		/* The seven terms are summed first; with wrapping lanes this equals subtracting them one by one. */
		int16x8_t chroma = vaddq_s16(cb_2, vshrq_n_s16(cb, 4));
		chroma = vaddq_s16(chroma, vshrq_n_s16(cb, 5));
		chroma = vaddq_s16(chroma, vshrq_n_s16(cr, 1));
		chroma = vaddq_s16(chroma, cr_3);
		chroma = vaddq_s16(chroma, vshrq_n_s16(cr, 4));
		chroma = vaddq_s16(chroma, cr_5);
		int16x8_t green = vsubq_s16(luma, chroma);
		vst1q_s16(cb_g, vminq_s16(vmaxq_s16(green, floor_val), ceil_val));

		/* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255); */
		int16x8_t blue = vaddq_s16(luma, cb);
		blue = vaddq_s16(blue, vshrq_n_s16(cb, 1));
		blue = vaddq_s16(blue, cb_2);
		blue = vaddq_s16(blue, vshrq_n_s16(cb, 6));
		vst1q_s16(cr_b, vminq_s16(vmaxq_s16(blue, floor_val), ceil_val));

		y_r += 8;
		cb_g += 8;
		cr_b += 8;
	}
}
85
/*
 * Shift every coefficient of one sub-band left by (factor - 6) bits.
 *
 * buffer      - first coefficient of the sub-band
 * buffer_size - number of coefficients; consumed eight at a time
 * factor      - quantization value; nothing is done when it is 6 or less
 *
 * At least one vector of eight coefficients is always processed once the
 * factor check has passed, regardless of buffer_size.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(sint16 * buffer, const int buffer_size, const uint32 factor)
{
	sint16 * cursor = buffer;
	sint16 * const limit = buffer + buffer_size;
	int16x8_t shift_by;

	if (factor <= 6)
		return;

	shift_by = vdupq_n_s16(factor - 6);

	for (;;)
	{
		vst1q_s16(cursor, vshlq_s16(vld1q_s16(cursor), shift_by));
		cursor += 8;

		if (cursor >= limit)
			break;
	}
}
104
105 void
106 rfx_quantization_decode_NEON(sint16 * buffer, const uint32 * quantization_values)
107 {
108         rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
109         rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
110         rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
111         rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5]); /* HL2 */
112         rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4]); /* LH2 */
113         rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6]); /* HH2 */
114         rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2]); /* HL3 */
115         rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1]); /* LH3 */
116         rfx_quantization_decode_block_NEON(buffer + 3968, 64, quantization_values[3]); /* HH3 */
117         rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0]); /* LL3 */
118 }
119
120
121
/*
 * Horizontal inverse DWT of one low/high sub-band pair.
 *
 * l, h          - low and high sub-bands, subband_width rows of
 *                 subband_width coefficients each
 * dst           - output, subband_width rows of 2 * subband_width samples
 *                 with even and odd results interleaved
 * subband_width - row length; rows are walked eight coefficients at a time
 *                 (callers in this file pass 8, 16 and 32)
 *
 * The even results are written back into l as an intermediate step, so the
 * contents of l are overwritten.
 *
 * The neighbour vectors at the two row edges are assembled from data that
 * is already loaded instead of being fetched from h - 1 / l + 1. The lane
 * that such a load would bring in from outside the row is always replaced
 * by a mirrored value, and on the first and last row the load would touch
 * one element outside the sub-band passed in.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
{
	int y, n;
	sint16 * l_ptr = l;
	sint16 * h_ptr = h;
	sint16 * dst_ptr = dst;
	const int16x8_t one = vdupq_n_s16(1);

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n+=8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m;

			if (n == 0)
			{
				/* Row start: h[-1] is mirrored to h[0], giving
				   { h0, h0, h1, ..., h6 } without reading h_ptr[-1]. */
				int16x8_t first = vdupq_n_s16(vgetq_lane_s16(h_n, 0));
				h_n_m = vextq_s16(first, h_n, 7);
			}
			else
			{
				h_n_m = vld1q_s16(h_ptr - 1);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, one);
			tmp_n = vshrq_n_s16(tmp_n, 1);

			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);

			/* Even results are parked in l until the odd pass below. */
			vst1q_s16(l_ptr, dst_n);

			l_ptr+=8;
			h_ptr+=8;
		}
		/* Rewind to the start of the row for the odd pass. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (n = 0; n < subband_width; n+=8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);

			h_n = vshlq_n_s16(h_n, 1);

			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);

			int16x8_t dst_n_p;
			if (n == subband_width - 8)
			{
				/* Row end: the sample past the last even value is
				   mirrored, giving { d1, ..., d7, d7 } without
				   reading l_ptr[8]. */
				int16x8_t last = vdupq_n_s16(vgetq_lane_s16(dst_n.val[0], 7));
				dst_n_p = vextq_s16(dst_n.val[0], last, 1);
			}
			else
			{
				dst_n_p = vld1q_s16(l_ptr + 1);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);

			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);

			/* Interleaving store: even, odd, even, odd, ... */
			vst2q_s16(dst_ptr, dst_n);

			l_ptr+=8;
			h_ptr+=8;
			dst_ptr+=16;
		}
	}
}
192
/*
 * Vertical inverse DWT of the two halves produced by the horizontal pass.
 *
 * l, h          - low and high halves, subband_width rows of
 *                 2 * subband_width samples each
 * dst           - output, 2 * subband_width rows of 2 * subband_width
 *                 samples; even rows are written by the first pass and
 *                 odd rows by the second
 * subband_width - number of rows in each input half
 *
 * Rows are processed eight samples at a time.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
{
	const int row_len = subband_width + subband_width;
	const int16x8_t rounding = vdupq_n_s16(1);
	sint16 * low = l;
	sint16 * high = h;
	sint16 * out = dst;
	int row, col;

	/* Even rows: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
	for (row = 0; row < subband_width; row++)
	{
		for (col = 0; col < row_len; col += 8, low += 8, high += 8, out += 8)
		{
			int16x8_t h_here = vld1q_s16(high);
			int16x8_t h_above;

			/* The first row has no row above it; use the row itself. */
			if (row == 0)
				h_above = h_here;
			else
				h_above = vld1q_s16(high - row_len);

			int16x8_t half = vaddq_s16(vaddq_s16(h_here, rounding), h_above);
			half = vshrq_n_s16(half, 1);

			vst1q_s16(out, vsubq_s16(vld1q_s16(low), half));
		}

		/* Leave a gap for the odd row that the second pass fills in. */
		out += row_len;
	}

	high = h;
	out = dst + row_len;

	/* Odd rows: dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
	for (row = 0; row < subband_width; row++)
	{
		for (col = 0; col < row_len; col += 8, high += 8, out += 8)
		{
			int16x8_t even_above = vld1q_s16(out - row_len);
			int16x8_t even_below;

			/* The last odd row has no even row below it; reuse the one above. */
			if (row == subband_width - 1)
				even_below = even_above;
			else
				even_below = vld1q_s16(out + row_len);

			int16x8_t mean = vshrq_n_s16(vaddq_s16(even_above, even_below), 1);
			int16x8_t h_twice = vshlq_n_s16(vld1q_s16(high), 1);

			vst1q_s16(out, vaddq_s16(mean, h_twice));
		}

		/* Step over the even row that is already in place. */
		out += row_len;
	}
}
266
267 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
269 {
270         sint16 * hl, * lh, * hh, * ll;
271         sint16 * l_dst, * h_dst;
272
273         /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
274         /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
275         /* The lower part L uses LL(3) and HL(0). */
276         /* The higher part H uses LH(1) and HH(2). */
277
278         ll = buffer + subband_width * subband_width * 3;
279         hl = buffer;
280         l_dst = idwt;
281
282         rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
283
284         lh = buffer + subband_width * subband_width;
285         hh = buffer + subband_width * subband_width * 2;
286         h_dst = idwt + subband_width * subband_width * 2;
287
288         rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
289
290         /* Inverse DWT in vertical direction, results are stored in original buffer. */
291         rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
292 }
293
294 void
295 rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer)
296 {
297         rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
298         rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
299         rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
300 }
301
302
303
/*
 * Report whether the NEON routines in this file may be used.
 *
 * When ANDROID is defined the CPU is queried through the cpu-features
 * API: the CPU family must be ARM and both the ARMv7 and NEON feature
 * bits must be set. The outcome is logged with DEBUG_RFX. On every other
 * build the function returns 1 without any runtime check.
 *
 * Declared with an explicit (void) parameter list so that calls with
 * arguments are diagnosed by the compiler.
 *
 * Returns 1 if NEON may be used, 0 otherwise.
 */
int isNeonSupported(void)
{
#if defined(ANDROID)
	if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM)
	{
		DEBUG_RFX("NEON optimization disabled - No ARM CPU found");
		return 0;
	}

	uint64_t features = android_getCpuFeatures();
	if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
	{
		if (features & ANDROID_CPU_ARM_FEATURE_NEON)
		{
			DEBUG_RFX("NEON optimization enabled!");
			return 1;
		}
		DEBUG_RFX("NEON optimization disabled - CPU not NEON capable");
	}
	else
		DEBUG_RFX("NEON optimization disabled - No ARMv7 CPU found");

	return 0;
#else
	/* No runtime probe outside Android. */
	return 1;
#endif
}
331
332
333 void rfx_init_neon(RFX_CONTEXT * context)
334 {
335
336
337         if(isNeonSupported())
338         {
339                 DEBUG_RFX("Using NEON optimizations");
340
341                 IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
342                 IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON");
343                 IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON");
344
345                 context->decode_ycbcr_to_rgb = rfx_decode_YCbCr_to_RGB_NEON;
346                 context->quantization_decode = rfx_quantization_decode_NEON;
347                 context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
348         }
349 }
350
351 #endif // __ARM_NEON__
352