/*
   FreeRDP: A Remote Desktop Protocol client.
   RemoteFX Codec Library - NEON Optimizations

   Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
20 #if defined(__ARM_NEON__)
#include <arm_neon.h>

#include "rfx_types.h"

#if defined(ANDROID)
#include <cpu-features.h>
#endif
35 void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
37 int16x8_t zero = vdupq_n_s16(0);
38 int16x8_t max = vdupq_n_s16(255);
39 int16x8_t y_add = vdupq_n_s16(128);
41 int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer;
42 int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
43 int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;
46 for (i = 0; i < 4096 / 8; i++)
48 int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
49 y = vaddq_s16(y, y_add);
51 int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]);
53 // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
54 int16x8_t r = vaddq_s16(y, cr);
55 r = vaddq_s16(r, vshrq_n_s16(cr, 2));
56 r = vaddq_s16(r, vshrq_n_s16(cr, 3));
57 r = vaddq_s16(r, vshrq_n_s16(cr, 5));
58 r = vminq_s16(vmaxq_s16(r, zero), max);
59 vst1q_s16((sint16*)&y_r_buf[i], r);
62 int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]);
64 // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
65 int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
66 g = vsubq_s16(g, vshrq_n_s16(cb, 4));
67 g = vsubq_s16(g, vshrq_n_s16(cb, 5));
68 g = vsubq_s16(g, vshrq_n_s16(cr, 1));
69 g = vsubq_s16(g, vshrq_n_s16(cr, 3));
70 g = vsubq_s16(g, vshrq_n_s16(cr, 4));
71 g = vsubq_s16(g, vshrq_n_s16(cr, 5));
72 g = vminq_s16(vmaxq_s16(g, zero), max);
73 vst1q_s16((sint16*)&cb_g_buf[i], g);
75 // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
76 int16x8_t b = vaddq_s16(y, cb);
77 b = vaddq_s16(b, vshrq_n_s16(cb, 1));
78 b = vaddq_s16(b, vshrq_n_s16(cb, 2));
79 b = vaddq_s16(b, vshrq_n_s16(cb, 6));
80 b = vminq_s16(vmaxq_s16(b, zero), max);
81 vst1q_s16((sint16*)&cr_b_buf[i], b);
86 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
87 rfx_quantization_decode_block_NEON(sint16 * buffer, const int buffer_size, const uint32 factor)
91 int16x8_t quantFactors = vdupq_n_s16(factor - 6);
92 int16x8_t* buf = (int16x8_t*)buffer;
93 int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
97 int16x8_t val = vld1q_s16((sint16*)buf);
98 val = vshlq_s16(val, quantFactors);
99 vst1q_s16((sint16*)buf, val);
102 while(buf < buf_end);
106 rfx_quantization_decode_NEON(sint16 * buffer, const uint32 * quantization_values)
108 rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
109 rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
110 rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
111 rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5]); /* HL2 */
112 rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4]); /* LH2 */
113 rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6]); /* HH2 */
114 rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2]); /* HL3 */
115 rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1]); /* LH3 */
116 rfx_quantization_decode_block_NEON(buffer + 3968, 64, quantization_values[3]); /* HH3 */
117 rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0]); /* LL3 */
122 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
128 sint16 * dst_ptr = dst;
130 for (y = 0; y < subband_width; y++)
132 /* Even coefficients */
133 for (n = 0; n < subband_width; n+=8)
135 // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
136 int16x8_t l_n = vld1q_s16(l_ptr);
138 int16x8_t h_n = vld1q_s16(h_ptr);
139 int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
143 int16_t first = vgetq_lane_s16(h_n_m, 1);
144 h_n_m = vsetq_lane_s16(first, h_n_m, 0);
147 int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
148 tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
149 tmp_n = vshrq_n_s16(tmp_n, 1);
151 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
153 vst1q_s16(l_ptr, dst_n);
158 l_ptr -= subband_width;
159 h_ptr -= subband_width;
161 /* Odd coefficients */
162 for (n = 0; n < subband_width; n+=8)
164 // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
166 int16x8_t h_n = vld1q_s16(h_ptr);
168 h_n = vshlq_n_s16(h_n, 1);
171 dst_n.val[0] = vld1q_s16(l_ptr);
172 int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
173 if (n == subband_width - 8)
175 int16_t last = vgetq_lane_s16(dst_n_p, 6);
176 dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
179 dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
180 dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
182 dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
184 vst2q_s16(dst_ptr, dst_n);
193 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
199 sint16 * dst_ptr = dst;
201 int total_width = subband_width + subband_width;
203 /* Even coefficients */
204 for (n = 0; n < subband_width; n++)
206 for (x = 0; x < total_width; x+=8)
208 // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
210 int16x8_t l_n = vld1q_s16(l_ptr);
211 int16x8_t h_n = vld1q_s16(h_ptr);
213 int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));;
215 tmp_n = vaddq_s16(tmp_n, h_n);
218 int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
219 tmp_n = vaddq_s16(tmp_n, h_n_m);
221 tmp_n = vshrq_n_s16(tmp_n, 1);
223 int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
224 vst1q_s16(dst_ptr, dst_n);
230 dst_ptr+=total_width;
234 dst_ptr = dst + total_width;
236 /* Odd coefficients */
237 for (n = 0; n < subband_width; n++)
239 for (x = 0; x < total_width; x+=8)
241 // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
242 int16x8_t h_n = vld1q_s16(h_ptr);
243 int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
245 h_n = vshlq_n_s16(h_n, 1);
247 int16x8_t tmp_n = dst_n_m;
248 if (n == subband_width - 1)
249 tmp_n = vaddq_s16(tmp_n, dst_n_m);
252 int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
253 tmp_n = vaddq_s16(tmp_n, dst_n_p);
255 tmp_n = vshrq_n_s16(tmp_n, 1);
257 int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
258 vst1q_s16(dst_ptr, dst_n);
263 dst_ptr+=total_width;
267 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
270 sint16 * hl, * lh, * hh, * ll;
271 sint16 * l_dst, * h_dst;
273 /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
274 /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
275 /* The lower part L uses LL(3) and HL(0). */
276 /* The higher part H uses LH(1) and HH(2). */
278 ll = buffer + subband_width * subband_width * 3;
282 rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
284 lh = buffer + subband_width * subband_width;
285 hh = buffer + subband_width * subband_width * 2;
286 h_dst = idwt + subband_width * subband_width * 2;
288 rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
290 /* Inverse DWT in vertical direction, results are stored in original buffer. */
291 rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
295 rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer)
297 rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
298 rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
299 rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
/*
 * Report whether the NEON code paths may be used.
 *
 * On Android the CPU is probed at run time: it must be an ARM family CPU
 * reporting both the ARMv7 and the NEON feature bits. On any other platform
 * no probe is available here; this file is only compiled when __ARM_NEON__
 * is defined, i.e. the compiler is already generating NEON code, so NEON is
 * taken for granted.
 *
 * Returns 1 if NEON may be used, 0 otherwise.
 */
int isNeonSupported(void)
{
#if defined(ANDROID)
	uint64_t features;

	if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM)
	{
		DEBUG_RFX("NEON optimization disabled - No ARM CPU found");
		return 0;
	}

	features = android_getCpuFeatures();

	if (!(features & ANDROID_CPU_ARM_FEATURE_ARMv7))
	{
		DEBUG_RFX("NEON optimization disabled - No ARMv7 CPU found");
		return 0;
	}

	if (!(features & ANDROID_CPU_ARM_FEATURE_NEON))
	{
		DEBUG_RFX("NEON optimization disabled - CPU not NEON capable");
		return 0;
	}

	DEBUG_RFX("NEON optimization enabled!");
	return 1;
#else
	return 1;
#endif
}
333 void rfx_init_neon(RFX_CONTEXT * context)
337 if(isNeonSupported())
339 DEBUG_RFX("Using NEON optimizations");
341 IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
342 IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON");
343 IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON");
345 context->decode_ycbcr_to_rgb = rfx_decode_YCbCr_to_RGB_NEON;
346 context->quantization_decode = rfx_quantization_decode_NEON;
347 context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
351 #endif // __ARM_NEON__