SDL  2.0
yuv_rgb_sse_func.h
Go to the documentation of this file.
1 // Copyright 2016 Adrien Descamps
2 // Distributed under BSD 3-Clause License
3 
4 /* You need to define the following macros before including this file:
5  SSE_FUNCTION_NAME
6  STD_FUNCTION_NAME
7  YUV_FORMAT
8  RGB_FORMAT
9 */
10 /* You may define the following macro, which affects generated code:
11  SSE_ALIGNED
12 */
13 
/*
 * Select the 128-bit load/store intrinsics used by the READ_* and SAVE_*
 * macros below. Both branches currently expand to the unaligned variants;
 * the aligned load / streaming store pair is kept only as a commented-out
 * alternative inside the SSE_ALIGNED branch.
 */
14 #ifdef SSE_ALIGNED
15 /* Unaligned instructions seem faster, even on aligned data? */
16 /*
17 #define LOAD_SI128 _mm_load_si128
18 #define SAVE_SI128 _mm_stream_si128
19 */
20 #define LOAD_SI128 _mm_loadu_si128
21 #define SAVE_SI128 _mm_storeu_si128
22 #else
23 #define LOAD_SI128 _mm_loadu_si128
24 #define SAVE_SI128 _mm_storeu_si128
25 #endif
26 
/*
 * UV2RGB_16: compute the chroma contribution to R, G and B for 8 chroma
 * samples held as 16-bit lanes in U and V.
 *   r_tmp = V * v_r_factor
 *   g_tmp = U * u_g_factor + V * v_g_factor
 *   b_tmp = U * u_b_factor
 * Each 16-bit result is then duplicated into two adjacent lanes
 * (unpacklo/unpackhi of a register with itself), so R1/G1/B1 receive the
 * low four chroma results and R2/G2/B2 the high four, each repeated twice.
 * Requires `param`, `r_tmp`, `g_tmp` and `b_tmp` to be in scope at the
 * expansion site. Only the low 16 bits of each product are kept.
 */
27 #define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
28  r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
29  g_tmp = _mm_add_epi16( \
30  _mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \
31  _mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \
32  b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
33  R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
34  G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
35  B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
36  R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
37  G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
38  B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \
39 
/*
 * ADD_Y2RGB_16: add the luma term to previously computed chroma terms.
 * Y1 and Y2 (16-bit lanes) are overwritten with (Y - y_shift) * y_factor,
 * then each of R/G/B becomes (chroma + luma) >> PRECISION using an
 * arithmetic (sign-preserving) shift. R1/G1/B1 pair with Y1 and
 * R2/G2/B2 pair with Y2. Requires `param` in scope; PRECISION is defined
 * outside this file.
 */
40 #define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
41  Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
42  Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
43  \
44  R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
45  G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
46  B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
47  R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
48  G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
49  B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \
50 
/*
 * PACK_RGB565_32: pack 32 pixels of 8-bit R, G, B (16 per input register)
 * into four registers of eight 16-bit pixels each.
 *   red:   the byte is placed in the high half of each 16-bit lane and
 *          masked with 0xF800, keeping its top 5 bits in bits 15..11;
 *   green: zero-extended, >> 2 then << 5, keeping its top 6 bits in
 *          bits 10..5;
 *   blue:  zero-extended, >> 3, keeping its top 5 bits in bits 4..0.
 * RGB1/RGB2 come from the low/high halves of R1,G1,B1; RGB3/RGB4 from the
 * low/high halves of R2,G2,B2. The body is wrapped in braces, so the
 * temporaries do not leak into the caller's scope.
 */
51 #define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
52 { \
53  __m128i red_mask, tmp1, tmp2, tmp3, tmp4; \
54 \
55  red_mask = _mm_set1_epi16((short)0xF800); \
56  RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \
57  RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \
58  RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \
59  RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \
60  tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \
61  tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \
62  tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \
63  tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \
64  RGB1 = _mm_or_si128(RGB1, tmp1); \
65  RGB2 = _mm_or_si128(RGB2, tmp2); \
66  RGB3 = _mm_or_si128(RGB3, tmp3); \
67  RGB4 = _mm_or_si128(RGB4, tmp4); \
68  tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \
69  tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \
70  tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \
71  tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \
72  RGB1 = _mm_or_si128(RGB1, tmp1); \
73  RGB2 = _mm_or_si128(RGB2, tmp2); \
74  RGB3 = _mm_or_si128(RGB3, tmp3); \
75  RGB4 = _mm_or_si128(RGB4, tmp4); \
76 }
77 
/*
 * PACK_RGB24_32_STEP1: one de-interleaving pass from the six channel
 * registers into RGB1..RGB6. Treating every input as eight 16-bit lanes:
 *   RGB1/RGB2/RGB3 = low bytes of the lanes of (R1,R2)/(G1,G2)/(B1,B2)
 *   RGB4/RGB5/RGB6 = high bytes of the lanes of (R1,R2)/(G1,G2)/(B1,B2)
 * All packed values are <= 0xFF, so the saturating pack is lossless.
 */
78 #define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
79 RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
80 RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
81 RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
82 RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
83 RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
84 RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \
85 
/*
 * PACK_RGB24_32_STEP2: the same low-byte / high-byte split as STEP1, but
 * reading RGB1..RGB6 (taken in pairs) and writing the result back into
 * R1, R2, G1 (low bytes) and G2, B1, B2 (high bytes). It overwrites the
 * channel registers, which are used purely as scratch from here on.
 */
86 #define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
87 R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \
88 R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \
89 G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \
90 G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \
91 B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \
92 B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \
93 
/*
 * PACK_RGB24_32: run the byte-shuffle passes STEP1, STEP2, STEP1, STEP2,
 * STEP1 in sequence. The final pass writes RGB1..RGB6 (96 bytes for 32
 * pixels); R1..B2 are clobbered along the way.
 * NOTE(review): the five passes presumably leave the bytes interleaved as
 * R,G,B per pixel -- confirm against the reference (non-SSE) converter.
 */
94 #define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
95 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
96 PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
97 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
98 PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
99 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
100 
/*
 * PACK_RGBA_32: interleave four 8-bit channel planes into 32-bit pixels,
 * 32 pixels in total (16 from the *1 registers, 16 from the *2 registers).
 * Bytes are first interleaved pairwise (A with B, G with R) and the
 * resulting 16-bit pairs are interleaved again, so within each output
 * pixel the bytes appear, in increasing address order, as the A, B, G, R
 * arguments. RGB1..RGB4 hold the pixels built from the *1 inputs and
 * RGB5..RGB8 those built from the *2 inputs, four pixels per register.
 * The parameter names are positional only; callers permute the arguments
 * to obtain other channel orders.
 */
101 #define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
102 { \
103  __m128i lo_ab, hi_ab, lo_gr, hi_gr; \
104 \
105  lo_ab = _mm_unpacklo_epi8( A1, B1 ); \
106  hi_ab = _mm_unpackhi_epi8( A1, B1 ); \
107  lo_gr = _mm_unpacklo_epi8( G1, R1 ); \
108  hi_gr = _mm_unpackhi_epi8( G1, R1 ); \
109  RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
110  RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
111  RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
112  RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
113 \
114  lo_ab = _mm_unpacklo_epi8( A2, B2 ); \
115  hi_ab = _mm_unpackhi_epi8( A2, B2 ); \
116  lo_gr = _mm_unpacklo_epi8( G2, R2 ); \
117  hi_gr = _mm_unpackhi_epi8( G2, R2 ); \
118  RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
119  RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
120  RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
121  RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
122 }
123 
/*
 * PACK_PIXEL: format-specific glue between YUV2RGB_32 and SAVE_LINE1/2.
 * It declares the output registers rgb_1..rgb_N and packs the 8-bit
 * channel registers produced by YUV2RGB_32 into them: the r/g/b_8_11 and
 * r/g/b_8_12 registers (first line) go to the lower-numbered half of
 * rgb_*, and r/g/b_8_21 / r/g/b_8_22 (second line) to the upper half.
 * The 32-bit formats use an all-0xFF alpha register and differ only in
 * the order in which the channels are passed to PACK_RGBA_32.
 * Because it declares variables, it must be expanded inside a block where
 * those names are not already declared.
 */
124 #if RGB_FORMAT == RGB_FORMAT_RGB565
125 
126 #define PACK_PIXEL \
127  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
128  \
129  PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
130  \
131  PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \
132 
133 #elif RGB_FORMAT == RGB_FORMAT_RGB24
134 
135 #define PACK_PIXEL \
136  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
137  __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
138  \
139  PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
140  \
141  PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \
142 
143 #elif RGB_FORMAT == RGB_FORMAT_RGBA
144 
145 #define PACK_PIXEL \
146  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
147  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
148  __m128i a = _mm_set1_epi8((char)0xFF); \
149  \
150  PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
151  \
152  PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
153 
154 #elif RGB_FORMAT == RGB_FORMAT_BGRA
155 
156 #define PACK_PIXEL \
157  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
158  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
159  __m128i a = _mm_set1_epi8((char)0xFF); \
160  \
161  PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
162  \
163  PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
164 
165 #elif RGB_FORMAT == RGB_FORMAT_ARGB
166 
167 #define PACK_PIXEL \
168  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
169  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
170  __m128i a = _mm_set1_epi8((char)0xFF); \
171  \
172  PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
173  \
174  PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
175 
176 #elif RGB_FORMAT == RGB_FORMAT_ABGR
177 
178 #define PACK_PIXEL \
179  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
180  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
181  __m128i a = _mm_set1_epi8((char)0xFF); \
182  \
183  PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
184  \
185  PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
186 
187 #else
188 #error PACK_PIXEL unimplemented
189 #endif
190 
/*
 * SAVE_LINE1 / SAVE_LINE2: store the registers filled by PACK_PIXEL to
 * rgb_ptr1 (first line) and rgb_ptr2 (second line) in consecutive
 * 16-byte chunks. The number of stores matches 32 pixels of output:
 *   RGB565          -> 4 stores  (64 bytes per line)
 *   RGB24           -> 6 stores  (96 bytes per line)
 *   32-bit formats  -> 8 stores  (128 bytes per line)
 * SAVE_LINE1 writes the lower-numbered half of rgb_*, SAVE_LINE2 the
 * upper half.
 */
191 #if RGB_FORMAT == RGB_FORMAT_RGB565
192 
193 #define SAVE_LINE1 \
194  SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
195  SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
196  SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
197  SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
198 
199 #define SAVE_LINE2 \
200  SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \
201  SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \
202  SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \
203  SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \
204 
205 #elif RGB_FORMAT == RGB_FORMAT_RGB24
206 
207 #define SAVE_LINE1 \
208  SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
209  SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
210  SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
211  SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
212  SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
213  SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
214 
215 #define SAVE_LINE2 \
216  SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \
217  SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \
218  SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \
219  SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \
220  SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \
221  SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \
222 
223 #elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
224  RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
225 
226 #define SAVE_LINE1 \
227  SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
228  SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
229  SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
230  SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
231  SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
232  SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
233  SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \
234  SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \
235 
236 #define SAVE_LINE2 \
237  SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \
238  SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \
239  SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \
240  SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \
241  SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \
242  SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \
243  SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \
244  SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \
245 
246 #else
247 #error SAVE_LINE unimplemented
248 #endif
249 
/*
 * READ_Y(y_ptr) / READ_UV: load 16 luma bytes into `y`, and 16 U and 16 V
 * bytes into `u` / `v`, according to the source layout.
 *   YUV_FORMAT_420: every component is contiguous; one 16-byte load each.
 *   YUV_FORMAT_422: READ_Y loads 32 bytes and keeps the low byte of every
 *                   16-bit pair; READ_UV loads 64 bytes from u_ptr and
 *                   from v_ptr and keeps the low byte of every 32-bit
 *                   group.
 *   YUV_FORMAT_NV12: READ_Y is a plain 16-byte load; READ_UV loads 32
 *                   bytes from u_ptr and from v_ptr and keeps the low
 *                   byte of every 16-bit pair.
 * The shift-left / shift-right pairs clear the unwanted bytes, so the
 * saturating packs that follow cannot clip. `y`, `u`, `v`, `u_ptr` and
 * `v_ptr` must be in scope at the expansion site.
 */
250 #if YUV_FORMAT == YUV_FORMAT_420
251 
252 #define READ_Y(y_ptr) \
253  y = LOAD_SI128((const __m128i*)(y_ptr)); \
254 
255 #define READ_UV \
256  u = LOAD_SI128((const __m128i*)(u_ptr)); \
257  v = LOAD_SI128((const __m128i*)(v_ptr)); \
258 
259 #elif YUV_FORMAT == YUV_FORMAT_422
260 
261 #define READ_Y(y_ptr) \
262 { \
263  __m128i y1, y2; \
264  y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \
265  y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \
266  y = _mm_packus_epi16(y1, y2); \
267 }
268 
269 #define READ_UV \
270 { \
271  __m128i u1, u2, u3, u4, v1, v2, v3, v4; \
272  u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \
273  u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \
274  u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \
275  u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \
276  u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \
277  v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \
278  v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \
279  v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \
280  v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \
281  v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \
282 }
283 
284 #elif YUV_FORMAT == YUV_FORMAT_NV12
285 
286 #define READ_Y(y_ptr) \
287  y = LOAD_SI128((const __m128i*)(y_ptr)); \
288 
289 #define READ_UV \
290 { \
291  __m128i u1, u2, v1, v2; \
292  u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \
293  u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \
294  u = _mm_packus_epi16(u1, u2); \
295  v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \
296  v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \
297  v = _mm_packus_epi16(v1, v2); \
298 }
299 
300 #else
301 #error READ_UV unimplemented
302 #endif
303 
/*
 * YUV2RGB_32: convert a 32-pixel-wide strip of two lines (y_ptr1, y_ptr2)
 * that share one set of 16 U and 16 V samples.
 *
 * Chroma is read once with READ_UV and re-centred by adding -128. The
 * low 8 chroma samples drive the first 16 pixels of both lines, the high
 * 8 the last 16 pixels; the UV2RGB_16 result is saved in r/g/b_uv_16_*
 * so it can be reused for the second line. Each quarter is finished by
 * ADD_Y2RGB_16 and saturated to 8 bits with _mm_packus_epi16.
 *
 * Outputs (16 bytes each): r/g/b_8_11 and r/g/b_8_12 for the first and
 * last 16 pixels of line 1, r/g/b_8_21 and r/g/b_8_22 for line 2.
 *
 * The macro declares all of its temporaries without an enclosing block,
 * so it can be expanded only once per scope, and it needs `param`,
 * `y_ptr1`, `y_ptr2`, `u_ptr`, `v_ptr` and `y_pixel_stride` in scope.
 * Both READ_Y(y_ptr1...) and READ_Y(y_ptr2...) are always executed,
 * whether or not the caller later stores the second line.
 */
304 #define YUV2RGB_32 \
305  __m128i r_tmp, g_tmp, b_tmp; \
306  __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
307  __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
308  __m128i y_16_1, y_16_2; \
309  __m128i y, u, v, u_16, v_16; \
310  __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
311  __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
312  \
313  READ_UV \
314  \
315  /* process first 16 pixels of first line */\
316  u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \
317  v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \
318  u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
319  v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
320  \
321  UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
322  r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
323  r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
324  \
325  READ_Y(y_ptr1) \
326  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
327  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
328  \
329  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
330  \
331  r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
332  g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
333  b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
334  \
335  /* process first 16 pixels of second line */\
336  r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
337  r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
338  \
339  READ_Y(y_ptr2) \
340  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
341  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
342  \
343  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
344  \
345  r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
346  g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
347  b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
348  \
349  /* process last 16 pixels of first line */\
350  u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \
351  v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \
352  u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
353  v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
354  \
355  UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
356  r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
357  r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
358  \
359  READ_Y(y_ptr1+16*y_pixel_stride) \
360  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
361  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
362  \
363  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
364  \
365  r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
366  g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
367  b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
368  \
369  /* process last 16 pixels of second line */\
370  r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
371  r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
372  \
373  READ_Y(y_ptr2+16*y_pixel_stride) \
374  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
375  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
376  \
377  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
378  \
379  r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
380  g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
381  b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \
382  \
383 
384 
386  const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
387  uint8_t *RGB, uint32_t RGB_stride,
388  YCbCrType yuv_type)
389 {
390  const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
391 #if YUV_FORMAT == YUV_FORMAT_420
392  const int y_pixel_stride = 1;
393  const int uv_pixel_stride = 1;
394  const int uv_x_sample_interval = 2;
395  const int uv_y_sample_interval = 2;
396 #elif YUV_FORMAT == YUV_FORMAT_422
397  const int y_pixel_stride = 2;
398  const int uv_pixel_stride = 4;
399  const int uv_x_sample_interval = 2;
400  const int uv_y_sample_interval = 1;
401 #elif YUV_FORMAT == YUV_FORMAT_NV12
402  const int y_pixel_stride = 1;
403  const int uv_pixel_stride = 2;
404  const int uv_x_sample_interval = 2;
405  const int uv_y_sample_interval = 2;
406 #endif
407 #if RGB_FORMAT == RGB_FORMAT_RGB565
408  const int rgb_pixel_stride = 2;
409 #elif RGB_FORMAT == RGB_FORMAT_RGB24
410  const int rgb_pixel_stride = 3;
411 #elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
412  RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
413  const int rgb_pixel_stride = 4;
414 #else
415 #error Unknown RGB pixel size
416 #endif
417 
418  if (width >= 32) {
419  uint32_t xpos, ypos;
420  for(ypos=0; ypos<(height-(uv_y_sample_interval-1)); ypos+=uv_y_sample_interval)
421  {
422  const uint8_t *y_ptr1=Y+ypos*Y_stride,
423  *y_ptr2=Y+(ypos+1)*Y_stride,
424  *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
425  *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
426 
427  uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
428  *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
429 
430  for(xpos=0; xpos<(width-31); xpos+=32)
431  {
432  YUV2RGB_32
433  {
434  PACK_PIXEL
435  SAVE_LINE1
436  if (uv_y_sample_interval > 1)
437  {
438  SAVE_LINE2
439  }
440  }
441 
442  y_ptr1+=32*y_pixel_stride;
443  y_ptr2+=32*y_pixel_stride;
444  u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
445  v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
446  rgb_ptr1+=32*rgb_pixel_stride;
447  rgb_ptr2+=32*rgb_pixel_stride;
448  }
449  }
450 
451  /* Catch the last line, if needed */
452  if (uv_y_sample_interval == 2 && ypos == (height-1))
453  {
454  const uint8_t *y_ptr=Y+ypos*Y_stride,
455  *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
456  *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
457 
458  uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
459 
460  STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
461  }
462  }
463 
464  /* Catch the right column, if needed */
465  {
466  int converted = (width & ~31);
467  if (converted != width)
468  {
469  const uint8_t *y_ptr=Y+converted*y_pixel_stride,
470  *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
471  *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
472 
473  uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
474 
475  STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
476  }
477  }
478 }
479 
/* Undefine every template parameter and helper macro so this file can be
   included again with a different configuration. PACK_RGB565_32 is
   included here so it does not leak into the including translation unit
   like none of its sibling helpers do. */
#undef SSE_FUNCTION_NAME
#undef STD_FUNCTION_NAME
#undef YUV_FORMAT
#undef RGB_FORMAT
#undef SSE_ALIGNED
#undef LOAD_SI128
#undef SAVE_SI128
#undef UV2RGB_16
#undef ADD_Y2RGB_16
#undef PACK_RGB565_32
#undef PACK_RGB24_32_STEP1
#undef PACK_RGB24_32_STEP2
#undef PACK_RGB24_32
#undef PACK_RGBA_32
#undef PACK_PIXEL
#undef SAVE_LINE1
#undef SAVE_LINE2
#undef READ_Y
#undef READ_UV
#undef YUV2RGB_32
Definition: edid.h:20
#define uv_x_sample_interval
void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type)
#define SAVE_LINE2
#define y_pixel_stride
GLint GLint GLsizei width
Definition: SDL_opengl.h:1572
#define YUV2RGB_32
#define PACK_PIXEL
void STD_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type)
static const YUV2RGBParam YUV2RGB[3]
Definition: yuv_rgb.c:42
set set set set set set set set set set set set set set set set set set set set *set set set macro pixldst op &r &cond WK op &r &cond WK op &r &cond WK else op &m &cond &ia op &r &cond WK else op &m &cond &ia elseif elseif else error unsupported base if elseif elseif else error unsupported unaligned pixldst unaligned endm macro pixst base base else pixldst base endif endm macro PF base if bpp PF set rept prefetch_distance PF set OFFSET endr endif endm macro preload_leading_step2 base if bpp ifc DST PF PF else if bpp lsl PF PF lsl PF PF lsl PF PF PF else PF lsl PF lsl PF lsl PF endif SIZE macro preload_middle scratch_holds_offset if bpp if else PF PF endif endif endif endm macro preload_trailing base if bpp if bpp *pix_per_block PF PF lsl PF PF PF PF PF else PF lsl PF lsl PF PF PF PF PF base if bpp if narrow_case &&bpp<=dst_w_bpp) PF bic, WK0, base, #31 PF pld, [WK0] PF add, WK1, base, X, LSL #bpp_shift PF sub, WK1, WK1, #1 PF bic, WK1, WK1, #31 PF cmp, WK1, WK0 PF beq, 90f PF pld, [WK1]90:.else PF bic, WK0, base, #31 PF pld, [WK0] PF add, WK1, base, X, lsl #bpp_shift PF sub, WK1, WK1, #1 PF bic, WK1, WK1, #31 PF cmp, WK1, WK0 PF beq, 92f91:PF add, WK0, WK0, #32 PF cmp, WK0, WK1 PF pld, [WK0] PF bne, 91b92:.endif .endif.endm.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 .if decrementx sub &cond X, X, #8 *numbytes/dst_w_bpp .endif process_tail cond, numbytes, firstreg .if !((flags) &FLAG_PROCESS_DOES_STORE) pixst cond, numbytes, firstreg, DST .endif.endm.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx .if(flags) &FLAG_BRANCH_OVER .ifc cond, mi bpl 100f .endif .ifc cond, cs bcc 100f .endif .ifc cond, ne beq 100f .endif conditional_process1_helper, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, 
decrementx100:.else conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx .endif.endm.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx .if(flags) &(FLAG_DST_READWRITE|FLAG_BRANCH_OVER|FLAG_PROCESS_CORRUPTS_PSR|FLAG_PROCESS_DOES_STORE) test conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx .if(flags) &FLAG_PROCESS_CORRUPTS_PSR test .endif conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx .else test process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 .if decrementx sub &cond1 X, X, #8 *numbytes1/dst_w_bpp sub &cond2 X, X, #8 *numbytes2/dst_w_bpp .endif process_tail cond1, numbytes1, firstreg1 process_tail cond2, numbytes2, firstreg2 pixst cond1, numbytes1, firstreg1, DST pixst cond2, numbytes2, firstreg2, DST .endif.endm.macro test_bits_1_0_ptr .if(flags) &FLAG_PROCESS_CORRUPTS_WK0 movs SCRATCH, X, lsl #32-1 .else movs SCRATCH, WK0, lsl #32-1 .endif.endm.macro test_bits_3_2_ptr .if(flags) &FLAG_PROCESS_CORRUPTS_WK0 movs SCRATCH, X, lsl #32-3 .else movs SCRATCH, WK0, lsl #32-3 .endif.endm.macro leading_15bytes process_head, process_tail .set DECREMENT_X, 1 .if(flags) &FLAG_PROCESS_CORRUPTS_WK0 .set DECREMENT_X, 0 sub X, X, WK0, lsr #dst_bpp_shift str X, [sp, #LINE_SAVED_REG_COUNT *4] mov X, WK0 .endif .if dst_w_bpp==8 conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X .elseif dst_w_bpp==16 test_bits_1_0_ptr conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X .endif conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X 
.if(flags) &FLAG_PROCESS_CORRUPTS_WK0 ldr X, [sp, #LINE_SAVED_REG_COUNT *4] .endif.endm.macro test_bits_3_2_pix movs SCRATCH, X, lsl #dst_bpp_shift+32-3.endm.macro test_bits_1_0_pix .if dst_w_bpp==8 movs SCRATCH, X, lsl #dst_bpp_shift+32-1 .else movs SCRATCH, X, lsr #1 .endif.endm.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 .if dst_w_bpp==16 test_bits_1_0_pix conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 .elseif dst_w_bpp==8 conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 .endif.endm.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment110:.set SUBBLOCK, 0 .rept pix_per_block *dst_w_bpp/128 process_head, 16, 0, unaligned_src, unaligned_mask, 1 .if(src_bpp > 0) &&(mask_bpp==0) &&((flags) &FLAG_PROCESS_PRESERVES_SCRATCH) preload_middle src_bpp, SRC, 1 .elseif(src_bpp==0) &&(mask_bpp > 0) &&((flags) &FLAG_PROCESS_PRESERVES_SCRATCH) preload_middle mask_bpp, MASK, 1 .else preload_middle src_bpp, SRC, 0 preload_middle mask_bpp, MASK, 0 .endif .if(dst_r_bpp > 0) &&((SUBBLOCK % 2)==0) &&(((flags) &FLAG_NO_PRELOAD_DST)==0) PF pld, [DST, #32 *prefetch_distance - dst_alignment] .endif process_tail, 16, 0 .if !((flags) &FLAG_PROCESS_DOES_STORE) pixst, 16, 0, DST .endif .set SUBBLOCK, SUBBLOCK+1 .endr subs X, X, #pix_per_block bhs 110b.endm.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask .if dst_r_bpp > tst bne process_inner_loop DST_PRELOAD_BIAS endif preload_trailing SRC preload_trailing MASK DST endif add medium_case_inner_loop_and_trailing_pixels unaligned_mask endm macro medium_case_inner_loop_and_trailing_pixels DST endif subs bhs tst beq exit_label trailing_15bytes 
unaligned_mask endm macro narrow_case_inner_loop_and_trailing_pixels unaligned_mask tst conditional_process1 trailing_15bytes unaligned_mask endm macro switch_on_alignment exit_label if bne endif if bne endif action if endif if bne endif action if endif endif endm macro end_of_line last_one if SINGLE_SCANLINE ifc b endif else if vars_spilled word LINE_SAVED_REGS endif subs Y
YCbCrType
Definition: yuv_rgb.h:22
#define uv_pixel_stride
unsigned char uint8_t
unsigned int uint32_t
GLint GLint GLsizei GLsizei height
Definition: SDL_opengl.h:1572
#define V(value)
Definition: yuv_rgb.c:35
GLfloat param
#define uv_y_sample_interval
#define SAVE_LINE1