SDL  2.0
SDL_blit_A.c
Go to the documentation of this file.
1 /*
2  Simple DirectMedia Layer
3  Copyright (C) 1997-2018 Sam Lantinga <slouken@libsdl.org>
4 
5  This software is provided 'as-is', without any express or implied
6  warranty. In no event will the authors be held liable for any damages
7  arising from the use of this software.
8 
9  Permission is granted to anyone to use this software for any purpose,
10  including commercial applications, and to alter it and redistribute it
11  freely, subject to the following restrictions:
12 
13  1. The origin of this software must not be misrepresented; you must not
14  claim that you wrote the original software. If you use this software
15  in a product, an acknowledgment in the product documentation would be
16  appreciated but is not required.
17  2. Altered source versions must be plainly marked as such, and must not be
18  misrepresented as being the original software.
19  3. This notice may not be removed or altered from any source distribution.
20 */
21 #include "../SDL_internal.h"
22 
23 #include "SDL_video.h"
24 #include "SDL_blit.h"
25 
26 /* Functions to perform alpha blended blitting */
27 
28 /* N->1 blending with per-surface alpha */
29 static void
31 {
32  int width = info->dst_w;
33  int height = info->dst_h;
34  Uint8 *src = info->src;
35  int srcskip = info->src_skip;
36  Uint8 *dst = info->dst;
37  int dstskip = info->dst_skip;
38  Uint8 *palmap = info->table;
39  SDL_PixelFormat *srcfmt = info->src_fmt;
40  SDL_PixelFormat *dstfmt = info->dst_fmt;
41  int srcbpp = srcfmt->BytesPerPixel;
42  Uint32 Pixel;
43  unsigned sR, sG, sB;
44  unsigned dR, dG, dB;
45  const unsigned A = info->a;
46 
47  while (height--) {
48  /* *INDENT-OFF* */
50  {
51  DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
52  dR = dstfmt->palette->colors[*dst].r;
53  dG = dstfmt->palette->colors[*dst].g;
54  dB = dstfmt->palette->colors[*dst].b;
55  ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
56  dR &= 0xff;
57  dG &= 0xff;
58  dB &= 0xff;
59  /* Pack RGB into 8bit pixel */
60  if ( palmap == NULL ) {
61  *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
62  } else {
63  *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
64  }
65  dst++;
66  src += srcbpp;
67  },
68  width);
69  /* *INDENT-ON* */
70  src += srcskip;
71  dst += dstskip;
72  }
73 }
74 
75 /* N->1 blending with pixel alpha */
76 static void
78 {
79  int width = info->dst_w;
80  int height = info->dst_h;
81  Uint8 *src = info->src;
82  int srcskip = info->src_skip;
83  Uint8 *dst = info->dst;
84  int dstskip = info->dst_skip;
85  Uint8 *palmap = info->table;
86  SDL_PixelFormat *srcfmt = info->src_fmt;
87  SDL_PixelFormat *dstfmt = info->dst_fmt;
88  int srcbpp = srcfmt->BytesPerPixel;
89  Uint32 Pixel;
90  unsigned sR, sG, sB, sA;
91  unsigned dR, dG, dB;
92 
93  while (height--) {
94  /* *INDENT-OFF* */
96  {
97  DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
98  dR = dstfmt->palette->colors[*dst].r;
99  dG = dstfmt->palette->colors[*dst].g;
100  dB = dstfmt->palette->colors[*dst].b;
101  ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
102  dR &= 0xff;
103  dG &= 0xff;
104  dB &= 0xff;
105  /* Pack RGB into 8bit pixel */
106  if ( palmap == NULL ) {
107  *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
108  } else {
109  *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
110  }
111  dst++;
112  src += srcbpp;
113  },
114  width);
115  /* *INDENT-ON* */
116  src += srcskip;
117  dst += dstskip;
118  }
119 }
120 
121 /* colorkeyed N->1 blending with per-surface alpha */
122 static void
124 {
125  int width = info->dst_w;
126  int height = info->dst_h;
127  Uint8 *src = info->src;
128  int srcskip = info->src_skip;
129  Uint8 *dst = info->dst;
130  int dstskip = info->dst_skip;
131  Uint8 *palmap = info->table;
132  SDL_PixelFormat *srcfmt = info->src_fmt;
133  SDL_PixelFormat *dstfmt = info->dst_fmt;
134  int srcbpp = srcfmt->BytesPerPixel;
135  Uint32 ckey = info->colorkey;
136  Uint32 Pixel;
137  unsigned sR, sG, sB;
138  unsigned dR, dG, dB;
139  const unsigned A = info->a;
140 
141  while (height--) {
142  /* *INDENT-OFF* */
143  DUFFS_LOOP(
144  {
145  DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
146  if ( Pixel != ckey ) {
147  dR = dstfmt->palette->colors[*dst].r;
148  dG = dstfmt->palette->colors[*dst].g;
149  dB = dstfmt->palette->colors[*dst].b;
150  ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
151  dR &= 0xff;
152  dG &= 0xff;
153  dB &= 0xff;
154  /* Pack RGB into 8bit pixel */
155  if ( palmap == NULL ) {
156  *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
157  } else {
158  *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
159  }
160  }
161  dst++;
162  src += srcbpp;
163  },
164  width);
165  /* *INDENT-ON* */
166  src += srcskip;
167  dst += dstskip;
168  }
169 }
170 
171 #ifdef __MMX__
172 
173 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
174 static void
175 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
176 {
177  int width = info->dst_w;
178  int height = info->dst_h;
179  Uint32 *srcp = (Uint32 *) info->src;
180  int srcskip = info->src_skip >> 2;
181  Uint32 *dstp = (Uint32 *) info->dst;
182  int dstskip = info->dst_skip >> 2;
183  Uint32 dalpha = info->dst_fmt->Amask;
184 
185  __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
186 
187  hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
188  lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
189  dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
190 
191  while (height--) {
192  int n = width;
193  if (n & 1) {
194  Uint32 s = *srcp++;
195  Uint32 d = *dstp;
196  *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
197  + (s & d & 0x00010101)) | dalpha;
198  n--;
199  }
200 
201  for (n >>= 1; n > 0; --n) {
202  dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
203  dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
204 
205  src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
206  src2 = src1; /* 2 x src -> src2(ARGBARGB) */
207 
208  dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
209  src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
210  src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
211  src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
212 
213  dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
214  dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
215  dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
216  dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
217 
218  *(__m64 *) dstp = dst1; /* dst1 -> 2 x dst pixels */
219  dstp += 2;
220  srcp += 2;
221  }
222 
223  srcp += srcskip;
224  dstp += dstskip;
225  }
226  _mm_empty();
227 }
228 
229 /* fast RGB888->(A)RGB888 blending with surface alpha */
230 static void
231 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
232 {
233  SDL_PixelFormat *df = info->dst_fmt;
234  Uint32 chanmask;
235  unsigned alpha = info->a;
236 
237  if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
238  /* only call a128 version when R,G,B occupy lower bits */
239  BlitRGBtoRGBSurfaceAlpha128MMX(info);
240  } else {
241  int width = info->dst_w;
242  int height = info->dst_h;
243  Uint32 *srcp = (Uint32 *) info->src;
244  int srcskip = info->src_skip >> 2;
245  Uint32 *dstp = (Uint32 *) info->dst;
246  int dstskip = info->dst_skip >> 2;
247  Uint32 dalpha = df->Amask;
248  Uint32 amult;
249 
250  __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
251 
252  mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
253  /* form the alpha mult */
254  amult = alpha | (alpha << 8);
255  amult = amult | (amult << 16);
256  chanmask =
257  (0xff << df->Rshift) | (0xff << df->
258  Gshift) | (0xff << df->Bshift);
259  mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
260  mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
261  /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
262  dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
263 
264  while (height--) {
265  int n = width;
266  if (n & 1) {
267  /* One Pixel Blend */
268  src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
269  src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
270 
271  dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
272  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
273 
274  src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
275  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
276  src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
277  dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
278 
279  dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
280  dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
281  *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
282 
283  ++srcp;
284  ++dstp;
285 
286  n--;
287  }
288 
289  for (n >>= 1; n > 0; --n) {
290  /* Two Pixels Blend */
291  src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
292  src2 = src1; /* 2 x src -> src2(ARGBARGB) */
293  src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
294  src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
295 
296  dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
297  dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
298  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
299  dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
300 
301  src1 = _mm_sub_pi16(src1, dst1); /* src1 - dst1 -> src1 */
302  src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
303  src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
304  dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
305 
306  src2 = _mm_sub_pi16(src2, dst2); /* src2 - dst2 -> src2 */
307  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
308  src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
309  dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
310 
311  dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
312  dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
313 
314  *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
315 
316  srcp += 2;
317  dstp += 2;
318  }
319  srcp += srcskip;
320  dstp += dstskip;
321  }
322  _mm_empty();
323  }
324 }
325 
326 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
327 static void
328 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
329 {
330  int width = info->dst_w;
331  int height = info->dst_h;
332  Uint32 *srcp = (Uint32 *) info->src;
333  int srcskip = info->src_skip >> 2;
334  Uint32 *dstp = (Uint32 *) info->dst;
335  int dstskip = info->dst_skip >> 2;
336  SDL_PixelFormat *sf = info->src_fmt;
337  Uint32 amask = sf->Amask;
338  Uint32 ashift = sf->Ashift;
339  Uint64 multmask, multmask2;
340 
341  __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
342 
343  mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
344  multmask = 0x00FF;
345  multmask <<= (ashift * 2);
346  multmask2 = 0x00FF00FF00FF00FFULL;
347 
348  while (height--) {
349  /* *INDENT-OFF* */
350  DUFFS_LOOP4({
351  Uint32 alpha = *srcp & amask;
352  if (alpha == 0) {
353  /* do nothing */
354  } else if (alpha == amask) {
355  *dstp = *srcp;
356  } else {
357  src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
358  src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
359 
360  dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
361  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
362 
363  mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
364  mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
365  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
366  mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
367  mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
368  mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */
369 
370  /* blend */
371  src1 = _mm_mullo_pi16(src1, mm_alpha);
372  src1 = _mm_srli_pi16(src1, 8);
373  dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
374  dst1 = _mm_srli_pi16(dst1, 8);
375  dst1 = _mm_add_pi16(src1, dst1);
376  dst1 = _mm_packs_pu16(dst1, mm_zero);
377 
378  *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
379  }
380  ++srcp;
381  ++dstp;
382  }, width);
383  /* *INDENT-ON* */
384  srcp += srcskip;
385  dstp += dstskip;
386  }
387  _mm_empty();
388 }
389 
390 #endif /* __MMX__ */
391 
392 #if SDL_ARM_SIMD_BLITTERS
393 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
394 
395 static void
396 BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
397 {
398  int32_t width = info->dst_w;
399  int32_t height = info->dst_h;
400  uint16_t *dstp = (uint16_t *)info->dst;
401  int32_t dststride = width + (info->dst_skip >> 1);
402  uint32_t *srcp = (uint32_t *)info->src;
403  int32_t srcstride = width + (info->src_skip >> 2);
404 
405  BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
406 }
407 
408 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
409 
410 static void
411 BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
412 {
413  int32_t width = info->dst_w;
414  int32_t height = info->dst_h;
415  uint32_t *dstp = (uint32_t *)info->dst;
416  int32_t dststride = width + (info->dst_skip >> 2);
417  uint32_t *srcp = (uint32_t *)info->src;
418  int32_t srcstride = width + (info->src_skip >> 2);
419 
420  BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
421 }
422 #endif
423 
424 #if SDL_ARM_NEON_BLITTERS
425 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
426 
427 static void
428 BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
429 {
430  int32_t width = info->dst_w;
431  int32_t height = info->dst_h;
432  uint16_t *dstp = (uint16_t *)info->dst;
433  int32_t dststride = width + (info->dst_skip >> 1);
434  uint32_t *srcp = (uint32_t *)info->src;
435  int32_t srcstride = width + (info->src_skip >> 2);
436 
437  BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
438 }
439 
440 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
441 
442 static void
443 BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
444 {
445  int32_t width = info->dst_w;
446  int32_t height = info->dst_h;
447  uint32_t *dstp = (uint32_t *)info->dst;
448  int32_t dststride = width + (info->dst_skip >> 2);
449  uint32_t *srcp = (uint32_t *)info->src;
450  int32_t srcstride = width + (info->src_skip >> 2);
451 
452  BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
453 }
454 #endif
455 
456 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
457 static void
459 {
460  int width = info->dst_w;
461  int height = info->dst_h;
462  Uint32 *srcp = (Uint32 *) info->src;
463  int srcskip = info->src_skip >> 2;
464  Uint32 *dstp = (Uint32 *) info->dst;
465  int dstskip = info->dst_skip >> 2;
466 
467  while (height--) {
468  /* *INDENT-OFF* */
469  DUFFS_LOOP4({
470  Uint32 s = *srcp++;
471  Uint32 d = *dstp;
472  *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
473  + (s & d & 0x00010101)) | 0xff000000;
474  }, width);
475  /* *INDENT-ON* */
476  srcp += srcskip;
477  dstp += dstskip;
478  }
479 }
480 
481 /* fast RGB888->(A)RGB888 blending with surface alpha */
482 static void
484 {
485  unsigned alpha = info->a;
486  if (alpha == 128) {
488  } else {
489  int width = info->dst_w;
490  int height = info->dst_h;
491  Uint32 *srcp = (Uint32 *) info->src;
492  int srcskip = info->src_skip >> 2;
493  Uint32 *dstp = (Uint32 *) info->dst;
494  int dstskip = info->dst_skip >> 2;
495  Uint32 s;
496  Uint32 d;
497  Uint32 s1;
498  Uint32 d1;
499 
500  while (height--) {
501  /* *INDENT-OFF* */
502  DUFFS_LOOP4({
503  s = *srcp;
504  d = *dstp;
505  s1 = s & 0xff00ff;
506  d1 = d & 0xff00ff;
507  d1 = (d1 + ((s1 - d1) * alpha >> 8))
508  & 0xff00ff;
509  s &= 0xff00;
510  d &= 0xff00;
511  d = (d + ((s - d) * alpha >> 8)) & 0xff00;
512  *dstp = d1 | d | 0xff000000;
513  ++srcp;
514  ++dstp;
515  }, width);
516  /* *INDENT-ON* */
517  srcp += srcskip;
518  dstp += dstskip;
519  }
520  }
521 }
522 
523 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
524 static void
526 {
527  int width = info->dst_w;
528  int height = info->dst_h;
529  Uint32 *srcp = (Uint32 *) info->src;
530  int srcskip = info->src_skip >> 2;
531  Uint32 *dstp = (Uint32 *) info->dst;
532  int dstskip = info->dst_skip >> 2;
533 
534  while (height--) {
535  /* *INDENT-OFF* */
536  DUFFS_LOOP4({
537  Uint32 dalpha;
538  Uint32 d;
539  Uint32 s1;
540  Uint32 d1;
541  Uint32 s = *srcp;
542  Uint32 alpha = s >> 24;
543  /* FIXME: Here we special-case opaque alpha since the
544  compositioning used (>>8 instead of /255) doesn't handle
545  it correctly. Also special-case alpha=0 for speed?
546  Benchmark this! */
547  if (alpha) {
548  if (alpha == SDL_ALPHA_OPAQUE) {
549  *dstp = *srcp;
550  } else {
551  /*
552  * take out the middle component (green), and process
553  * the other two in parallel. One multiply less.
554  */
555  d = *dstp;
556  dalpha = d >> 24;
557  s1 = s & 0xff00ff;
558  d1 = d & 0xff00ff;
559  d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
560  s &= 0xff00;
561  d &= 0xff00;
562  d = (d + ((s - d) * alpha >> 8)) & 0xff00;
563  dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
564  *dstp = d1 | d | (dalpha << 24);
565  }
566  }
567  ++srcp;
568  ++dstp;
569  }, width);
570  /* *INDENT-ON* */
571  srcp += srcskip;
572  dstp += dstskip;
573  }
574 }
575 
576 #ifdef __3dNOW__
577 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
578 static void
579 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
580 {
581  int width = info->dst_w;
582  int height = info->dst_h;
583  Uint32 *srcp = (Uint32 *) info->src;
584  int srcskip = info->src_skip >> 2;
585  Uint32 *dstp = (Uint32 *) info->dst;
586  int dstskip = info->dst_skip >> 2;
587  SDL_PixelFormat *sf = info->src_fmt;
588  Uint32 amask = sf->Amask;
589  Uint32 ashift = sf->Ashift;
590  Uint64 multmask, multmask2;
591 
592  __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
593 
594  mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
595  multmask = 0x00FF;
596  multmask <<= (ashift * 2);
597  multmask2 = 0x00FF00FF00FF00FFULL;
598 
599  while (height--) {
600  /* *INDENT-OFF* */
601  DUFFS_LOOP4({
602  Uint32 alpha;
603 
604  _m_prefetch(srcp + 16);
605  _m_prefetch(dstp + 16);
606 
607  alpha = *srcp & amask;
608  if (alpha == 0) {
609  /* do nothing */
610  } else if (alpha == amask) {
611  *dstp = *srcp;
612  } else {
613  src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
614  src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
615 
616  dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
617  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
618 
619  mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
620  mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
621  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
622  mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
623  mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
624  mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */
625 
626 
627  /* blend */
628  src1 = _mm_mullo_pi16(src1, mm_alpha);
629  src1 = _mm_srli_pi16(src1, 8);
630  dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
631  dst1 = _mm_srli_pi16(dst1, 8);
632  dst1 = _mm_add_pi16(src1, dst1);
633  dst1 = _mm_packs_pu16(dst1, mm_zero);
634 
635  *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
636  }
637  ++srcp;
638  ++dstp;
639  }, width);
640  /* *INDENT-ON* */
641  srcp += srcskip;
642  dstp += dstskip;
643  }
644  _mm_empty();
645 }
646 
647 #endif /* __3dNOW__ */
648 
649 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
650 
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects the bits of each colour field that survive the halving
 * (every field's lowest bit cleared); the bits outside `mask` are restored
 * only where both pixels have them set, which supplies the carry the
 * masked sum would otherwise lose.  Arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define BLEND16_50(d, s, mask)                                          \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) +                         \
     ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, held in one 32-bit word, at 50%.
 *
 * The 16-bit `mask` is replicated into both halves of the word; each
 * operand is halved before the addition so the sum cannot carry out of the
 * word.  Arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#define BLEND2x16_50(d, s, mask)                                        \
    ((((s) & ((mask) | (mask) << 16)) >> 1) +                           \
     (((d) & ((mask) | (mask) << 16)) >> 1) +                           \
     ((s) & (d) & (~((mask) | (mask) << 16))))
659 
660 static void
662 {
663  int width = info->dst_w;
664  int height = info->dst_h;
665  Uint16 *srcp = (Uint16 *) info->src;
666  int srcskip = info->src_skip >> 1;
667  Uint16 *dstp = (Uint16 *) info->dst;
668  int dstskip = info->dst_skip >> 1;
669 
670  while (height--) {
671  if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
672  /*
673  * Source and destination not aligned, pipeline it.
674  * This is mostly a win for big blits but no loss for
675  * small ones
676  */
677  Uint32 prev_sw;
678  int w = width;
679 
680  /* handle odd destination */
681  if ((uintptr_t) dstp & 2) {
682  Uint16 d = *dstp, s = *srcp;
683  *dstp = BLEND16_50(d, s, mask);
684  dstp++;
685  srcp++;
686  w--;
687  }
688  srcp++; /* srcp is now 32-bit aligned */
689 
690  /* bootstrap pipeline with first halfword */
691  prev_sw = ((Uint32 *) srcp)[-1];
692 
693  while (w > 1) {
694  Uint32 sw, dw, s;
695  sw = *(Uint32 *) srcp;
696  dw = *(Uint32 *) dstp;
697 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
698  s = (prev_sw << 16) + (sw >> 16);
699 #else
700  s = (prev_sw >> 16) + (sw << 16);
701 #endif
702  prev_sw = sw;
703  *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
704  dstp += 2;
705  srcp += 2;
706  w -= 2;
707  }
708 
709  /* final pixel if any */
710  if (w) {
711  Uint16 d = *dstp, s;
712 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
713  s = (Uint16) prev_sw;
714 #else
715  s = (Uint16) (prev_sw >> 16);
716 #endif
717  *dstp = BLEND16_50(d, s, mask);
718  srcp++;
719  dstp++;
720  }
721  srcp += srcskip - 1;
722  dstp += dstskip;
723  } else {
724  /* source and destination are aligned */
725  int w = width;
726 
727  /* first odd pixel? */
728  if ((uintptr_t) srcp & 2) {
729  Uint16 d = *dstp, s = *srcp;
730  *dstp = BLEND16_50(d, s, mask);
731  srcp++;
732  dstp++;
733  w--;
734  }
735  /* srcp and dstp are now 32-bit aligned */
736 
737  while (w > 1) {
738  Uint32 sw = *(Uint32 *) srcp;
739  Uint32 dw = *(Uint32 *) dstp;
740  *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
741  srcp += 2;
742  dstp += 2;
743  w -= 2;
744  }
745 
746  /* last odd pixel? */
747  if (w) {
748  Uint16 d = *dstp, s = *srcp;
749  *dstp = BLEND16_50(d, s, mask);
750  srcp++;
751  dstp++;
752  }
753  srcp += srcskip;
754  dstp += dstskip;
755  }
756  }
757 }
758 
759 #ifdef __MMX__
760 
761 /* fast RGB565->RGB565 blending with surface alpha */
762 static void
763 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
764 {
765  unsigned alpha = info->a;
766  if (alpha == 128) {
767  Blit16to16SurfaceAlpha128(info, 0xf7de);
768  } else {
769  int width = info->dst_w;
770  int height = info->dst_h;
771  Uint16 *srcp = (Uint16 *) info->src;
772  int srcskip = info->src_skip >> 1;
773  Uint16 *dstp = (Uint16 *) info->dst;
774  int dstskip = info->dst_skip >> 1;
775  Uint32 s, d;
776 
777  __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
778 
779  alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
780  mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
781  alpha >>= 3; /* downscale alpha to 5 bits */
782 
783  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
784  mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
785  /* position alpha to allow for mullo and mulhi on diff channels
786  to reduce the number of operations */
787  mm_alpha = _mm_slli_si64(mm_alpha, 3);
788 
789  /* Setup the 565 color channel masks */
790  gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
791  bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
792 
793  while (height--) {
794  /* *INDENT-OFF* */
796  {
797  s = *srcp++;
798  d = *dstp;
799  /*
800  * shift out the middle component (green) to
801  * the high 16 bits, and process all three RGB
802  * components at the same time.
803  */
804  s = (s | s << 16) & 0x07e0f81f;
805  d = (d | d << 16) & 0x07e0f81f;
806  d += (s - d) * alpha >> 5;
807  d &= 0x07e0f81f;
808  *dstp++ = (Uint16)(d | d >> 16);
809  },{
810  s = *srcp++;
811  d = *dstp;
812  /*
813  * shift out the middle component (green) to
814  * the high 16 bits, and process all three RGB
815  * components at the same time.
816  */
817  s = (s | s << 16) & 0x07e0f81f;
818  d = (d | d << 16) & 0x07e0f81f;
819  d += (s - d) * alpha >> 5;
820  d &= 0x07e0f81f;
821  *dstp++ = (Uint16)(d | d >> 16);
822  s = *srcp++;
823  d = *dstp;
824  /*
825  * shift out the middle component (green) to
826  * the high 16 bits, and process all three RGB
827  * components at the same time.
828  */
829  s = (s | s << 16) & 0x07e0f81f;
830  d = (d | d << 16) & 0x07e0f81f;
831  d += (s - d) * alpha >> 5;
832  d &= 0x07e0f81f;
833  *dstp++ = (Uint16)(d | d >> 16);
834  },{
835  src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
836  dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
837 
838  /* red */
839  src2 = src1;
840  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
841 
842  dst2 = dst1;
843  dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
844 
845  /* blend */
846  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
847  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
848  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
849  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
850  dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
851 
852  mm_res = dst2; /* RED -> mm_res */
853 
854  /* green -- process the bits in place */
855  src2 = src1;
856  src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
857 
858  dst2 = dst1;
859  dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
860 
861  /* blend */
862  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
863  src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
864  src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
865  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
866 
867  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
868 
869  /* blue */
870  src2 = src1;
871  src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
872 
873  dst2 = dst1;
874  dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
875 
876  /* blend */
877  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
878  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
879  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
880  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
881  dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
882 
883  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
884 
885  *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
886 
887  srcp += 4;
888  dstp += 4;
889  }, width);
890  /* *INDENT-ON* */
891  srcp += srcskip;
892  dstp += dstskip;
893  }
894  _mm_empty();
895  }
896 }
897 
898 /* fast RGB555->RGB555 blending with surface alpha */
899 static void
900 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
901 {
902  unsigned alpha = info->a;
903  if (alpha == 128) {
904  Blit16to16SurfaceAlpha128(info, 0xfbde);
905  } else {
906  int width = info->dst_w;
907  int height = info->dst_h;
908  Uint16 *srcp = (Uint16 *) info->src;
909  int srcskip = info->src_skip >> 1;
910  Uint16 *dstp = (Uint16 *) info->dst;
911  int dstskip = info->dst_skip >> 1;
912  Uint32 s, d;
913 
914  __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
915 
916  alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
917  mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
918  alpha >>= 3; /* downscale alpha to 5 bits */
919 
920  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
921  mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
922  /* position alpha to allow for mullo and mulhi on diff channels
923  to reduce the number of operations */
924  mm_alpha = _mm_slli_si64(mm_alpha, 3);
925 
926  /* Setup the 555 color channel masks */
927  rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
928  gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
929  bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
930 
931  while (height--) {
932  /* *INDENT-OFF* */
934  {
935  s = *srcp++;
936  d = *dstp;
937  /*
938  * shift out the middle component (green) to
939  * the high 16 bits, and process all three RGB
940  * components at the same time.
941  */
942  s = (s | s << 16) & 0x03e07c1f;
943  d = (d | d << 16) & 0x03e07c1f;
944  d += (s - d) * alpha >> 5;
945  d &= 0x03e07c1f;
946  *dstp++ = (Uint16)(d | d >> 16);
947  },{
948  s = *srcp++;
949  d = *dstp;
950  /*
951  * shift out the middle component (green) to
952  * the high 16 bits, and process all three RGB
953  * components at the same time.
954  */
955  s = (s | s << 16) & 0x03e07c1f;
956  d = (d | d << 16) & 0x03e07c1f;
957  d += (s - d) * alpha >> 5;
958  d &= 0x03e07c1f;
959  *dstp++ = (Uint16)(d | d >> 16);
960  s = *srcp++;
961  d = *dstp;
962  /*
963  * shift out the middle component (green) to
964  * the high 16 bits, and process all three RGB
965  * components at the same time.
966  */
967  s = (s | s << 16) & 0x03e07c1f;
968  d = (d | d << 16) & 0x03e07c1f;
969  d += (s - d) * alpha >> 5;
970  d &= 0x03e07c1f;
971  *dstp++ = (Uint16)(d | d >> 16);
972  },{
973  src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
974  dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
975 
976  /* red -- process the bits in place */
977  src2 = src1;
978  src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
979 
980  dst2 = dst1;
981  dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
982 
983  /* blend */
984  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
985  src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
986  src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
987  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
988  dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
989 
990  mm_res = dst2; /* RED -> mm_res */
991 
992  /* green -- process the bits in place */
993  src2 = src1;
994  src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
995 
996  dst2 = dst1;
997  dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
998 
999  /* blend */
1000  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
1001  src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
1002  src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
1003  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
1004 
1005  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
1006 
1007  /* blue */
1008  src2 = src1; /* src -> src2 */
1009  src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
1010 
1011  dst2 = dst1; /* dst -> dst2 */
1012  dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
1013 
1014  /* blend */
1015  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
1016  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
1017  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
1018  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
1019  dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
1020 
1021  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
1022 
1023  *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
1024 
1025  srcp += 4;
1026  dstp += 4;
1027  }, width);
1028  /* *INDENT-ON* */
1029  srcp += srcskip;
1030  dstp += dstskip;
1031  }
1032  _mm_empty();
1033  }
1034 }
1035 
1036 #endif /* __MMX__ */
1037 
1038 /* fast RGB565->RGB565 blending with surface alpha */
1039 static void
1041 {
1042  unsigned alpha = info->a;
1043  if (alpha == 128) {
1044  Blit16to16SurfaceAlpha128(info, 0xf7de);
1045  } else {
1046  int width = info->dst_w;
1047  int height = info->dst_h;
1048  Uint16 *srcp = (Uint16 *) info->src;
1049  int srcskip = info->src_skip >> 1;
1050  Uint16 *dstp = (Uint16 *) info->dst;
1051  int dstskip = info->dst_skip >> 1;
1052  alpha >>= 3; /* downscale alpha to 5 bits */
1053 
1054  while (height--) {
1055  /* *INDENT-OFF* */
1056  DUFFS_LOOP4({
1057  Uint32 s = *srcp++;
1058  Uint32 d = *dstp;
1059  /*
1060  * shift out the middle component (green) to
1061  * the high 16 bits, and process all three RGB
1062  * components at the same time.
1063  */
1064  s = (s | s << 16) & 0x07e0f81f;
1065  d = (d | d << 16) & 0x07e0f81f;
1066  d += (s - d) * alpha >> 5;
1067  d &= 0x07e0f81f;
1068  *dstp++ = (Uint16)(d | d >> 16);
1069  }, width);
1070  /* *INDENT-ON* */
1071  srcp += srcskip;
1072  dstp += dstskip;
1073  }
1074  }
1075 }
1076 
1077 /* fast RGB555->RGB555 blending with surface alpha */
1078 static void
1080 {
1081  unsigned alpha = info->a; /* downscale alpha to 5 bits */
1082  if (alpha == 128) {
1083  Blit16to16SurfaceAlpha128(info, 0xfbde);
1084  } else {
1085  int width = info->dst_w;
1086  int height = info->dst_h;
1087  Uint16 *srcp = (Uint16 *) info->src;
1088  int srcskip = info->src_skip >> 1;
1089  Uint16 *dstp = (Uint16 *) info->dst;
1090  int dstskip = info->dst_skip >> 1;
1091  alpha >>= 3; /* downscale alpha to 5 bits */
1092 
1093  while (height--) {
1094  /* *INDENT-OFF* */
1095  DUFFS_LOOP4({
1096  Uint32 s = *srcp++;
1097  Uint32 d = *dstp;
1098  /*
1099  * shift out the middle component (green) to
1100  * the high 16 bits, and process all three RGB
1101  * components at the same time.
1102  */
1103  s = (s | s << 16) & 0x03e07c1f;
1104  d = (d | d << 16) & 0x03e07c1f;
1105  d += (s - d) * alpha >> 5;
1106  d &= 0x03e07c1f;
1107  *dstp++ = (Uint16)(d | d >> 16);
1108  }, width);
1109  /* *INDENT-ON* */
1110  srcp += srcskip;
1111  dstp += dstskip;
1112  }
1113  }
1114 }
1115 
1116 /* fast ARGB8888->RGB565 blending with pixel alpha */
1117 static void
1119 {
1120  int width = info->dst_w;
1121  int height = info->dst_h;
1122  Uint32 *srcp = (Uint32 *) info->src;
1123  int srcskip = info->src_skip >> 2;
1124  Uint16 *dstp = (Uint16 *) info->dst;
1125  int dstskip = info->dst_skip >> 1;
1126 
1127  while (height--) {
1128  /* *INDENT-OFF* */
1129  DUFFS_LOOP4({
1130  Uint32 s = *srcp;
1131  unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
1132  /* FIXME: Here we special-case opaque alpha since the
1133  compositioning used (>>8 instead of /255) doesn't handle
1134  it correctly. Also special-case alpha=0 for speed?
1135  Benchmark this! */
1136  if(alpha) {
1137  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1138  *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
1139  } else {
1140  Uint32 d = *dstp;
1141  /*
1142  * convert source and destination to G0RAB65565
1143  * and blend all components at the same time
1144  */
1145  s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
1146  + (s >> 3 & 0x1f);
1147  d = (d | d << 16) & 0x07e0f81f;
1148  d += (s - d) * alpha >> 5;
1149  d &= 0x07e0f81f;
1150  *dstp = (Uint16)(d | d >> 16);
1151  }
1152  }
1153  srcp++;
1154  dstp++;
1155  }, width);
1156  /* *INDENT-ON* */
1157  srcp += srcskip;
1158  dstp += dstskip;
1159  }
1160 }
1161 
1162 /* fast ARGB8888->RGB555 blending with pixel alpha */
1163 static void
1165 {
1166  int width = info->dst_w;
1167  int height = info->dst_h;
1168  Uint32 *srcp = (Uint32 *) info->src;
1169  int srcskip = info->src_skip >> 2;
1170  Uint16 *dstp = (Uint16 *) info->dst;
1171  int dstskip = info->dst_skip >> 1;
1172 
1173  while (height--) {
1174  /* *INDENT-OFF* */
1175  DUFFS_LOOP4({
1176  unsigned alpha;
1177  Uint32 s = *srcp;
1178  alpha = s >> 27; /* downscale alpha to 5 bits */
1179  /* FIXME: Here we special-case opaque alpha since the
1180  compositioning used (>>8 instead of /255) doesn't handle
1181  it correctly. Also special-case alpha=0 for speed?
1182  Benchmark this! */
1183  if(alpha) {
1184  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1185  *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
1186  } else {
1187  Uint32 d = *dstp;
1188  /*
1189  * convert source and destination to G0RAB65565
1190  * and blend all components at the same time
1191  */
1192  s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
1193  + (s >> 3 & 0x1f);
1194  d = (d | d << 16) & 0x03e07c1f;
1195  d += (s - d) * alpha >> 5;
1196  d &= 0x03e07c1f;
1197  *dstp = (Uint16)(d | d >> 16);
1198  }
1199  }
1200  srcp++;
1201  dstp++;
1202  }, width);
1203  /* *INDENT-ON* */
1204  srcp += srcskip;
1205  dstp += dstskip;
1206  }
1207 }
1208 
1209 /* General (slow) N->N blending with per-surface alpha */
1210 static void
1212 {
1213  int width = info->dst_w;
1214  int height = info->dst_h;
1215  Uint8 *src = info->src;
1216  int srcskip = info->src_skip;
1217  Uint8 *dst = info->dst;
1218  int dstskip = info->dst_skip;
1219  SDL_PixelFormat *srcfmt = info->src_fmt;
1220  SDL_PixelFormat *dstfmt = info->dst_fmt;
1221  int srcbpp = srcfmt->BytesPerPixel;
1222  int dstbpp = dstfmt->BytesPerPixel;
1223  Uint32 Pixel;
1224  unsigned sR, sG, sB;
1225  unsigned dR, dG, dB, dA;
1226  const unsigned sA = info->a;
1227 
1228  if (sA) {
1229  while (height--) {
1230  /* *INDENT-OFF* */
1231  DUFFS_LOOP4(
1232  {
1233  DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1234  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1235  ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1236  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1237  src += srcbpp;
1238  dst += dstbpp;
1239  },
1240  width);
1241  /* *INDENT-ON* */
1242  src += srcskip;
1243  dst += dstskip;
1244  }
1245  }
1246 }
1247 
1248 /* General (slow) colorkeyed N->N blending with per-surface alpha */
1249 static void
1251 {
1252  int width = info->dst_w;
1253  int height = info->dst_h;
1254  Uint8 *src = info->src;
1255  int srcskip = info->src_skip;
1256  Uint8 *dst = info->dst;
1257  int dstskip = info->dst_skip;
1258  SDL_PixelFormat *srcfmt = info->src_fmt;
1259  SDL_PixelFormat *dstfmt = info->dst_fmt;
1260  Uint32 ckey = info->colorkey;
1261  int srcbpp = srcfmt->BytesPerPixel;
1262  int dstbpp = dstfmt->BytesPerPixel;
1263  Uint32 Pixel;
1264  unsigned sR, sG, sB;
1265  unsigned dR, dG, dB, dA;
1266  const unsigned sA = info->a;
1267 
1268  while (height--) {
1269  /* *INDENT-OFF* */
1270  DUFFS_LOOP4(
1271  {
1272  RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
1273  if(sA && Pixel != ckey) {
1274  RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1275  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1276  ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1277  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1278  }
1279  src += srcbpp;
1280  dst += dstbpp;
1281  },
1282  width);
1283  /* *INDENT-ON* */
1284  src += srcskip;
1285  dst += dstskip;
1286  }
1287 }
1288 
1289 /* General (slow) N->N blending with pixel alpha */
1290 static void
1292 {
1293  int width = info->dst_w;
1294  int height = info->dst_h;
1295  Uint8 *src = info->src;
1296  int srcskip = info->src_skip;
1297  Uint8 *dst = info->dst;
1298  int dstskip = info->dst_skip;
1299  SDL_PixelFormat *srcfmt = info->src_fmt;
1300  SDL_PixelFormat *dstfmt = info->dst_fmt;
1301  int srcbpp;
1302  int dstbpp;
1303  Uint32 Pixel;
1304  unsigned sR, sG, sB, sA;
1305  unsigned dR, dG, dB, dA;
1306 
1307  /* Set up some basic variables */
1308  srcbpp = srcfmt->BytesPerPixel;
1309  dstbpp = dstfmt->BytesPerPixel;
1310 
1311  while (height--) {
1312  /* *INDENT-OFF* */
1313  DUFFS_LOOP4(
1314  {
1315  DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1316  if(sA) {
1317  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1318  ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1319  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1320  }
1321  src += srcbpp;
1322  dst += dstbpp;
1323  },
1324  width);
1325  /* *INDENT-ON* */
1326  src += srcskip;
1327  dst += dstskip;
1328  }
1329 }
1330 
1331 
1334 {
1335  SDL_PixelFormat *sf = surface->format;
1336  SDL_PixelFormat *df = surface->map->dst->format;
1337 
1338  switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
1339  case SDL_COPY_BLEND:
1340  /* Per-pixel alpha blits */
1341  switch (df->BytesPerPixel) {
1342  case 1:
1343  return BlitNto1PixelAlpha;
1344 
1345  case 2:
1346 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
1347  if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1348  && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
1349  && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1350  || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
1351  {
1352 #if SDL_ARM_NEON_BLITTERS
1353  if (SDL_HasNEON())
1354  return BlitARGBto565PixelAlphaARMNEON;
1355 #endif
1356 #if SDL_ARM_SIMD_BLITTERS
1357  if (SDL_HasARMSIMD())
1358  return BlitARGBto565PixelAlphaARMSIMD;
1359 #endif
1360  }
1361 #endif
1362  if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1363  && sf->Gmask == 0xff00
1364  && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1365  || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1366  if (df->Gmask == 0x7e0)
1367  return BlitARGBto565PixelAlpha;
1368  else if (df->Gmask == 0x3e0)
1369  return BlitARGBto555PixelAlpha;
1370  }
1371  return BlitNtoNPixelAlpha;
1372 
1373  case 4:
1374  if (sf->Rmask == df->Rmask
1375  && sf->Gmask == df->Gmask
1376  && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1377 #if defined(__MMX__) || defined(__3dNOW__)
1378  if (sf->Rshift % 8 == 0
1379  && sf->Gshift % 8 == 0
1380  && sf->Bshift % 8 == 0
1381  && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
1382 #ifdef __3dNOW__
1383  if (SDL_Has3DNow())
1384  return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1385 #endif
1386 #ifdef __MMX__
1387  if (SDL_HasMMX())
1388  return BlitRGBtoRGBPixelAlphaMMX;
1389 #endif
1390  }
1391 #endif /* __MMX__ || __3dNOW__ */
1392  if (sf->Amask == 0xff000000) {
1393 #if SDL_ARM_NEON_BLITTERS
1394  if (SDL_HasNEON())
1395  return BlitRGBtoRGBPixelAlphaARMNEON;
1396 #endif
1397 #if SDL_ARM_SIMD_BLITTERS
1398  if (SDL_HasARMSIMD())
1399  return BlitRGBtoRGBPixelAlphaARMSIMD;
1400 #endif
1401  return BlitRGBtoRGBPixelAlpha;
1402  }
1403  }
1404  return BlitNtoNPixelAlpha;
1405 
1406  case 3:
1407  default:
1408  break;
1409  }
1410  return BlitNtoNPixelAlpha;
1411 
1413  if (sf->Amask == 0) {
1414  /* Per-surface alpha blits */
1415  switch (df->BytesPerPixel) {
1416  case 1:
1417  return BlitNto1SurfaceAlpha;
1418 
1419  case 2:
1420  if (surface->map->identity) {
1421  if (df->Gmask == 0x7e0) {
1422 #ifdef __MMX__
1423  if (SDL_HasMMX())
1424  return Blit565to565SurfaceAlphaMMX;
1425  else
1426 #endif
1427  return Blit565to565SurfaceAlpha;
1428  } else if (df->Gmask == 0x3e0) {
1429 #ifdef __MMX__
1430  if (SDL_HasMMX())
1431  return Blit555to555SurfaceAlphaMMX;
1432  else
1433 #endif
1434  return Blit555to555SurfaceAlpha;
1435  }
1436  }
1437  return BlitNtoNSurfaceAlpha;
1438 
1439  case 4:
1440  if (sf->Rmask == df->Rmask
1441  && sf->Gmask == df->Gmask
1442  && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1443 #ifdef __MMX__
1444  if (sf->Rshift % 8 == 0
1445  && sf->Gshift % 8 == 0
1446  && sf->Bshift % 8 == 0 && SDL_HasMMX())
1447  return BlitRGBtoRGBSurfaceAlphaMMX;
1448 #endif
1449  if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1450  return BlitRGBtoRGBSurfaceAlpha;
1451  }
1452  }
1453  return BlitNtoNSurfaceAlpha;
1454 
1455  case 3:
1456  default:
1457  return BlitNtoNSurfaceAlpha;
1458  }
1459  }
1460  break;
1461 
1463  if (sf->Amask == 0) {
1464  if (df->BytesPerPixel == 1) {
1465  return BlitNto1SurfaceAlphaKey;
1466  } else {
1467  return BlitNtoNSurfaceAlphaKey;
1468  }
1469  }
1470  break;
1471  }
1472 
1473  return NULL;
1474 }
1475 
1476 /* vi: set ts=4 sw=4 expandtab: */
Uint8 * table
Definition: SDL_blit.h:67
SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
Definition: SDL_blit_A.c:1333
static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1118
#define BLEND16_50(d, s, mask)
Definition: SDL_blit_A.c:652
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:30
#define SDL_Has3DNow
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1250
GLdouble s
Definition: SDL_opengl.h:2063
#define SDL_COPY_COLORKEY
Definition: SDL_blit.h:39
int src_skip
Definition: SDL_blit.h:60
#define RETRIEVE_RGB_PIXEL(buf, bpp, Pixel)
Definition: SDL_blit.h:146
Uint8 g
Definition: SDL_pixels.h:298
GLenum GLenum dst
signed int int32_t
unsigned short uint16_t
#define ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA)
Definition: SDL_blit.h:454
Uint8 BytesPerPixel
Definition: SDL_pixels.h:320
#define ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB)
Definition: SDL_blit.h:445
GLfloat GLfloat GLfloat GLfloat h
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:123
SDL_PixelFormat * src_fmt
Definition: SDL_blit.h:65
EGLSurface surface
Definition: eglext.h:248
#define ASSEMBLE_RGBA(buf, bpp, fmt, r, g, b, a)
Definition: SDL_blit.h:402
A collection of pixels used in software blitting.
Definition: SDL_surface.h:69
uint16_t Uint16
Definition: SDL_stdinc.h:191
#define SDL_COPY_RLE_MASK
Definition: SDL_blit.h:44
Uint8 b
Definition: SDL_pixels.h:299
int dst_skip
Definition: SDL_blit.h:64
static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1164
GLenum src
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1291
uint64_t Uint64
Definition: SDL_stdinc.h:216
GLfloat GLfloat GLfloat alpha
GLint GLint GLsizei width
Definition: SDL_opengl.h:1572
Uint32 colorkey
Definition: SDL_blit.h:69
Uint8 * dst
Definition: SDL_blit.h:61
struct SDL_BlitMap * map
Definition: SDL_surface.h:88
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1079
#define DISEMBLE_RGBA(buf, bpp, fmt, Pixel, r, g, b, a)
Definition: SDL_blit.h:353
Uint8 r
Definition: SDL_pixels.h:297
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:458
SDL_PRINTF_FORMAT_STRING const char int SDL_PRINTF_FORMAT_STRING const char int SDL_PRINTF_FORMAT_STRING const char int SDL_PRINTF_FORMAT_STRING const char const char SDL_SCANF_FORMAT_STRING const char return SDL_ThreadFunction const char void return Uint32 return Uint32 SDL_AssertionHandler void SDL_SpinLock SDL_atomic_t int int return SDL_atomic_t return void void void return void return int return SDL_AudioSpec SDL_AudioSpec return int int return return int SDL_RWops int SDL_AudioSpec Uint8 ** d
uint8_t Uint8
Definition: SDL_stdinc.h:179
#define RGB_FROM_PIXEL(Pixel, fmt, r, g, b)
Definition: SDL_blit.h:122
static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1040
#define SDL_HasNEON
#define DUFFS_LOOP4(pixel_copy_increment, width)
Definition: SDL_blit.h:488
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:525
GLenum GLint GLuint mask
#define DUFFS_LOOP(pixel_copy_increment, width)
Definition: SDL_blit.h:500
GLubyte GLubyte GLubyte GLubyte w
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:483
unsigned int uintptr_t
#define SDL_HasMMX
Uint8 * src
Definition: SDL_blit.h:57
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat s1
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
Definition: SDL_blit_A.c:661
SDL_PixelFormat * dst_fmt
Definition: SDL_blit.h:66
SDL_Surface * dst
Definition: SDL_blit.h:88
#define NULL
Definition: begin_code.h:164
SDL_Color * colors
Definition: SDL_pixels.h:307
unsigned int uint32_t
SDL_PixelFormat * format
Definition: SDL_surface.h:72
GLint GLint GLsizei GLsizei height
Definition: SDL_opengl.h:1572
SDL_bool SDL_HasARMSIMD(void)
Definition: SDL_cpuinfo.c:777
#define SDL_COPY_MODULATE_ALPHA
Definition: SDL_blit.h:35
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:77
uint32_t Uint32
Definition: SDL_stdinc.h:203
GLdouble n
void(* SDL_BlitFunc)(SDL_BlitInfo *info)
Definition: SDL_blit.h:73
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1211
#define DISEMBLE_RGB(buf, bpp, fmt, Pixel, r, g, b)
Definition: SDL_blit.h:177
#define DUFFS_LOOP_124(pixel_copy_increment1, pixel_copy_increment2, pixel_copy_increment4, width)
Definition: SDL_blit.h:504
SDL_Palette * palette
Definition: SDL_pixels.h:318
#define SDL_ALPHA_OPAQUE
Definition: SDL_pixels.h:46
int identity
Definition: SDL_blit.h:89
#define BLEND2x16_50(d, s, mask)
Definition: SDL_blit_A.c:656
#define SDL_COPY_BLEND
Definition: SDL_blit.h:36
SDL_BlitInfo info
Definition: SDL_blit.h:92
Uint8 a
Definition: SDL_blit.h:70