Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
72 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
73 
74 #include <stdio.h>
75 #include <volk/volk_common.h>
76 
77 
78 #ifdef LV_HAVE_GENERIC
79 
80 
81 static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
82  const float* input,
83  const float* taps,
84  unsigned int num_points)
85 {
86 
87  float dotProduct = 0;
88  const float* aPtr = input;
89  const float* bPtr = taps;
90  unsigned int number = 0;
91 
92  for (number = 0; number < num_points; number++) {
93  dotProduct += ((*aPtr++) * (*bPtr++));
94  }
95 
96  *result = dotProduct;
97 }
98 
99 #endif /*LV_HAVE_GENERIC*/
100 
101 
102 #ifdef LV_HAVE_SSE
103 
104 
105 static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
106  const float* input,
107  const float* taps,
108  unsigned int num_points)
109 {
110 
111  unsigned int number = 0;
112  const unsigned int sixteenthPoints = num_points / 16;
113 
114  float dotProduct = 0;
115  const float* aPtr = input;
116  const float* bPtr = taps;
117 
118  __m128 a0Val, a1Val, a2Val, a3Val;
119  __m128 b0Val, b1Val, b2Val, b3Val;
120  __m128 c0Val, c1Val, c2Val, c3Val;
121 
122  __m128 dotProdVal0 = _mm_setzero_ps();
123  __m128 dotProdVal1 = _mm_setzero_ps();
124  __m128 dotProdVal2 = _mm_setzero_ps();
125  __m128 dotProdVal3 = _mm_setzero_ps();
126 
127  for (; number < sixteenthPoints; number++) {
128 
129  a0Val = _mm_loadu_ps(aPtr);
130  a1Val = _mm_loadu_ps(aPtr + 4);
131  a2Val = _mm_loadu_ps(aPtr + 8);
132  a3Val = _mm_loadu_ps(aPtr + 12);
133  b0Val = _mm_loadu_ps(bPtr);
134  b1Val = _mm_loadu_ps(bPtr + 4);
135  b2Val = _mm_loadu_ps(bPtr + 8);
136  b3Val = _mm_loadu_ps(bPtr + 12);
137 
138  c0Val = _mm_mul_ps(a0Val, b0Val);
139  c1Val = _mm_mul_ps(a1Val, b1Val);
140  c2Val = _mm_mul_ps(a2Val, b2Val);
141  c3Val = _mm_mul_ps(a3Val, b3Val);
142 
143  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
144  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
145  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
146  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
147 
148  aPtr += 16;
149  bPtr += 16;
150  }
151 
152  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
153  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
154  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
155 
156  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
157 
158  _mm_store_ps(dotProductVector,
159  dotProdVal0); // Store the results back into the dot product vector
160 
161  dotProduct = dotProductVector[0];
162  dotProduct += dotProductVector[1];
163  dotProduct += dotProductVector[2];
164  dotProduct += dotProductVector[3];
165 
166  number = sixteenthPoints * 16;
167  for (; number < num_points; number++) {
168  dotProduct += ((*aPtr++) * (*bPtr++));
169  }
170 
171  *result = dotProduct;
172 }
173 
174 #endif /*LV_HAVE_SSE*/
175 
176 #ifdef LV_HAVE_SSE3
177 
178 #include <pmmintrin.h>
179 
180 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
181  const float* input,
182  const float* taps,
183  unsigned int num_points)
184 {
185  unsigned int number = 0;
186  const unsigned int sixteenthPoints = num_points / 16;
187 
188  float dotProduct = 0;
189  const float* aPtr = input;
190  const float* bPtr = taps;
191 
192  __m128 a0Val, a1Val, a2Val, a3Val;
193  __m128 b0Val, b1Val, b2Val, b3Val;
194  __m128 c0Val, c1Val, c2Val, c3Val;
195 
196  __m128 dotProdVal0 = _mm_setzero_ps();
197  __m128 dotProdVal1 = _mm_setzero_ps();
198  __m128 dotProdVal2 = _mm_setzero_ps();
199  __m128 dotProdVal3 = _mm_setzero_ps();
200 
201  for (; number < sixteenthPoints; number++) {
202 
203  a0Val = _mm_loadu_ps(aPtr);
204  a1Val = _mm_loadu_ps(aPtr + 4);
205  a2Val = _mm_loadu_ps(aPtr + 8);
206  a3Val = _mm_loadu_ps(aPtr + 12);
207  b0Val = _mm_loadu_ps(bPtr);
208  b1Val = _mm_loadu_ps(bPtr + 4);
209  b2Val = _mm_loadu_ps(bPtr + 8);
210  b3Val = _mm_loadu_ps(bPtr + 12);
211 
212  c0Val = _mm_mul_ps(a0Val, b0Val);
213  c1Val = _mm_mul_ps(a1Val, b1Val);
214  c2Val = _mm_mul_ps(a2Val, b2Val);
215  c3Val = _mm_mul_ps(a3Val, b3Val);
216 
217  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
218  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
219  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
220  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
221 
222  aPtr += 16;
223  bPtr += 16;
224  }
225 
226  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
227  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
228  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
229 
230  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
231  _mm_store_ps(dotProductVector,
232  dotProdVal0); // Store the results back into the dot product vector
233 
234  dotProduct = dotProductVector[0];
235  dotProduct += dotProductVector[1];
236  dotProduct += dotProductVector[2];
237  dotProduct += dotProductVector[3];
238 
239  number = sixteenthPoints * 16;
240  for (; number < num_points; number++) {
241  dotProduct += ((*aPtr++) * (*bPtr++));
242  }
243 
244  *result = dotProduct;
245 }
246 
247 #endif /*LV_HAVE_SSE3*/
248 
249 #ifdef LV_HAVE_SSE4_1
250 
251 #include <smmintrin.h>
252 
253 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
254  const float* input,
255  const float* taps,
256  unsigned int num_points)
257 {
258  unsigned int number = 0;
259  const unsigned int sixteenthPoints = num_points / 16;
260 
261  float dotProduct = 0;
262  const float* aPtr = input;
263  const float* bPtr = taps;
264 
265  __m128 aVal1, bVal1, cVal1;
266  __m128 aVal2, bVal2, cVal2;
267  __m128 aVal3, bVal3, cVal3;
268  __m128 aVal4, bVal4, cVal4;
269 
270  __m128 dotProdVal = _mm_setzero_ps();
271 
272  for (; number < sixteenthPoints; number++) {
273 
274  aVal1 = _mm_loadu_ps(aPtr);
275  aPtr += 4;
276  aVal2 = _mm_loadu_ps(aPtr);
277  aPtr += 4;
278  aVal3 = _mm_loadu_ps(aPtr);
279  aPtr += 4;
280  aVal4 = _mm_loadu_ps(aPtr);
281  aPtr += 4;
282 
283  bVal1 = _mm_loadu_ps(bPtr);
284  bPtr += 4;
285  bVal2 = _mm_loadu_ps(bPtr);
286  bPtr += 4;
287  bVal3 = _mm_loadu_ps(bPtr);
288  bPtr += 4;
289  bVal4 = _mm_loadu_ps(bPtr);
290  bPtr += 4;
291 
292  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
293  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
294  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
295  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
296 
297  cVal1 = _mm_or_ps(cVal1, cVal2);
298  cVal3 = _mm_or_ps(cVal3, cVal4);
299  cVal1 = _mm_or_ps(cVal1, cVal3);
300 
301  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
302  }
303 
304  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
305  _mm_store_ps(dotProductVector,
306  dotProdVal); // Store the results back into the dot product vector
307 
308  dotProduct = dotProductVector[0];
309  dotProduct += dotProductVector[1];
310  dotProduct += dotProductVector[2];
311  dotProduct += dotProductVector[3];
312 
313  number = sixteenthPoints * 16;
314  for (; number < num_points; number++) {
315  dotProduct += ((*aPtr++) * (*bPtr++));
316  }
317 
318  *result = dotProduct;
319 }
320 
321 #endif /*LV_HAVE_SSE4_1*/
322 
323 #ifdef LV_HAVE_AVX
324 
325 #include <immintrin.h>
326 
327 static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
328  const float* input,
329  const float* taps,
330  unsigned int num_points)
331 {
332 
333  unsigned int number = 0;
334  const unsigned int sixteenthPoints = num_points / 16;
335 
336  float dotProduct = 0;
337  const float* aPtr = input;
338  const float* bPtr = taps;
339 
340  __m256 a0Val, a1Val;
341  __m256 b0Val, b1Val;
342  __m256 c0Val, c1Val;
343 
344  __m256 dotProdVal0 = _mm256_setzero_ps();
345  __m256 dotProdVal1 = _mm256_setzero_ps();
346 
347  for (; number < sixteenthPoints; number++) {
348 
349  a0Val = _mm256_loadu_ps(aPtr);
350  a1Val = _mm256_loadu_ps(aPtr + 8);
351  b0Val = _mm256_loadu_ps(bPtr);
352  b1Val = _mm256_loadu_ps(bPtr + 8);
353 
354  c0Val = _mm256_mul_ps(a0Val, b0Val);
355  c1Val = _mm256_mul_ps(a1Val, b1Val);
356 
357  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
358  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
359 
360  aPtr += 16;
361  bPtr += 16;
362  }
363 
364  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
365 
366  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
367 
368  _mm256_storeu_ps(dotProductVector,
369  dotProdVal0); // Store the results back into the dot product vector
370 
371  dotProduct = dotProductVector[0];
372  dotProduct += dotProductVector[1];
373  dotProduct += dotProductVector[2];
374  dotProduct += dotProductVector[3];
375  dotProduct += dotProductVector[4];
376  dotProduct += dotProductVector[5];
377  dotProduct += dotProductVector[6];
378  dotProduct += dotProductVector[7];
379 
380  number = sixteenthPoints * 16;
381  for (; number < num_points; number++) {
382  dotProduct += ((*aPtr++) * (*bPtr++));
383  }
384 
385  *result = dotProduct;
386 }
387 
388 #endif /*LV_HAVE_AVX*/
389 
390 #if LV_HAVE_AVX2 && LV_HAVE_FMA
391 #include <immintrin.h>
392 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
393  const float* input,
394  const float* taps,
395  unsigned int num_points)
396 {
397  unsigned int number;
398  const unsigned int eighthPoints = num_points / 8;
399 
400  const float* aPtr = input;
401  const float* bPtr = taps;
402 
403  __m256 dotProdVal = _mm256_setzero_ps();
404  __m256 aVal1, bVal1;
405 
406  for (number = 0; number < eighthPoints; number++) {
407 
408  aVal1 = _mm256_loadu_ps(aPtr);
409  bVal1 = _mm256_loadu_ps(bPtr);
410  aPtr += 8;
411  bPtr += 8;
412 
413  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
414  }
415 
416  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
417  _mm256_storeu_ps(dotProductVector,
418  dotProdVal); // Store the results back into the dot product vector
419 
420  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
421  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
422  dotProductVector[6] + dotProductVector[7];
423 
424  for (number = eighthPoints * 8; number < num_points; number++) {
425  dotProduct += ((*aPtr++) * (*bPtr++));
426  }
427 
428  *result = dotProduct;
429 }
430 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
431 
432 #if LV_HAVE_AVX512F
433 #include <immintrin.h>
434 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
435  const float* input,
436  const float* taps,
437  unsigned int num_points)
438 {
439  unsigned int number;
440  const unsigned int sixteenthPoints = num_points / 16;
441 
442  const float* aPtr = input;
443  const float* bPtr = taps;
444 
445  __m512 dotProdVal = _mm512_setzero_ps();
446  __m512 aVal1, bVal1;
447 
448  for (number = 0; number < sixteenthPoints; number++) {
449 
450  aVal1 = _mm512_loadu_ps(aPtr);
451  bVal1 = _mm512_loadu_ps(bPtr);
452  aPtr += 16;
453  bPtr += 16;
454 
455  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
456  }
457 
458  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
459  _mm512_storeu_ps(dotProductVector,
460  dotProdVal); // Store the results back into the dot product vector
461 
462  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
463  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
464  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
465  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
466  dotProductVector[12] + dotProductVector[13] +
467  dotProductVector[14] + dotProductVector[15];
468 
469  for (number = sixteenthPoints * 16; number < num_points; number++) {
470  dotProduct += ((*aPtr++) * (*bPtr++));
471  }
472 
473  *result = dotProduct;
474 }
475 #endif /* LV_HAVE_AVX512F */
476 
477 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
478 
479 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
480 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
481 
482 #include <stdio.h>
483 #include <volk/volk_common.h>
484 
485 
486 #ifdef LV_HAVE_GENERIC
487 
488 
489 static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
490  const float* input,
491  const float* taps,
492  unsigned int num_points)
493 {
494 
495  float dotProduct = 0;
496  const float* aPtr = input;
497  const float* bPtr = taps;
498  unsigned int number = 0;
499 
500  for (number = 0; number < num_points; number++) {
501  dotProduct += ((*aPtr++) * (*bPtr++));
502  }
503 
504  *result = dotProduct;
505 }
506 
507 #endif /*LV_HAVE_GENERIC*/
508 
509 
510 #ifdef LV_HAVE_SSE
511 
512 
513 static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
514  const float* input,
515  const float* taps,
516  unsigned int num_points)
517 {
518 
519  unsigned int number = 0;
520  const unsigned int sixteenthPoints = num_points / 16;
521 
522  float dotProduct = 0;
523  const float* aPtr = input;
524  const float* bPtr = taps;
525 
526  __m128 a0Val, a1Val, a2Val, a3Val;
527  __m128 b0Val, b1Val, b2Val, b3Val;
528  __m128 c0Val, c1Val, c2Val, c3Val;
529 
530  __m128 dotProdVal0 = _mm_setzero_ps();
531  __m128 dotProdVal1 = _mm_setzero_ps();
532  __m128 dotProdVal2 = _mm_setzero_ps();
533  __m128 dotProdVal3 = _mm_setzero_ps();
534 
535  for (; number < sixteenthPoints; number++) {
536 
537  a0Val = _mm_load_ps(aPtr);
538  a1Val = _mm_load_ps(aPtr + 4);
539  a2Val = _mm_load_ps(aPtr + 8);
540  a3Val = _mm_load_ps(aPtr + 12);
541  b0Val = _mm_load_ps(bPtr);
542  b1Val = _mm_load_ps(bPtr + 4);
543  b2Val = _mm_load_ps(bPtr + 8);
544  b3Val = _mm_load_ps(bPtr + 12);
545 
546  c0Val = _mm_mul_ps(a0Val, b0Val);
547  c1Val = _mm_mul_ps(a1Val, b1Val);
548  c2Val = _mm_mul_ps(a2Val, b2Val);
549  c3Val = _mm_mul_ps(a3Val, b3Val);
550 
551  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
552  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
553  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
554  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
555 
556  aPtr += 16;
557  bPtr += 16;
558  }
559 
560  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
561  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
562  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
563 
564  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
565 
566  _mm_store_ps(dotProductVector,
567  dotProdVal0); // Store the results back into the dot product vector
568 
569  dotProduct = dotProductVector[0];
570  dotProduct += dotProductVector[1];
571  dotProduct += dotProductVector[2];
572  dotProduct += dotProductVector[3];
573 
574  number = sixteenthPoints * 16;
575  for (; number < num_points; number++) {
576  dotProduct += ((*aPtr++) * (*bPtr++));
577  }
578 
579  *result = dotProduct;
580 }
581 
582 #endif /*LV_HAVE_SSE*/
583 
584 #ifdef LV_HAVE_SSE3
585 
586 #include <pmmintrin.h>
587 
588 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
589  const float* input,
590  const float* taps,
591  unsigned int num_points)
592 {
593  unsigned int number = 0;
594  const unsigned int sixteenthPoints = num_points / 16;
595 
596  float dotProduct = 0;
597  const float* aPtr = input;
598  const float* bPtr = taps;
599 
600  __m128 a0Val, a1Val, a2Val, a3Val;
601  __m128 b0Val, b1Val, b2Val, b3Val;
602  __m128 c0Val, c1Val, c2Val, c3Val;
603 
604  __m128 dotProdVal0 = _mm_setzero_ps();
605  __m128 dotProdVal1 = _mm_setzero_ps();
606  __m128 dotProdVal2 = _mm_setzero_ps();
607  __m128 dotProdVal3 = _mm_setzero_ps();
608 
609  for (; number < sixteenthPoints; number++) {
610 
611  a0Val = _mm_load_ps(aPtr);
612  a1Val = _mm_load_ps(aPtr + 4);
613  a2Val = _mm_load_ps(aPtr + 8);
614  a3Val = _mm_load_ps(aPtr + 12);
615  b0Val = _mm_load_ps(bPtr);
616  b1Val = _mm_load_ps(bPtr + 4);
617  b2Val = _mm_load_ps(bPtr + 8);
618  b3Val = _mm_load_ps(bPtr + 12);
619 
620  c0Val = _mm_mul_ps(a0Val, b0Val);
621  c1Val = _mm_mul_ps(a1Val, b1Val);
622  c2Val = _mm_mul_ps(a2Val, b2Val);
623  c3Val = _mm_mul_ps(a3Val, b3Val);
624 
625  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
626  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
627  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
628  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
629 
630  aPtr += 16;
631  bPtr += 16;
632  }
633 
634  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
635  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
636  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
637 
638  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
639  _mm_store_ps(dotProductVector,
640  dotProdVal0); // Store the results back into the dot product vector
641 
642  dotProduct = dotProductVector[0];
643  dotProduct += dotProductVector[1];
644  dotProduct += dotProductVector[2];
645  dotProduct += dotProductVector[3];
646 
647  number = sixteenthPoints * 16;
648  for (; number < num_points; number++) {
649  dotProduct += ((*aPtr++) * (*bPtr++));
650  }
651 
652  *result = dotProduct;
653 }
654 
655 #endif /*LV_HAVE_SSE3*/
656 
657 #ifdef LV_HAVE_SSE4_1
658 
659 #include <smmintrin.h>
660 
661 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
662  const float* input,
663  const float* taps,
664  unsigned int num_points)
665 {
666  unsigned int number = 0;
667  const unsigned int sixteenthPoints = num_points / 16;
668 
669  float dotProduct = 0;
670  const float* aPtr = input;
671  const float* bPtr = taps;
672 
673  __m128 aVal1, bVal1, cVal1;
674  __m128 aVal2, bVal2, cVal2;
675  __m128 aVal3, bVal3, cVal3;
676  __m128 aVal4, bVal4, cVal4;
677 
678  __m128 dotProdVal = _mm_setzero_ps();
679 
680  for (; number < sixteenthPoints; number++) {
681 
682  aVal1 = _mm_load_ps(aPtr);
683  aPtr += 4;
684  aVal2 = _mm_load_ps(aPtr);
685  aPtr += 4;
686  aVal3 = _mm_load_ps(aPtr);
687  aPtr += 4;
688  aVal4 = _mm_load_ps(aPtr);
689  aPtr += 4;
690 
691  bVal1 = _mm_load_ps(bPtr);
692  bPtr += 4;
693  bVal2 = _mm_load_ps(bPtr);
694  bPtr += 4;
695  bVal3 = _mm_load_ps(bPtr);
696  bPtr += 4;
697  bVal4 = _mm_load_ps(bPtr);
698  bPtr += 4;
699 
700  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
701  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
702  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
703  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
704 
705  cVal1 = _mm_or_ps(cVal1, cVal2);
706  cVal3 = _mm_or_ps(cVal3, cVal4);
707  cVal1 = _mm_or_ps(cVal1, cVal3);
708 
709  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
710  }
711 
712  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
713  _mm_store_ps(dotProductVector,
714  dotProdVal); // Store the results back into the dot product vector
715 
716  dotProduct = dotProductVector[0];
717  dotProduct += dotProductVector[1];
718  dotProduct += dotProductVector[2];
719  dotProduct += dotProductVector[3];
720 
721  number = sixteenthPoints * 16;
722  for (; number < num_points; number++) {
723  dotProduct += ((*aPtr++) * (*bPtr++));
724  }
725 
726  *result = dotProduct;
727 }
728 
729 #endif /*LV_HAVE_SSE4_1*/
730 
731 #ifdef LV_HAVE_AVX
732 
733 #include <immintrin.h>
734 
735 static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
736  const float* input,
737  const float* taps,
738  unsigned int num_points)
739 {
740 
741  unsigned int number = 0;
742  const unsigned int sixteenthPoints = num_points / 16;
743 
744  float dotProduct = 0;
745  const float* aPtr = input;
746  const float* bPtr = taps;
747 
748  __m256 a0Val, a1Val;
749  __m256 b0Val, b1Val;
750  __m256 c0Val, c1Val;
751 
752  __m256 dotProdVal0 = _mm256_setzero_ps();
753  __m256 dotProdVal1 = _mm256_setzero_ps();
754 
755  for (; number < sixteenthPoints; number++) {
756 
757  a0Val = _mm256_load_ps(aPtr);
758  a1Val = _mm256_load_ps(aPtr + 8);
759  b0Val = _mm256_load_ps(bPtr);
760  b1Val = _mm256_load_ps(bPtr + 8);
761 
762  c0Val = _mm256_mul_ps(a0Val, b0Val);
763  c1Val = _mm256_mul_ps(a1Val, b1Val);
764 
765  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
766  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
767 
768  aPtr += 16;
769  bPtr += 16;
770  }
771 
772  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
773 
774  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
775 
776  _mm256_store_ps(dotProductVector,
777  dotProdVal0); // Store the results back into the dot product vector
778 
779  dotProduct = dotProductVector[0];
780  dotProduct += dotProductVector[1];
781  dotProduct += dotProductVector[2];
782  dotProduct += dotProductVector[3];
783  dotProduct += dotProductVector[4];
784  dotProduct += dotProductVector[5];
785  dotProduct += dotProductVector[6];
786  dotProduct += dotProductVector[7];
787 
788  number = sixteenthPoints * 16;
789  for (; number < num_points; number++) {
790  dotProduct += ((*aPtr++) * (*bPtr++));
791  }
792 
793  *result = dotProduct;
794 }
795 #endif /*LV_HAVE_AVX*/
796 
797 
798 #if LV_HAVE_AVX2 && LV_HAVE_FMA
799 #include <immintrin.h>
800 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
801  const float* input,
802  const float* taps,
803  unsigned int num_points)
804 {
805  unsigned int number;
806  const unsigned int eighthPoints = num_points / 8;
807 
808  const float* aPtr = input;
809  const float* bPtr = taps;
810 
811  __m256 dotProdVal = _mm256_setzero_ps();
812  __m256 aVal1, bVal1;
813 
814  for (number = 0; number < eighthPoints; number++) {
815 
816  aVal1 = _mm256_load_ps(aPtr);
817  bVal1 = _mm256_load_ps(bPtr);
818  aPtr += 8;
819  bPtr += 8;
820 
821  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
822  }
823 
824  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
825  _mm256_store_ps(dotProductVector,
826  dotProdVal); // Store the results back into the dot product vector
827 
828  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
829  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
830  dotProductVector[6] + dotProductVector[7];
831 
832  for (number = eighthPoints * 8; number < num_points; number++) {
833  dotProduct += ((*aPtr++) * (*bPtr++));
834  }
835 
836  *result = dotProduct;
837 }
838 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
839 
840 #if LV_HAVE_AVX512F
841 #include <immintrin.h>
842 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
843  const float* input,
844  const float* taps,
845  unsigned int num_points)
846 {
847  unsigned int number;
848  const unsigned int sixteenthPoints = num_points / 16;
849 
850  const float* aPtr = input;
851  const float* bPtr = taps;
852 
853  __m512 dotProdVal = _mm512_setzero_ps();
854  __m512 aVal1, bVal1;
855 
856  for (number = 0; number < sixteenthPoints; number++) {
857 
858  aVal1 = _mm512_load_ps(aPtr);
859  bVal1 = _mm512_load_ps(bPtr);
860  aPtr += 16;
861  bPtr += 16;
862 
863  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
864  }
865 
866  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
867  _mm512_store_ps(dotProductVector,
868  dotProdVal); // Store the results back into the dot product vector
869 
870  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
871  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
872  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
873  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
874  dotProductVector[12] + dotProductVector[13] +
875  dotProductVector[14] + dotProductVector[15];
876 
877  for (number = sixteenthPoints * 16; number < num_points; number++) {
878  dotProduct += ((*aPtr++) * (*bPtr++));
879  }
880 
881  *result = dotProduct;
882 }
883 #endif /* LV_HAVE_AVX512F */
884 
885 #ifdef LV_HAVE_NEON
886 #include <arm_neon.h>
887 
888 static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
889  const float* input,
890  const float* taps,
891  unsigned int num_points)
892 {
893 
894  unsigned int quarter_points = num_points / 16;
895  float dotProduct = 0;
896  const float* aPtr = input;
897  const float* bPtr = taps;
898  unsigned int number = 0;
899 
900  float32x4x4_t a_val, b_val, accumulator0;
901  accumulator0.val[0] = vdupq_n_f32(0);
902  accumulator0.val[1] = vdupq_n_f32(0);
903  accumulator0.val[2] = vdupq_n_f32(0);
904  accumulator0.val[3] = vdupq_n_f32(0);
905  // factor of 4 loop unroll with independent accumulators
906  // uses 12 out of 16 neon q registers
907  for (number = 0; number < quarter_points; ++number) {
908  a_val = vld4q_f32(aPtr);
909  b_val = vld4q_f32(bPtr);
910  accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
911  accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
912  accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
913  accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
914  aPtr += 16;
915  bPtr += 16;
916  }
917  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
918  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
919  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
920  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
921  vst1q_f32(accumulator, accumulator0.val[0]);
922  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
923 
924  for (number = quarter_points * 16; number < num_points; number++) {
925  dotProduct += ((*aPtr++) * (*bPtr++));
926  }
927 
928  *result = dotProduct;
929 }
930 
931 #endif
932 
933 
934 #ifdef LV_HAVE_NEON
935 static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
936  const float* input,
937  const float* taps,
938  unsigned int num_points)
939 {
940 
941  unsigned int quarter_points = num_points / 8;
942  float dotProduct = 0;
943  const float* aPtr = input;
944  const float* bPtr = taps;
945  unsigned int number = 0;
946 
947  float32x4x2_t a_val, b_val, accumulator_val;
948  accumulator_val.val[0] = vdupq_n_f32(0);
949  accumulator_val.val[1] = vdupq_n_f32(0);
950  // factor of 2 loop unroll with independent accumulators
951  for (number = 0; number < quarter_points; ++number) {
952  a_val = vld2q_f32(aPtr);
953  b_val = vld2q_f32(bPtr);
954  accumulator_val.val[0] =
955  vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
956  accumulator_val.val[1] =
957  vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
958  aPtr += 8;
959  bPtr += 8;
960  }
961  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
962  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
963  vst1q_f32(accumulator, accumulator_val.val[0]);
964  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
965 
966  for (number = quarter_points * 8; number < num_points; number++) {
967  dotProduct += ((*aPtr++) * (*bPtr++));
968  }
969 
970  *result = dotProduct;
971 }
972 
973 #endif /* LV_HAVE_NEON */
974 
975 #ifdef LV_HAVE_NEONV7
976 extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
977  const float* aVector,
978  const float* bVector,
979  unsigned int num_points);
980 #endif /* LV_HAVE_NEONV7 */
981 
982 #ifdef LV_HAVE_NEONV7
983 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
984  const float* aVector,
985  const float* bVector,
986  unsigned int num_points);
987 #endif /* LV_HAVE_NEONV7 */
988 
989 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:735
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:513
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:105
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:327
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:81
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:180
static void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:489
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:588
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:888
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:935
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56