70 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
71 #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
78 #if LV_HAVE_AVX2 && LV_HAVE_FMA
79 #include <immintrin.h>
87 static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(
lv_32fc_t* cVector,
90 unsigned int num_points)
92 unsigned int number = 0;
93 const unsigned int quarterPoints = num_points / 4;
99 for (; number < quarterPoints; number++) {
102 _mm256_loadu_ps((
float*)a);
104 _mm256_loadu_ps((
float*)b);
106 const __m256 yl = _mm256_moveldup_ps(y);
107 const __m256 yh = _mm256_movehdup_ps(y);
109 const __m256 tmp2x = _mm256_permute_ps(x, 0xB1);
111 const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh);
113 const __m256 z = _mm256_fmaddsub_ps(
116 _mm256_storeu_ps((
float*)c, z);
123 number = quarterPoints * 4;
124 for (; number < num_points; number++) {
125 *c++ = (*a++) * (*b++);
132 #include <immintrin.h>
138 unsigned int num_points)
140 unsigned int number = 0;
141 const unsigned int quarterPoints = num_points / 4;
148 for (; number < quarterPoints; number++) {
154 _mm256_storeu_ps((
float*)c, z);
161 number = quarterPoints * 4;
163 for (; number < num_points; number++) {
164 *c++ = (*a++) * (*b++);
171 #include <pmmintrin.h>
177 unsigned int num_points)
179 unsigned int number = 0;
180 const unsigned int halfPoints = num_points / 2;
187 for (; number < halfPoints; number++) {
188 x = _mm_loadu_ps((
float*)a);
189 y = _mm_loadu_ps((
float*)b);
191 _mm_storeu_ps((
float*)c, z);
198 if ((num_points % 2) != 0) {
#ifdef LV_HAVE_GENERIC
/*!
 * Multiply two complex float vectors element-wise (portable scalar path).
 *
 * \param cVector    output buffer, receives aVector[i] * bVector[i]
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex elements to process
 */
static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    // lv_32fc_t is float complex (see volk_complex.h), so the compiler
    // emits the full complex multiply for operator *.
    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
225 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
226 #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
229 #include <inttypes.h>
233 #if LV_HAVE_AVX2 && LV_HAVE_FMA
234 #include <immintrin.h>
242 static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(
lv_32fc_t* cVector,
245 unsigned int num_points)
247 unsigned int number = 0;
248 const unsigned int quarterPoints = num_points / 4;
254 for (; number < quarterPoints; number++) {
257 _mm256_load_ps((
float*)a);
259 _mm256_load_ps((
float*)b);
261 const __m256 yl = _mm256_moveldup_ps(y);
262 const __m256 yh = _mm256_movehdup_ps(y);
264 const __m256 tmp2x = _mm256_permute_ps(x, 0xB1);
266 const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh);
268 const __m256 z = _mm256_fmaddsub_ps(
271 _mm256_store_ps((
float*)c, z);
278 number = quarterPoints * 4;
279 for (; number < num_points; number++) {
280 *c++ = (*a++) * (*b++);
287 #include <immintrin.h>
293 unsigned int num_points)
295 unsigned int number = 0;
296 const unsigned int quarterPoints = num_points / 4;
303 for (; number < quarterPoints; number++) {
304 x = _mm256_load_ps((
float*)a);
305 y = _mm256_load_ps((
float*)b);
307 _mm256_store_ps((
float*)c, z);
314 number = quarterPoints * 4;
316 for (; number < num_points; number++) {
317 *c++ = (*a++) * (*b++);
323 #include <pmmintrin.h>
329 unsigned int num_points)
331 unsigned int number = 0;
332 const unsigned int halfPoints = num_points / 2;
339 for (; number < halfPoints; number++) {
340 x = _mm_load_ps((
float*)a);
341 y = _mm_load_ps((
float*)b);
343 _mm_store_ps((
float*)c, z);
350 if ((num_points % 2) != 0) {
#ifdef LV_HAVE_GENERIC
/*!
 * Multiply two complex float vectors element-wise (portable scalar path,
 * aligned-dispatch variant).
 *
 * \param cVector    output buffer, receives aVector[i] * bVector[i]
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex elements to process
 */
static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    // lv_32fc_t is float complex (see volk_complex.h), so the compiler
    // emits the full complex multiply for operator *.
    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
377 #include <arm_neon.h>
/*!
 * Multiply two complex float vectors element-wise (NEON path).
 *
 * \param cVector    output buffer, receives aVector[i] * bVector[i]
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex elements to process
 *
 * vld2q_f32 de-interleaves 4 complex values into separate real (val[0])
 * and imaginary (val[1]) lanes; the products are combined as
 * real = ar*br - ai*bi, imag = ar*bi + ai*br, then re-interleaved on store.
 */
static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // val[0]=reals, val[1]=imags
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get real result
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); // a0r*b0r|a1r*b1r|...
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); // a0i*b0i|a1i*b1i|...

        // multiply cross terms to get the imaginary result
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); // a0r*b0i|a1r*b1i|...
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); // a0i*b0r|a1i*b1r|...

        // combine the products
        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)cVector, c_val); // re-interleave and store

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    // Scalar tail for the leftover 0..3 elements.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
/*!
 * Multiply two complex float vectors element-wise (NEON path using
 * fused multiply-accumulate/subtract instead of separate mul + add/sub).
 *
 * \param cVector    output buffer, receives aVector[i] * bVector[i]
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex elements to process
 *
 * Same math as volk_32fc_x2_multiply_32fc_neon but folds the cross-term
 * combination into vmlaq_f32/vmlsq_f32: real = ar*br - ai*bi (mls),
 * imag = ai*br + ar*bi (mla).
 */
static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
                                                            const lv_32fc_t* aVector,
                                                            const lv_32fc_t* bVector,
                                                            unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val;
    float32x4x2_t tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // val[0]=reals, val[1]=imags
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); // ai*br
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); // ar*br

        // use multiply accumulate/subtract to fold in the cross terms
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); // + ar*bi
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); // - ai*bi

        // re-interleave and store
        vst2q_f32((float*)cVector, tmp_imag);

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    // Scalar tail for the leftover 0..3 elements.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
469 #ifdef LV_HAVE_NEONV7
471 extern void volk_32fc_x2_multiply_32fc_a_neonasm(
lv_32fc_t* cVector,
474 unsigned int num_points);
480 extern void volk_32fc_x2_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
483 unsigned int num_points);
485 static inline void volk_32fc_x2_multiply_32fc_u_orc(
lv_32fc_t* cVector,
488 unsigned int num_points)
490 volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
static void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:326
static void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:135
static void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:207
static void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:428
static void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:379
static void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:290
static void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:174
static void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:359
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:32
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65
static __m128 _mm_complexmul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:32