Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
76 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
77 #define INCLUDED_volk_32fc_index_max_16u_a_H
78 
79 #include <inttypes.h>
80 #include <limits.h>
81 #include <stdio.h>
82 #include <volk/volk_common.h>
83 #include <volk/volk_complex.h>
84 
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * Finds the index of the complex sample with the largest magnitude-squared
 * (aligned loads, AVX2 variant 0).
 *
 * \param target     Output: index of the max-magnitude sample (uint16_t).
 * \param src0       Input buffer of complex floats; must be 32-byte aligned.
 * \param num_points Number of complex samples to scan. Clamped to USHRT_MAX
 *                   because the result index must fit in a uint16_t.
 */
static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The returned index is 16 bits wide; never scan past what it can express. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
144 
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * Finds the index of the complex sample with the largest magnitude-squared
 * (aligned loads, AVX2 variant 1 — uses the variant-1 reduction helper).
 *
 * \param target     Output: index of the max-magnitude sample (uint16_t).
 * \param src0       Input buffer of complex floats; must be 32-byte aligned.
 * \param num_points Number of complex samples to scan. Clamped to USHRT_MAX
 *                   because the result index must fit in a uint16_t.
 */
static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The returned index is 16 bits wide; never scan past what it can express. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
204 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

/*
 * SSE3 implementation: finds the index of the complex sample with the largest
 * magnitude-squared. Tracks four running per-lane maxima (xmm3) and their
 * indices (xmm9) in parallel, then reduces across the four lanes at the end.
 * num_points is clamped to USHRT_MAX so the index fits the uint16_t result.
 */
static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    /* Each complex float sample is 8 bytes; loop bounds below are derived
     * from this byte count via shifts. */
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    /* num_bytes >> 5 == number of full 4-sample (32-byte) chunks. */
    int bound = num_bytes >> 5;
    int i = 0;

    /* xmm8: indices of the 4 samples currently being examined;
     * xmm9: indices of the current per-lane maxima (starts at 0);
     * xmm10: per-iteration index increment (4 samples per chunk);
     * xmm3: running per-lane max of magnitude-squared (starts at 0,
     *       so negative magnitudes are impossible anyway — mag^2 >= 0). */
    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();

    for (; i < bound; ++i) {
        /* Load 4 complex samples (re,im interleaved) as 8 floats. */
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        /* Square all components, then horizontal-add adjacent pairs:
         * result is re^2+im^2 (magnitude squared) for each of the 4 samples. */
        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        /* Update per-lane running maxima. */
        xmm3 = _mm_max_ps(xmm1, xmm3);

        /* Build lane masks: xmm4 = lanes where the old max survived
         * (new value < max); xmm5 = lanes where the new value IS the max.
         * NOTE(review): on an exact tie with the previous max, cmpeq fires,
         * so the index is replaced by the later sample's index here, while
         * the scalar tail below keeps the earlier index — confirm whether
         * tie-order matters to callers. */
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        /* Merge: take the new index (xmm8) where the new value won,
         * keep the old index (xmm9) where it lost. Masks are disjoint,
         * so the add acts as a select. */
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        /* Advance candidate indices by 4 for the next chunk. */
        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    /* Leftover pair of samples (bit 4 of num_bytes => 2 remaining samples). */
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        /* Duplicate the low two candidate indices into all four lanes so the
         * 2-sample update reuses the same 4-lane merge logic. */
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        /* Only 2 samples consumed, so advance indices by 2. */
        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    /* Final odd sample (bit 3 of num_bytes => 1 remaining sample). */
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        /* xmm1 keeps the old maxima; xmm3's low lane gets max(old, new). */
        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        /* Mask roles are swapped vs. the main loop because here xmm1 holds
         * the OLD maxima: cmplt marks the lane where the new sample won,
         * cmpeq marks lanes that kept their old max. */
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        /* Broadcast this sample's index into all lanes. */
        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    /* Spill the 4 per-lane maxima and their indices to memory. */
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    /* Scalar reduction across the 4 lanes; strict '>' keeps the
     * lowest-numbered lane on ties. */
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
322 
#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar implementation: scans the buffer once and writes the index
 * of the complex sample with the largest magnitude-squared into target[0].
 * num_points is clamped to USHRT_MAX because the result is a uint16_t index.
 */
static inline void
volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    /* 8 bytes per complex float sample; the loop bound below shifts back. */
    const uint32_t num_bytes = num_points * 8;

    float best_mag_sq = 0.0;
    uint16_t best_index = 0;

    for (uint32_t n = 0; n < (num_bytes >> 3); ++n) {
        const float mag_sq = lv_creal(src0[n]) * lv_creal(src0[n]) +
                             lv_cimag(src0[n]) * lv_cimag(src0[n]);

        /* Strict '>' keeps the earliest index on exact ties. */
        if (mag_sq > best_mag_sq) {
            best_mag_sq = mag_sq;
            best_index = n;
        }
    }
    target[0] = best_index;
}

#endif /*LV_HAVE_GENERIC*/
350 
351 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
352 
353 #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
354 #define INCLUDED_volk_32fc_index_max_16u_u_H
355 
356 #include <inttypes.h>
357 #include <limits.h>
358 #include <stdio.h>
359 #include <volk/volk_common.h>
360 #include <volk/volk_complex.h>
361 
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * Finds the index of the complex sample with the largest magnitude-squared
 * (unaligned loads, AVX2 variant 0).
 *
 * \param target     Output: index of the max-magnitude sample (uint16_t).
 * \param src0       Input buffer of complex floats; no alignment required.
 * \param num_points Number of complex samples to scan. Clamped to USHRT_MAX
 *                   because the result index must fit in a uint16_t.
 */
static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The returned index is 16 bits wide; never scan past what it can express. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit unaligned loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
421 
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * Finds the index of the complex sample with the largest magnitude-squared
 * (unaligned loads, AVX2 variant 1 — uses the variant-1 reduction helper).
 *
 * \param target     Output: index of the max-magnitude sample (uint16_t).
 * \param src0       Input buffer of complex floats; no alignment required.
 * \param num_points Number of complex samples to scan. Clamped to USHRT_MAX
 *                   because the result index must fit in a uint16_t.
 */
static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    /* The returned index is 16 bits wide; never scan past what it can express. */
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    /* Process 8 complex samples (two 256-bit unaligned loads) per iteration. */
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
481 
482 #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
static void volk_32fc_index_max_16u_a_sse3(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:210
static void volk_32fc_index_max_16u_generic(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:325
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:201
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:139
#define bit128_p(x)
Definition: volk_common.h:142
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25