Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_x2_convert_8u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2023 Daniel Estevez <daniel@destevez.net>
4  * Copyright 2012, 2014 Free Software Foundation, Inc.
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
60 #ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
61 #define INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
62 
63 #include <inttypes.h>
64 
65 static inline void volk_32f_s32f_x2_convert_8u_single(uint8_t* out, const float in)
66 {
67  const float min_val = 0.0f;
68  const float max_val = UINT8_MAX;
69  if (in > max_val) {
70  *out = (uint8_t)(max_val);
71  } else if (in < min_val) {
72  *out = (uint8_t)(min_val);
73  } else {
74  *out = (uint8_t)(rintf(in));
75  }
76 }
77 
78 
79 #ifdef LV_HAVE_GENERIC
80 
/*
 * Portable reference kernel: for every sample computes
 * inputVector[i] * scale + bias and hands the result to the scalar
 * saturating converter, which writes outputVector[i].
 *
 * outputVector: receives num_points bytes.
 * inputVector:  num_points input floats.
 * scale, bias:  affine transform applied before the conversion.
 * num_points:   number of samples to process.
 */
static inline void volk_32f_s32f_x2_convert_8u_generic(uint8_t* outputVector,
                                                       const float* inputVector,
                                                       const float scale,
                                                       const float bias,
                                                       unsigned int num_points)
{
    unsigned int idx = 0;

    while (idx < num_points) {
        const float scaled = inputVector[idx] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(outputVector + idx, scaled);
        ++idx;
    }
}
94 
95 #endif /* LV_HAVE_GENERIC */
96 
97 
98 #if LV_HAVE_AVX2 && LV_HAVE_FMA
99 #include <immintrin.h>
100 
/*
 * AVX2 + FMA kernel using unaligned loads and stores.
 *
 * For each sample, inputVector[i] * scale + bias is computed with a fused
 * multiply-add, clamped to [0, UINT8_MAX], converted to an integer and stored
 * as one byte in outputVector[i]. 32 samples are handled per iteration; the
 * remaining num_points % 32 samples go through the scalar helper.
 *
 * outputVector: receives num_points bytes.
 * inputVector:  num_points input floats.
 * scale, bias:  affine transform applied before the conversion.
 * num_points:   number of samples to process.
 */
static inline void volk_32f_s32f_x2_convert_8u_u_avx2_fma(uint8_t* outputVector,
                                                          const float* inputVector,
                                                          const float scale,
                                                          const float bias,
                                                          unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    const float* inputVectorPtr = inputVector;
    uint8_t* outputVectorPtr = outputVector;

    const __m256 vmin_val = _mm256_set1_ps(0.0f);
    const __m256 vmax_val = _mm256_set1_ps((float)UINT8_MAX);

    const __m256 vScale = _mm256_set1_ps(scale);
    const __m256 vBias = _mm256_set1_ps(bias);

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
        __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr + 8);
        __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr + 16);
        __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr + 24);
        inputVectorPtr += 32;

        /* x * scale + bias, then clamp: min against 255 first, max against 0. */
        inputVal1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal1, vScale, vBias), vmax_val), vmin_val);
        inputVal2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal2, vScale, vBias), vmax_val), vmin_val);
        inputVal3 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal3, vScale, vBias), vmax_val), vmin_val);
        inputVal4 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal4, vScale, vBias), vmax_val), vmin_val);

        __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
        __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
        __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
        __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);

        /*
         * The 256-bit pack instructions operate on each 128-bit half
         * separately, so their 64-bit groups come out in the order 0,2,1,3.
         * _MM_SHUFFLE(3, 1, 2, 0) (== 0xD8) swaps the two middle groups back.
         * The macro replaces a 0b... literal, which is not standard C11.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));
        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, _MM_SHUFFLE(3, 1, 2, 0));

        intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
        const __m256i intInputVal =
            _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));

        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples that do not fill a 32-wide block. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        const float r = inputVector[number] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
    }
}
161 
162 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
163 
164 
165 #ifdef LV_HAVE_AVX2
166 #include <immintrin.h>
167 
/*
 * AVX2 kernel (separate multiply and add) using unaligned loads and stores.
 *
 * For each sample, inputVector[i] * scale + bias is clamped to
 * [0, UINT8_MAX], converted to an integer and stored as one byte in
 * outputVector[i]. 32 samples are handled per iteration; the remaining
 * num_points % 32 samples go through the scalar helper.
 *
 * outputVector: receives num_points bytes.
 * inputVector:  num_points input floats.
 * scale, bias:  affine transform applied before the conversion.
 * num_points:   number of samples to process.
 */
static inline void volk_32f_s32f_x2_convert_8u_u_avx2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    const float* inputVectorPtr = inputVector;
    uint8_t* outputVectorPtr = outputVector;

    const __m256 vmin_val = _mm256_set1_ps(0.0f);
    const __m256 vmax_val = _mm256_set1_ps((float)UINT8_MAX);

    const __m256 vScale = _mm256_set1_ps(scale);
    const __m256 vBias = _mm256_set1_ps(bias);

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
        __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr + 8);
        __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr + 16);
        __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr + 24);
        inputVectorPtr += 32;

        /* x * scale + bias, then clamp: min against 255 first, max against 0. */
        inputVal1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal1, vScale), vBias),
                          vmax_val),
            vmin_val);
        inputVal2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal2, vScale), vBias),
                          vmax_val),
            vmin_val);
        inputVal3 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal3, vScale), vBias),
                          vmax_val),
            vmin_val);
        inputVal4 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal4, vScale), vBias),
                          vmax_val),
            vmin_val);

        __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
        __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
        __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
        __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);

        /*
         * The 256-bit pack instructions operate on each 128-bit half
         * separately, so their 64-bit groups come out in the order 0,2,1,3.
         * _MM_SHUFFLE(3, 1, 2, 0) (== 0xD8) swaps the two middle groups back.
         * The macro replaces a 0b... literal, which is not standard C11.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));
        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, _MM_SHUFFLE(3, 1, 2, 0));

        intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
        const __m256i intInputVal =
            _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));

        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples that do not fill a 32-wide block. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        const float r = inputVector[number] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
    }
}
236 
237 #endif /* LV_HAVE_AVX2 */
238 
239 
240 #ifdef LV_HAVE_SSE2
241 #include <emmintrin.h>
242 
/*
 * SSE2 kernel using unaligned loads and stores.
 *
 * Works on 16 samples per iteration: each group of four floats is multiplied
 * by scale, offset by bias, clamped to [0, UINT8_MAX] and converted to 32-bit
 * integers; the four integer vectors are then narrowed to 16 bytes with two
 * saturating packs. The remaining num_points % 16 samples go through the
 * scalar helper.
 *
 * outputVector: receives num_points bytes.
 * inputVector:  num_points input floats.
 * scale, bias:  affine transform applied before the conversion.
 * num_points:   number of samples to process.
 */
static inline void volk_32f_s32f_x2_convert_8u_u_sse2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;

    const __m128 lo = _mm_set_ps1(0.0f);
    const __m128 hi = _mm_set_ps1((float)UINT8_MAX);
    const __m128 gain = _mm_set_ps1(scale);
    const __m128 offset = _mm_set_ps1(bias);

    const float* src = inputVector;
    uint8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < blocks; blk++, src += 16, dst += 16) {
        /* Affine transform of four consecutive 4-float groups. */
        __m128 f0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src), gain), offset);
        __m128 f1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 4), gain), offset);
        __m128 f2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 8), gain), offset);
        __m128 f3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 12), gain), offset);

        /* Clamp: min against the upper bound first, then max against zero. */
        f0 = _mm_max_ps(_mm_min_ps(f0, hi), lo);
        f1 = _mm_max_ps(_mm_min_ps(f1, hi), lo);
        f2 = _mm_max_ps(_mm_min_ps(f2, hi), lo);
        f3 = _mm_max_ps(_mm_min_ps(f3, hi), lo);

        /* float -> int32 -> int16 -> uint8 */
        const __m128i w01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        const __m128i w23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));

        _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(w01, w23));
    }

    /* Scalar tail. */
    for (unsigned int i = blocks * 16; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
304 
305 #endif /* LV_HAVE_SSE2 */
306 
307 
308 #ifdef LV_HAVE_SSE
309 #include <xmmintrin.h>
310 
311 static inline void volk_32f_s32f_x2_convert_8u_u_sse(uint8_t* outputVector,
312  const float* inputVector,
313  const float scale,
314  const float bias,
315  unsigned int num_points)
316 {
317  const unsigned int quarterPoints = num_points / 4;
318 
319  const float* inputVectorPtr = (const float*)inputVector;
320  uint8_t* outputVectorPtr = outputVector;
321 
322  const float min_val = 0.0f;
323  const float max_val = UINT8_MAX;
324  const __m128 vmin_val = _mm_set_ps1(min_val);
325  const __m128 vmax_val = _mm_set_ps1(max_val);
326 
327  const __m128 vScale = _mm_set_ps1(scale);
328  const __m128 vBias = _mm_set_ps1(bias);
329 
330  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
331 
332  for (unsigned int number = 0; number < quarterPoints; number++) {
333  __m128 ret = _mm_loadu_ps(inputVectorPtr);
334  inputVectorPtr += 4;
335 
336  ret = _mm_max_ps(_mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScale), vBias), vmax_val),
337  vmin_val);
338 
339  _mm_store_ps(outputFloatBuffer, ret);
340  for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
341  *outputVectorPtr++ = (uint8_t)(rintf(outputFloatBuffer[inner_loop]));
342  }
343  }
344 
345  for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
346  const float r = inputVector[number] * scale + bias;
347  volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
348  }
349 }
350 
351 #endif /* LV_HAVE_SSE */
352 
353 
354 #endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_u_H */
355 #ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
356 #define INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
357 
358 #include <inttypes.h>
359 #include <volk/volk_common.h>
360 
361 #if LV_HAVE_AVX2 && LV_HAVE_FMA
362 #include <immintrin.h>
363 
/*
 * AVX2 + FMA kernel using aligned loads and stores.
 *
 * Same computation as the unaligned AVX2/FMA kernel, but the vector loop uses
 * _mm256_load_ps / _mm256_store_si256, so inputVector and outputVector must
 * be 32-byte aligned. For each sample, inputVector[i] * scale + bias is
 * computed with a fused multiply-add, clamped to [0, UINT8_MAX], converted to
 * an integer and stored as one byte. 32 samples are handled per iteration;
 * the remaining num_points % 32 samples go through the scalar helper.
 *
 * outputVector: receives num_points bytes.
 * inputVector:  num_points input floats.
 * scale, bias:  affine transform applied before the conversion.
 * num_points:   number of samples to process.
 */
static inline void volk_32f_s32f_x2_convert_8u_a_avx2_fma(uint8_t* outputVector,
                                                          const float* inputVector,
                                                          const float scale,
                                                          const float bias,
                                                          unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    const float* inputVectorPtr = inputVector;
    uint8_t* outputVectorPtr = outputVector;

    const __m256 vmin_val = _mm256_set1_ps(0.0f);
    const __m256 vmax_val = _mm256_set1_ps((float)UINT8_MAX);

    const __m256 vScale = _mm256_set1_ps(scale);
    const __m256 vBias = _mm256_set1_ps(bias);

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
        __m256 inputVal2 = _mm256_load_ps(inputVectorPtr + 8);
        __m256 inputVal3 = _mm256_load_ps(inputVectorPtr + 16);
        __m256 inputVal4 = _mm256_load_ps(inputVectorPtr + 24);
        inputVectorPtr += 32;

        /* x * scale + bias, then clamp: min against 255 first, max against 0. */
        inputVal1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal1, vScale, vBias), vmax_val), vmin_val);
        inputVal2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal2, vScale, vBias), vmax_val), vmin_val);
        inputVal3 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal3, vScale, vBias), vmax_val), vmin_val);
        inputVal4 = _mm256_max_ps(
            _mm256_min_ps(_mm256_fmadd_ps(inputVal4, vScale, vBias), vmax_val), vmin_val);

        __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
        __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
        __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
        __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);

        /*
         * The 256-bit pack instructions operate on each 128-bit half
         * separately, so their 64-bit groups come out in the order 0,2,1,3.
         * _MM_SHUFFLE(3, 1, 2, 0) (== 0xD8) swaps the two middle groups back.
         * The macro replaces a 0b... literal, which is not standard C11.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));
        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, _MM_SHUFFLE(3, 1, 2, 0));

        intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
        const __m256i intInputVal =
            _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));

        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples that do not fill a 32-wide block. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        const float r = inputVector[number] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
    }
}
424 
425 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
426 
427 
428 #ifdef LV_HAVE_AVX2
429 #include <immintrin.h>
430 
/*
 * AVX2 kernel (separate multiply and add) using aligned loads and stores.
 *
 * The vector loop uses _mm256_load_ps / _mm256_store_si256, so inputVector
 * and outputVector must be 32-byte aligned. For each sample,
 * inputVector[i] * scale + bias is clamped to [0, UINT8_MAX], converted to an
 * integer and stored as one byte. 32 samples are handled per iteration; the
 * remaining num_points % 32 samples go through the scalar helper.
 *
 * outputVector: receives num_points bytes.
 * inputVector:  num_points input floats.
 * scale, bias:  affine transform applied before the conversion.
 * num_points:   number of samples to process.
 */
static inline void volk_32f_s32f_x2_convert_8u_a_avx2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    const float* inputVectorPtr = inputVector;
    uint8_t* outputVectorPtr = outputVector;

    const __m256 vmin_val = _mm256_set1_ps(0.0f);
    const __m256 vmax_val = _mm256_set1_ps((float)UINT8_MAX);

    const __m256 vScale = _mm256_set1_ps(scale);
    const __m256 vBias = _mm256_set1_ps(bias);

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
        __m256 inputVal2 = _mm256_load_ps(inputVectorPtr + 8);
        __m256 inputVal3 = _mm256_load_ps(inputVectorPtr + 16);
        __m256 inputVal4 = _mm256_load_ps(inputVectorPtr + 24);
        inputVectorPtr += 32;

        /* x * scale + bias, then clamp: min against 255 first, max against 0. */
        inputVal1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal1, vScale), vBias),
                          vmax_val),
            vmin_val);
        inputVal2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal2, vScale), vBias),
                          vmax_val),
            vmin_val);
        inputVal3 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal3, vScale), vBias),
                          vmax_val),
            vmin_val);
        inputVal4 = _mm256_max_ps(
            _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal4, vScale), vBias),
                          vmax_val),
            vmin_val);

        __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
        __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
        __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
        __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);

        /*
         * The 256-bit pack instructions operate on each 128-bit half
         * separately, so their 64-bit groups come out in the order 0,2,1,3.
         * _MM_SHUFFLE(3, 1, 2, 0) (== 0xD8) swaps the two middle groups back.
         * The macro replaces a 0b... literal, which is not standard C11.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));
        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, _MM_SHUFFLE(3, 1, 2, 0));

        intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
        const __m256i intInputVal =
            _mm256_permute4x64_epi64(intInputVal1, _MM_SHUFFLE(3, 1, 2, 0));

        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples that do not fill a 32-wide block. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        const float r = inputVector[number] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
    }
}
499 
500 #endif /* LV_HAVE_AVX2 */
501 
502 
503 #ifdef LV_HAVE_SSE2
504 #include <emmintrin.h>
505 
/*
 * SSE2 kernel using aligned loads and stores.
 *
 * The vector loop uses _mm_load_ps / _mm_store_si128, so inputVector and
 * outputVector must be 16-byte aligned. Works on 16 samples per iteration:
 * each group of four floats is multiplied by scale, offset by bias, clamped
 * to [0, UINT8_MAX] and converted to 32-bit integers; the four integer
 * vectors are then narrowed to 16 bytes with two saturating packs. The
 * remaining num_points % 16 samples go through the scalar helper.
 *
 * outputVector: receives num_points bytes.
 * inputVector:  num_points input floats.
 * scale, bias:  affine transform applied before the conversion.
 * num_points:   number of samples to process.
 */
static inline void volk_32f_s32f_x2_convert_8u_a_sse2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;

    const __m128 lo = _mm_set_ps1(0.0f);
    const __m128 hi = _mm_set_ps1((float)UINT8_MAX);
    const __m128 gain = _mm_set_ps1(scale);
    const __m128 offset = _mm_set_ps1(bias);

    const float* src = inputVector;
    uint8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < blocks; blk++, src += 16, dst += 16) {
        /* Affine transform of four consecutive 4-float groups. */
        __m128 f0 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src), gain), offset);
        __m128 f1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src + 4), gain), offset);
        __m128 f2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src + 8), gain), offset);
        __m128 f3 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src + 12), gain), offset);

        /* Clamp: min against the upper bound first, then max against zero. */
        f0 = _mm_max_ps(_mm_min_ps(f0, hi), lo);
        f1 = _mm_max_ps(_mm_min_ps(f1, hi), lo);
        f2 = _mm_max_ps(_mm_min_ps(f2, hi), lo);
        f3 = _mm_max_ps(_mm_min_ps(f3, hi), lo);

        /* float -> int32 -> int16 -> uint8 */
        const __m128i w01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        const __m128i w23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));

        _mm_store_si128((__m128i*)dst, _mm_packus_epi16(w01, w23));
    }

    /* Scalar tail. */
    for (unsigned int i = blocks * 16; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
567 #endif /* LV_HAVE_SSE2 */
568 
569 
570 #ifdef LV_HAVE_SSE
571 #include <xmmintrin.h>
572 
573 static inline void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t* outputVector,
574  const float* inputVector,
575  const float scale,
576  const float bias,
577  unsigned int num_points)
578 {
579  const unsigned int quarterPoints = num_points / 4;
580 
581  const float* inputVectorPtr = (const float*)inputVector;
582  uint8_t* outputVectorPtr = outputVector;
583 
584  const float min_val = 0.0f;
585  const float max_val = UINT8_MAX;
586  const __m128 vmin_val = _mm_set_ps1(min_val);
587  const __m128 vmax_val = _mm_set_ps1(max_val);
588 
589  const __m128 vScalar = _mm_set_ps1(scale);
590  const __m128 vBias = _mm_set_ps1(bias);
591 
592  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
593 
594  for (unsigned int number = 0; number < quarterPoints; number++) {
595  __m128 ret = _mm_load_ps(inputVectorPtr);
596  inputVectorPtr += 4;
597 
598  ret = _mm_max_ps(
599  _mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScalar), vBias), vmax_val), vmin_val);
600 
601  _mm_store_ps(outputFloatBuffer, ret);
602  for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
603  *outputVectorPtr++ = (uint8_t)(rintf(outputFloatBuffer[inner_loop]));
604  }
605  }
606 
607  for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
608  const float r = inputVector[number] * scale + bias;
609  volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
610  }
611 }
612 
613 #endif /* LV_HAVE_SSE */
614 
615 
616 #endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_a_H */
static float rintf(float x)
Definition: config.h:45
static void volk_32f_s32f_x2_convert_8u_generic(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition: volk_32f_s32f_x2_convert_8u.h:81
static void volk_32f_s32f_x2_convert_8u_a_sse2(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition: volk_32f_s32f_x2_convert_8u.h:506
static void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition: volk_32f_s32f_x2_convert_8u.h:573
static void volk_32f_s32f_x2_convert_8u_u_sse(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition: volk_32f_s32f_x2_convert_8u.h:311
static void volk_32f_s32f_x2_convert_8u_single(uint8_t *out, const float in)
Definition: volk_32f_s32f_x2_convert_8u.h:65
static void volk_32f_s32f_x2_convert_8u_u_sse2(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition: volk_32f_s32f_x2_convert_8u.h:243
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62