Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_avx_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
/*
 * This file holds composite AVX intrinsics, i.e. helpers built out of
 * other intrinsics. Use them in VOLK kernels to avoid copy-pasta.
 */
15 
16 #ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
17 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
18 #include <immintrin.h>
19 
/*
 * Approximate arctan(x) via a degree-13 odd polynomial
 * on the interval [-1, 1]
 *
 * Maximum relative error ~6.5e-7
 * Evaluated with Horner's method in powers of x^2
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    /* Coefficients for x^13 down to x^1; hex floats keep them exact. */
    const __m256 coeff[7] = {
        _mm256_set1_ps(+0x1.01a37cp-7f), /* x^13 */
        _mm256_set1_ps(-0x1.2f3004p-5f), /* x^11 */
        _mm256_set1_ps(+0x1.5785aap-4f), /* x^9  */
        _mm256_set1_ps(-0x1.1436ap-3f),  /* x^7  */
        _mm256_set1_ps(+0x1.972be6p-3f), /* x^5  */
        _mm256_set1_ps(-0x1.55437p-2f),  /* x^3  */
        _mm256_set1_ps(+0x1.ffffeap-1f)  /* x^1  */
    };

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner: ((c13 * x2 + c11) * x2 + ...) * x2 + c1, then one final * x.
     * Same multiply/add sequence as an unrolled ladder. */
    __m256 poly = coeff[0];
    int i;
    for (i = 1; i < 7; i++) {
        poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), coeff[i]);
    }
    return _mm256_mul_ps(x, poly);
}
56 
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    /*
     * Multiply four interleaved complex floats in x by four in y.
     * For each pair (a, c): (ar*cr - ai*ci) + j(ai*cr + ar*ci)
     */
    const __m256 y_real = _mm256_moveldup_ps(y);  // cr,cr,dr,dr,...
    const __m256 y_imag = _mm256_movehdup_ps(y);  // ci,ci,di,di,...
    const __m256 prod_re = _mm256_mul_ps(x, y_real); // ar*cr,ai*cr,...
    const __m256 x_swap = _mm256_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br,...
    const __m256 prod_im = _mm256_mul_ps(x_swap, y_imag); // ai*ci,ar*ci,...

    // addsub: subtract in even lanes, add in odd lanes ->
    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    return _mm256_addsub_ps(prod_re, prod_im);
}
69 
static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    // Conjugate four interleaved complex floats: XOR -0.0f into every
    // odd (imaginary) lane to flip its sign bit.
    const __m256 imag_sign_flip =
        _mm256_set_ps(-0.f, 0.f, -0.f, 0.f, -0.f, 0.f, -0.f, 0.f);
    return _mm256_xor_ps(x, imag_sign_flip);
}
75 
static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    /*
     * Compute x * conj(y) for four interleaved complex floats:
     * re = ar*cr + ai*ci, im = ai*cr - ar*ci
     */
    const __m256 odd_neg =
        _mm256_set_ps(-0.f, 0.f, -0.f, 0.f, -0.f, 0.f, -0.f, 0.f);
    const __m256 y_real = _mm256_moveldup_ps(y);      // cr,cr,dr,dr,...
    const __m256 y_imag_alt =                         // ci,-ci,di,-di,...
        _mm256_xor_ps(_mm256_movehdup_ps(y), odd_neg);
    const __m256 x_swap = _mm256_permute_ps(x, 0xb1); // ai,ar,bi,br,...

    return _mm256_add_ps(_mm256_mul_ps(x, y_real),
                         _mm256_mul_ps(x_swap, y_imag_alt));
}
88 
static inline __m256 _mm256_normalize_ps(__m256 val)
{
    // Scale each interleaved complex float in val to unit magnitude.
    __m256 mag_sq = _mm256_mul_ps(val, val);  // re^2, im^2 per lane
    mag_sq = _mm256_hadd_ps(mag_sq, mag_sq);  // pairwise sums, duplicated
    // Re-order so every complex pair sees its own |z|^2 in both lanes
    mag_sq = _mm256_shuffle_ps(mag_sq, mag_sq, 0xD8); // _MM_SHUFFLE(3, 1, 2, 0)
    const __m256 mag = _mm256_sqrt_ps(mag_sq);
    return _mm256_div_ps(val, mag);
}
97 
static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    // |z|^2 for eight interleaved complex floats spread across two registers.
    const __m256 sq1 = _mm256_mul_ps(cplxValue1, cplxValue1); // squared lanes
    const __m256 sq2 = _mm256_mul_ps(cplxValue2, cplxValue2);
    // Regroup 128-bit halves so the hadd below pairs each re^2 with its im^2
    const __m256 lo = _mm256_permute2f128_ps(sq1, sq2, 0x20);
    const __m256 hi = _mm256_permute2f128_ps(sq1, sq2, 0x31);
    return _mm256_hadd_ps(lo, hi); // I^2 + Q^2 per complex value
}
107 
108 static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
109 {
110  return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
111 }
112 
113 static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
114  const __m256 symbols1,
115  const __m256 points0,
116  const __m256 points1,
117  const __m256 scalar)
118 {
119  /*
120  * Calculate: |y - x|^2 * SNR_lin
121  * Consider 'symbolsX' and 'pointsX' to be complex float
122  * 'symbolsX' are 'y' and 'pointsX' are 'x'
123  */
124  const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
125  const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
126  const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
127  return _mm256_mul_ps(norms, scalar);
128 }
129 
static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    /*
     * Expand the low 8 bytes of fbits into 8 float sign masks: a flag
     * byte > 0 yields 0x80000000 in the matching 32-bit lane, anything
     * else yields 0. Presumably the flags are 0/1 frozen-bit markers —
     * confirm against the polar-decoder callers.
     */
    const __m128i zero = _mm_set1_epi8(0x00);
    const __m128i msb_only = _mm_set1_epi8(0x80);
    // Move source byte i into the top byte of dword i; -1 lanes become 0
    const __m128i spread_lo = _mm_setr_epi8(
        -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3);
    const __m128i spread_hi = _mm_setr_epi8(
        -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7);

    // byte > 0 -> 0xFF, then keep only the sign bit
    fbits = _mm_cmpgt_epi8(fbits, zero);
    fbits = _mm_and_si128(fbits, msb_only);
    const __m128i sign_lo = _mm_shuffle_epi8(fbits, spread_lo);
    const __m128i sign_hi = _mm_shuffle_epi8(fbits, spread_hi);

    // _mm256_set_m128(sign_hi, sign_lo) would do this in one call, but it
    // is missing from some GCC versions; build the result with two
    // insertf128 steps instead.
    const __m256 lower = _mm256_insertf128_ps(
        _mm256_setzero_ps(), _mm_castsi128_ps(sign_lo), 0x0);
    return _mm256_insertf128_ps(lower, _mm_castsi128_ps(sign_hi), 0x1);
}
181 
static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // Split two registers of interleaved values into the even-index
    // stream (*llr0) and the odd-index stream (*llr1), order preserved.
    const __m256 lo_half = _mm256_permute2f128_ps(src0, src1, 0x20);
    const __m256 hi_half = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(lo_half, hi_half, 0x88); // even lanes
    *llr1 = _mm256_shuffle_ps(lo_half, hi_half, 0xdd); // odd lanes
}
191 
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    /*
     * Min-sum combine of deinterleaved LLR pairs:
     * sign(llr0) * sign(llr1) * min(|llr0|, |llr1|)
     */
    const __m256 sign_bit = _mm256_set1_ps(-0.0f);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // XOR of the two sign bits gives the sign of the product
    const __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_bit),
                                      _mm256_and_ps(llr1, sign_bit));
    // andnot with the sign bit clears it, i.e. |x|
    const __m256 magnitude = _mm256_min_ps(_mm256_andnot_ps(sign_bit, llr0),
                                           _mm256_andnot_ps(sign_bit, llr1));
    return _mm256_or_ps(magnitude, sign);
}
208 
209 static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
210 {
211  // prepare sign mask for correct +-
212  __m256 sign_mask = _mm256_polar_sign_mask(fbits);
213 
214  __m256 llr0, llr1;
215  _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
216 
217  // calculate result
218  llr0 = _mm256_xor_ps(llr0, sign_mask);
219  __m256 dst = _mm256_add_ps(llr0, llr1);
220  return dst;
221 }
222 
static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    // Accumulate rec * (aux*val - acc)^2 onto the running sum sq_acc.
    const __m256 scaled = _mm256_mul_ps(aux, val);
    const __m256 delta = _mm256_sub_ps(scaled, acc);
    const __m256 delta_sq = _mm256_mul_ps(delta, delta);
    return _mm256_add_ps(sq_acc, _mm256_mul_ps(delta_sq, rec));
}
232 
233 #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
val
Definition: volk_arch_defs.py:57
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:98
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:108
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:183
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:76
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:223
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:57
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:192
static __m256 _mm256_conjugate_ps(__m256 x)
Definition: volk_avx_intrinsics.h:70
static __m256 _mm256_normalize_ps(__m256 val)
Definition: volk_avx_intrinsics.h:89
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:113
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition: volk_avx_intrinsics.h:130
static __m256 _m256_arctan_poly_avx(const __m256 x)
Definition: volk_avx_intrinsics.h:27
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx_intrinsics.h:209