/**
 * @file datatype_conversion.h
 * @brief SIMD and scalar routines for converting between sample data types
 * (Agora project)
 */
#ifndef DATATYPE_CONVERSION_H_
#define DATATYPE_CONVERSION_H_

#include <emmintrin.h>
#include <immintrin.h>

#include <bitset>
#include <climits>
#include <complex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <stdexcept>

#include "utils.h"
//#define DATATYPE_MEMORY_CHECK
// Must be a power of two for the magic-number conversions below
static constexpr float kShrtFltConvFactor = 32768.0f;
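
// Worked example: short 16384 maps to 16384 / 32768.0f = 0.5f, and -32768
// maps to -1.0f, so full-scale shorts land in roughly [-1.0f, +1.0f).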

#if defined(__AVX512F__)
constexpr size_t kAvx512Bits = 512;
constexpr size_t kAvx512Bytes = kAvx512Bits / 8;
constexpr size_t kAvx512FloatsPerInstr = kAvx512Bytes / sizeof(float);
// 2 vectors per loop because two float vectors pack down into one 16-bit
// vector
constexpr size_t kAvx512FloatsPerLoop = kAvx512FloatsPerInstr * 2;
constexpr size_t kAvx512ShortsPerInstr = kAvx512Bytes / sizeof(short);
// Half per loop because each 256-bit load is expanded into a 512-bit vector
constexpr size_t kAvx512ShortsPerLoop = kAvx512ShortsPerInstr / 2;
#endif

constexpr size_t kAvx2Bits = 256;
constexpr size_t kAvx2Bytes = kAvx2Bits / 8;
constexpr size_t kAvx2FloatsPerInstr = kAvx2Bytes / sizeof(float);
// 2 vectors per loop because two float vectors pack down into one 16-bit
// vector
constexpr size_t kAvx2FloatsPerLoop = kAvx2FloatsPerInstr * 2;
constexpr size_t kAvx2ShortsPerInstr = kAvx2Bytes / sizeof(short);
// Half per loop because each 128-bit load is expanded into a 256-bit vector
constexpr size_t kAvx2ShortsPerLoop = kAvx2ShortsPerInstr / 2;

// Convert a short array [in_buf] to a float array [out_buf] with [n_elems]
// elements. Produces outputs in [-1.0, +0.99997].
static inline void ConvertShortToFloat(const short* in_buf, float* out_buf,
                                       size_t n_elems) {
  for (size_t i = 0; i < n_elems; i++) {
    out_buf[i] = static_cast<float>(in_buf[i]) / kShrtFltConvFactor;
  }
}
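
// Usage sketch (values hypothetical):
//   short iq_in[4] = {0, 16384, -32768, 32767};
//   float iq_out[4];
//   ConvertShortToFloat(iq_in, iq_out, 4);
//   // iq_out = {0.0f, 0.5f, -1.0f, ~0.99997f}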

static inline void SimdConvertShortToFloatAVX512(const short* in_buf,
                                                 float* out_buf,
                                                 size_t n_elems) {
#if defined(__AVX512F__)
#if defined(DATATYPE_MEMORY_CHECK)
  RtAssert(((n_elems % kAvx512ShortsPerLoop) == 0) &&
               ((reinterpret_cast<intptr_t>(in_buf) % kAvx512Bytes) == 0) &&
               ((reinterpret_cast<intptr_t>(out_buf) % kAvx512Bytes) == 0),
           "Data Alignment not correct before calling into AVX optimizations");
#endif

  const bool unaligned =
      ((reinterpret_cast<intptr_t>(in_buf) % kAvx512Bytes) > 0);
  const __m512 magic =
      _mm512_set1_ps(float((1 << 23) + (1 << 15)) / kShrtFltConvFactor);
  const __m512i magic_i = _mm512_castps_si512(magic);
  for (size_t i = 0; i < n_elems; i += kAvx512ShortsPerLoop) {
    // Load shorts with a half-width (256-bit) load so there is room to
    // expand them to 32 bits
    // port 2,3
    const __m256i val =
        unaligned
            ? _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in_buf + i))
            : _mm256_load_si256(reinterpret_cast<const __m256i*>(in_buf + i));
    // Expand and interleave with 0x0000
    const __m512i val_unpacked = _mm512_cvtepu16_epi32(val);  // port 5
    /* convert by xor-ing and subtracting magic value:
     * VPXOR avoids port5 bottlenecks on Intel CPUs before SKL */
    const __m512i val_f_int =
        _mm512_xor_si512(val_unpacked, magic_i);           // port 0,1,5
    const __m512 val_f = _mm512_castsi512_ps(val_f_int);   // no instruction
    const __m512 converted = _mm512_sub_ps(val_f, magic);  // port 1,5 ?
    _mm512_store_ps(out_buf + i, converted);               // port 2,3,4,7
  }
#else
  unused(in_buf);
  unused(out_buf);
  unused(n_elems);
  throw std::runtime_error("AVX512 is not supported");
#endif
}
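
// The magic constant above works out to 257.0f (bit pattern 0x43808000).
// A scalar sketch of the same trick, assuming IEEE-754 floats:
//   uint32_t x = static_cast<uint16_t>(s);  // zero-extend the short's bits
//   uint32_t f_bits = x ^ 0x43808000u;      // splice x into the mantissa,
//                                           // flipping x's bit 15
//   float f;
//   std::memcpy(&f, &f_bits, sizeof(f));    // f == 256 + (x ^ 0x8000) / 2^15
//   float result = f - 257.0f;              // == s / 32768.0f exactly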

static inline void SimdConvertShortToFloatAVX2(const short* in_buf,
                                               float* out_buf,
                                               size_t n_elems) {
#if defined(DATATYPE_MEMORY_CHECK)
  RtAssert(((n_elems % kAvx2ShortsPerLoop) == 0) &&
               ((reinterpret_cast<intptr_t>(in_buf) % kAvx2Bytes) == 0) &&
               ((reinterpret_cast<intptr_t>(out_buf) % kAvx2Bytes) == 0),
           "Data Alignment not correct before calling into AVX optimizations");
#endif
  const bool unaligned =
      ((reinterpret_cast<intptr_t>(in_buf) % kAvx2Bytes) > 0);
  // The divisor must be a power of two for the magic-number trick to be exact
  const __m256 magic =
      _mm256_set1_ps(float((1 << 23) + (1 << 15)) / kShrtFltConvFactor);
  const __m256i magic_i = _mm256_castps_si256(magic);
  for (size_t i = 0; i < n_elems; i += kAvx2ShortsPerLoop) {
    // port 2,3
    const __m128i val =
        unaligned
            ? _mm_loadu_si128(reinterpret_cast<const __m128i*>(in_buf + i))
            : _mm_load_si128(reinterpret_cast<const __m128i*>(in_buf + i));

    // expand to 32 bits and interleave with 0x0000
    const __m256i val_unpacked = _mm256_cvtepu16_epi32(val);  // port 5
    /* convert by xor-ing and subtracting magic value:
     * VPXOR avoids port5 bottlenecks on Intel CPUs before SKL */
    const __m256i val_f_int =
        _mm256_xor_si256(val_unpacked, magic_i);           // port 0,1,5
    const __m256 val_f = _mm256_castsi256_ps(val_f_int);   // no instruction
    const __m256 converted = _mm256_sub_ps(val_f, magic);  // port 1,5 ?
    _mm256_store_ps(out_buf + i, converted);               // port 2,3,4,7
  }
}

// Convert a short array [in_buf] to a float array [out_buf]. Each array must
// have [n_elems] elements.
// in_buf and out_buf must be 64-byte aligned
// n_elems must be a multiple of 16
static inline void SimdConvertShortToFloat(const short* in_buf, float* out_buf,
                                           size_t n_elems) {
#if defined(__AVX512F__)
  return SimdConvertShortToFloatAVX512(in_buf, out_buf, n_elems);
#else
  return SimdConvertShortToFloatAVX2(in_buf, out_buf, n_elems);
#endif
}
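
// Usage sketch (sizes hypothetical; both buffers 64-byte aligned, n_elems a
// multiple of 16):
//   alignas(64) short rx_iq[1024];
//   alignas(64) float rx_f[1024];
//   SimdConvertShortToFloat(rx_iq, rx_f, 1024);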

// Convert a float array [in_buf] to a short array [out_buf]. Input array must
// have [n_elems] elements. Output array must have [n_elems + n_prefix]
// elements.
// in_buf and out_buf must be 64-byte aligned
// n_elems must be a multiple of 16 for AVX512
// scale_down_factor is used for scaling down values in the input array
static inline void SimdConvertFloatToShortAVX512(const float* in_buf,
                                                 short* out_buf, size_t n_elems,
                                                 size_t n_prefix,
                                                 float scale_down_factor) {
#if defined(__AVX512F__)
#if defined(DATATYPE_MEMORY_CHECK)
  constexpr size_t kAvx512ShortPerInstr = kAvx512Bytes / sizeof(short);
  RtAssert(((n_elems % kAvx512FloatsPerInstr) == 0) &&
               ((n_prefix % kAvx512ShortPerInstr) == 0) &&
               ((reinterpret_cast<intptr_t>(in_buf) % kAvx512Bytes) == 0) &&
               ((reinterpret_cast<intptr_t>(out_buf) % kAvx512Bytes) == 0),
           "Data Alignment not correct before calling into AVX optimizations");
#endif
  const float scale_factor_float = kShrtFltConvFactor / scale_down_factor;
  const __m512 scale_factor = _mm512_set1_ps(scale_factor_float);
  // _mm512_packs_epi32 packs within 128-bit lanes; this index restores the
  // original element order afterwards
  const __m512i permute_index = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
  for (size_t i = 0; i < n_elems; i += kAvx512FloatsPerLoop) {
    const __m512 in1 = _mm512_load_ps(&in_buf[i]);
    const __m512 in2 = _mm512_load_ps(&in_buf[i + kAvx512FloatsPerInstr]);
    const __m512 scaled_in1 = _mm512_mul_ps(in1, scale_factor);
    const __m512 scaled_in2 = _mm512_mul_ps(in2, scale_factor);
    const __m512i int32_1 = _mm512_cvtps_epi32(scaled_in1);
    const __m512i int32_2 = _mm512_cvtps_epi32(scaled_in2);
    // Convert dword to word with saturation
    const __m512i short_int16 = _mm512_packs_epi32(int32_1, int32_2);
    const __m512i shuffled =
        _mm512_permutexvar_epi64(permute_index, short_int16);
    _mm512_stream_si512(reinterpret_cast<__m512i*>(&out_buf[i + n_prefix]),
                        shuffled);
    // Prepend / Set cyclic prefix
    const size_t repeat_idx = n_elems - n_prefix;
    if (i >= repeat_idx) {
      _mm512_stream_si512(reinterpret_cast<__m512i*>(&out_buf[i - repeat_idx]),
                          shuffled);
    }
  }
#else
  unused(in_buf);
  unused(out_buf);
  unused(n_elems);
  unused(n_prefix);
  unused(scale_down_factor);
  throw std::runtime_error("AVX512 is not supported");
#endif
}
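
// Usage sketch (AVX512 builds only; sizes hypothetical):
//   alignas(64) float symbol_f[512];
//   alignas(64) short symbol_s[512 + 64];   // room for a 64-sample prefix
//   SimdConvertFloatToShortAVX512(symbol_f, symbol_s, 512, 64, 1.0f);
//   // symbol_s[0..63] repeats symbol_s[512..575] (the cyclic prefix)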

// Convert a float array [in_buf] to a short array [out_buf]. Input array must
// have [n_elems] elements. Output array must have [n_elems + n_prefix]
// elements.
// in_buf and out_buf must be 64-byte aligned
// n_elems must be a multiple of 8 for AVX2
// scale_down_factor is used for scaling down values in the input array
static inline void SimdConvertFloatToShortAVX2(const float* in_buf,
                                               short* out_buf, size_t n_elems,
                                               size_t n_prefix,
                                               float scale_down_factor) {
#if defined(DATATYPE_MEMORY_CHECK)
  constexpr size_t kAvx2ShortPerInstr = kAvx2Bytes / sizeof(short);
  RtAssert(((n_elems % kAvx2FloatsPerLoop) == 0) &&
               ((n_prefix % kAvx2ShortPerInstr) == 0) &&
               ((reinterpret_cast<intptr_t>(in_buf) % kAvx2Bytes) == 0) &&
               ((reinterpret_cast<intptr_t>(out_buf) % kAvx2Bytes) == 0),
           "Data Alignment not correct before calling into AVX optimizations");
#endif

  const float scale_factor_float = kShrtFltConvFactor / scale_down_factor;

  const __m256 scale_factor = _mm256_set1_ps(scale_factor_float);
  // Operates on two 8-float vectors (16 floats) per iteration
  for (size_t i = 0; i < n_elems; i += kAvx2FloatsPerLoop) {
    const __m256 in1 = _mm256_load_ps(&in_buf[i]);
    // Load the second vector; the loop advances by two vectors at a time
    const __m256 in2 = _mm256_load_ps(&in_buf[i + kAvx2FloatsPerInstr]);
    const __m256 scaled_in1 = _mm256_mul_ps(in1, scale_factor);
    const __m256 scaled_in2 = _mm256_mul_ps(in2, scale_factor);
    // Packed float to 32-bit ints (_mm256_cvttps_epi32 truncates,
    // _mm256_cvtps_epi32 rounds)
    const __m256i integer1 = _mm256_cvtps_epi32(scaled_in1);
    const __m256i integer2 = _mm256_cvtps_epi32(scaled_in2);
    // Convert dword to word and saturate
    const __m256i short_ints = _mm256_packs_epi32(integer1, integer2);
    // Packing interleaves the 128-bit lanes; the permute (0xD8 selects
    // qwords 0,2,1,3) restores the original order
    const __m256i slice = _mm256_permute4x64_epi64(short_ints, 0xD8);
    // _mm256_store_si256 or _mm256_stream_si256 (cached vs non-temporal),
    // offset by n_prefix
    _mm256_stream_si256(reinterpret_cast<__m256i*>(&out_buf[i + n_prefix]),
                        slice);
    // Prepend / Set cyclic prefix
    const size_t repeat_idx = n_elems - n_prefix;
    if (i >= repeat_idx) {
      _mm256_stream_si256(reinterpret_cast<__m256i*>(&out_buf[i - repeat_idx]),
                          slice);
    }
  }
}
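
// Note: the streaming (non-temporal) stores above are weakly ordered. If
// another thread will read out_buf, publish it with a store fence first; a
// minimal sketch (ready_flag is hypothetical):
//   SimdConvertFloatToShortAVX2(in, out, n, 0, 1.0f);
//   _mm_sfence();  // order the streaming stores before the release store
//   ready_flag.store(true, std::memory_order_release);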

// Convert a float array [in_buf] to a short array [out_buf]. Input array must
// have [n_elems] elements. Output array must have [n_elems + n_prefix]
// elements.
// in_buf and out_buf must be 64-byte aligned
// n_prefix prepends the output with the last n_prefix values
// scale_down_factor is used for scaling down values in the input array
static inline void ConvertFloatToShort(const float* in_buf, short* out_buf,
                                       size_t n_elems, size_t n_prefix = 0,
                                       float scale_down_factor = 1.0f) {
  for (size_t i = 0; i < n_elems; i++) {
    short converted_value;
    const float scaled_value =
        in_buf[i] * (kShrtFltConvFactor / scale_down_factor);

    // Saturate the output
    if (scaled_value >= SHRT_MAX) {
      converted_value = SHRT_MAX;
    } else if (scaled_value <= SHRT_MIN) {
      converted_value = SHRT_MIN;
    } else {
      converted_value = static_cast<short>(scaled_value);
    }
    out_buf[i + n_prefix] = converted_value;
  }
  // Prepend the cyclic prefix: copy the last n_prefix samples to the front
  for (size_t i = 0; i < n_prefix; i++) {
    out_buf[i] = out_buf[i + n_elems];
  }
}
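
// Saturation example (values hypothetical):
//   float in[2] = {1.5f, -0.25f};
//   short out[2];
//   ConvertFloatToShort(in, out, 2);
//   // 1.5f * 32768 = 49152 clamps to SHRT_MAX (32767);
//   // -0.25f * 32768 = -8192 converts directly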

// Convert a float array [in_buf] to a short array [out_buf]. Input array must
// have [n_elems] elements. Output array must have [n_elems + n_prefix]
// elements.
// in_buf and out_buf must be 64-byte aligned
// n_elems must be a multiple of 8 for AVX2 and 16 for AVX512
// scale_down_factor is used for scaling down values in the input array
static inline void SimdConvertFloatToShort(const float* in_buf, short* out_buf,
                                           size_t n_elems, size_t n_prefix = 0,
                                           float scale_down_factor = 1.0f) {
#if defined(__AVX512F__)
  SimdConvertFloatToShortAVX512(in_buf, out_buf, n_elems, n_prefix,
                                scale_down_factor);
#else
  SimdConvertFloatToShortAVX2(in_buf, out_buf, n_elems, n_prefix,
                              scale_down_factor);
#endif
}
277 
278 //Assumes complex float == float float
279 static inline void SimdConvertCxFloatToCxShort(
280  const std::complex<float>* in_buf, std::complex<short>* out_buf,
281  size_t n_elems, size_t n_prefix, float scale_down_factor) {
282  const auto* in = reinterpret_cast<const float*>(in_buf);
283  auto* out = reinterpret_cast<short*>(out_buf);
284 #if defined(__AVX512F__)
285  SimdConvertFloatToShortAVX512(in, out, n_elems * 2, n_prefix * 2,
286  scale_down_factor);
287 #else
288  SimdConvertFloatToShortAVX2(in, out, n_elems * 2, n_prefix * 2,
289  scale_down_factor);
290 #endif
291 }
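
// Usage sketch (sizes hypothetical):
//   alignas(64) std::complex<float> sym_f[256];
//   alignas(64) std::complex<short> sym_s[256 + 32];
//   SimdConvertCxFloatToCxShort(sym_f, sym_s, 256, 32, 1.0f);
//   // internally converts 512 floats and prepends a 64-short (32-sample)
//   // cyclic prefix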

// Convert a float IQ array [in_buf] to an uint8_t IQ array [out_buf].
// Each float is converted to 12-bit data (2 floats corresponds to 3 uint8_t).
// Input array must have [n_elems] elements.
// Output array must have [n_elems / 2 * 3] elements.
// n_elems must be a multiple of 2
static inline void ConvertFloatTo12bitIq(const float* in_buf, uint8_t* out_buf,
                                         size_t n_elems) {
#if defined(DATATYPE_MEMORY_CHECK)
  RtAssert((n_elems % 2) == 0,
           "ConvertFloatTo12bitIq n_elems not multiple of 2");
#endif
  size_t index_short = 0;
  for (size_t i = 0; i < n_elems; i = i + 2) {
    // Scale by 4 * kShrtFltConvFactor (= 2^17); go through a signed 32-bit
    // intermediate so the narrowing to 16 bits is well-defined truncation
    const auto temp_i = static_cast<unsigned short>(
        static_cast<int32_t>(in_buf[i] * kShrtFltConvFactor * 4));
    const auto temp_q = static_cast<unsigned short>(
        static_cast<int32_t>(in_buf[i + 1] * kShrtFltConvFactor * 4));
    // Take the higher 12 bits and ignore the lower 4 bits
    out_buf[index_short] = (uint8_t)(temp_i >> 4);
    out_buf[index_short + 1] =
        ((uint8_t)(temp_i >> 12)) | ((uint8_t)(temp_q & 0xf0));
    out_buf[index_short + 2] = (uint8_t)(temp_q >> 8);
    if (kDebug12BitIQ) {
      std::cout << "i: " << i << " " << std::bitset<16>(temp_i) << " "
                << std::bitset<16>(temp_q) << " => "
                << std::bitset<8>(out_buf[index_short]) << " "
                << std::bitset<8>(out_buf[index_short + 1]) << " "
                << std::bitset<8>(out_buf[index_short + 2]) << std::endl;
      std::printf("Original: %.4f+%.4fi \n", in_buf[i], in_buf[i + 1]);
    }
    index_short += 3;
  }
}
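
// Resulting byte layout for one I/Q pair (temp_i/temp_q hold the 12-bit
// sample in bits 15..4):
//   out[0] = i[11:4]
//   out[1] = i[15:12] (low nibble) | q[7:4] (high nibble)
//   out[2] = q[15:8]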

#ifdef __AVX512F__
static inline void SimdConvert16bitIqToFloat(__m256i val, float* out_buf,
                                             __m512 magic, __m512i magic_i) {
  /* interleave with 0x0000 */
  __m512i val_unpacked = _mm512_cvtepu16_epi32(val);  // port 5
  /* convert by xor-ing and subtracting magic value:
   * VPXOR avoids port5 bottlenecks on Intel CPUs before SKL */
  __m512i val_f_int = _mm512_xor_si512(val_unpacked, magic_i);  // port 0,1,5
  __m512 val_f = _mm512_castsi512_ps(val_f_int);   // no instruction
  __m512 converted = _mm512_sub_ps(val_f, magic);  // port 1,5 ?
  _mm512_store_ps(out_buf, converted);             // port 2,3,4,7
}
#endif

static inline void Convert12bitIqTo16bitIq(uint8_t* in_buf, uint16_t* out_buf,
                                           size_t n_elems) {
#if defined(DATATYPE_MEMORY_CHECK)
  RtAssert(((n_elems % 16) == 0) &&
               ((reinterpret_cast<intptr_t>(in_buf) % kAvx2Bytes) == 0) &&
               ((reinterpret_cast<intptr_t>(out_buf) % kAvx2Bytes) == 0),
           "Convert12bitIqTo16bitIq: Data Alignment not correct before calling "
           "into AVX optimizations");
#endif
  // Each iteration consumes 48 input bytes (16 IQ samples) and emits 32
  // uint16 words
  for (size_t i = 0; i < n_elems; i += 16) {
    __m256i temp_i =
        _mm256_setr_epi16(*(uint16_t*)in_buf, *(uint16_t*)(in_buf + 3),
                          *(uint16_t*)(in_buf + 6), *(uint16_t*)(in_buf + 9),
                          *(uint16_t*)(in_buf + 12), *(uint16_t*)(in_buf + 15),
                          *(uint16_t*)(in_buf + 18), *(uint16_t*)(in_buf + 21),
                          *(uint16_t*)(in_buf + 24), *(uint16_t*)(in_buf + 27),
                          *(uint16_t*)(in_buf + 30), *(uint16_t*)(in_buf + 33),
                          *(uint16_t*)(in_buf + 36), *(uint16_t*)(in_buf + 39),
                          *(uint16_t*)(in_buf + 42), *(uint16_t*)(in_buf + 45));

    __m256i mask_q = _mm256_set1_epi16(0xfff0);
    __m256i temp_q =
        _mm256_setr_epi16(*(uint16_t*)(in_buf + 1), *(uint16_t*)(in_buf + 4),
                          *(uint16_t*)(in_buf + 7), *(uint16_t*)(in_buf + 10),
                          *(uint16_t*)(in_buf + 13), *(uint16_t*)(in_buf + 16),
                          *(uint16_t*)(in_buf + 19), *(uint16_t*)(in_buf + 22),
                          *(uint16_t*)(in_buf + 25), *(uint16_t*)(in_buf + 28),
                          *(uint16_t*)(in_buf + 31), *(uint16_t*)(in_buf + 34),
                          *(uint16_t*)(in_buf + 37), *(uint16_t*)(in_buf + 40),
                          *(uint16_t*)(in_buf + 43), *(uint16_t*)(in_buf + 46));

    temp_q = _mm256_and_si256(temp_q, mask_q);  // Set lower 4 bits to 0
    temp_i = _mm256_slli_epi16(temp_i, 4);      // Shift left by 4 bits

    __m256i iq_0 = _mm256_unpacklo_epi16(temp_i, temp_q);
    __m256i iq_1 = _mm256_unpackhi_epi16(temp_i, temp_q);
    __m256i output_0 = _mm256_permute2f128_si256(iq_0, iq_1, 0x20);
    __m256i output_1 = _mm256_permute2f128_si256(iq_0, iq_1, 0x31);
    _mm256_store_si256((__m256i*)(out_buf + i * 2), output_0);
    _mm256_store_si256((__m256i*)(out_buf + i * 2 + 16), output_1);
    in_buf += 48;  // advance to the next 16 packed IQ samples
  }

  // Scalar reference (disabled):
  // for (size_t i = 0; i < n_elems; i++) {
  //   out_buf[i * 2]
  //       = (((uint16_t)in_buf[i * 3]) << 8) | (in_buf[i * 3 + 1] & 0xf0);
  //   out_buf[i * 2 + 1] = (uint16_t)in_buf[i * 3 + 2] << 4
  //       | ((uint16_t)(in_buf[i * 3 + 1] & 0xf) << 12);
  //   if (kDebug12BitIQ) {
  //     std::cout << "i: " << i << " " << std::bitset<8>(in_buf[i * 3]) << " "
  //               << std::bitset<8>(in_buf[i * 3 + 1]) << " "
  //               << std::bitset<8>(in_buf[i * 3 + 2]) << "=>"
  //               << std::bitset<16>(out_buf[i * 2]) << " "
  //               << std::bitset<16>(out_buf[i * 2 + 1]) << std::endl;
  //   }
  // }
}
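
// Usage sketch (sizes hypothetical; n_elems counts IQ samples, 16 per
// 48-byte input block):
//   alignas(32) uint8_t packed[48];
//   alignas(32) uint16_t unpacked[32];
//   Convert12bitIqTo16bitIq(packed, unpacked, 16);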

// Convert an uint8_t IQ array [in_buf] to a float IQ array [out_buf].
// Each 12-bit I/Q is converted to a float (3 uint8_t corresponds to 2 floats).
// Input array must have [n_elems] elements.
// Output array must have [n_elems / 3 * 2] elements.
// n_elems must be a multiple of 3
// in_16bits_buf is aligned scratch space used by the AVX2 path
static inline void SimdConvert12bitIqToFloat(const uint8_t* in_buf,
                                             float* out_buf,
                                             uint16_t* in_16bits_buf,
                                             size_t n_elems) {
  unused(in_16bits_buf);
#ifdef __AVX512F__
  // The divisor is 131072 (= 32768 * 4 = 2^17), matching the 2^17 scaling in
  // ConvertFloatTo12bitIq: the 12-bit samples sit in bits 15..4 of each
  // 16-bit word after the shift below
  const __m512 magic = _mm512_set1_ps(float((1 << 23) + (1 << 15)) / 131072.f);
  const __m512i magic_i = _mm512_castps_si512(magic);
#else
  const __m256 magic = _mm256_set1_ps(float((1 << 23) + (1 << 15)) / 131072.f);
  const __m256i magic_i = _mm256_castps_si256(magic);
#endif
#ifdef __AVX512F__
  for (size_t i = 0; i < n_elems / 3; i += 32) {
    __m512i temp_i =
        _mm512_set_epi16(*(uint16_t*)(in_buf + 93), *(uint16_t*)(in_buf + 90),
                         *(uint16_t*)(in_buf + 87), *(uint16_t*)(in_buf + 84),
                         *(uint16_t*)(in_buf + 81), *(uint16_t*)(in_buf + 78),
                         *(uint16_t*)(in_buf + 75), *(uint16_t*)(in_buf + 72),
                         *(uint16_t*)(in_buf + 69), *(uint16_t*)(in_buf + 66),
                         *(uint16_t*)(in_buf + 63), *(uint16_t*)(in_buf + 60),
                         *(uint16_t*)(in_buf + 57), *(uint16_t*)(in_buf + 54),
                         *(uint16_t*)(in_buf + 51), *(uint16_t*)(in_buf + 48),
                         *(uint16_t*)(in_buf + 45), *(uint16_t*)(in_buf + 42),
                         *(uint16_t*)(in_buf + 39), *(uint16_t*)(in_buf + 36),
                         *(uint16_t*)(in_buf + 33), *(uint16_t*)(in_buf + 30),
                         *(uint16_t*)(in_buf + 27), *(uint16_t*)(in_buf + 24),
                         *(uint16_t*)(in_buf + 21), *(uint16_t*)(in_buf + 18),
                         *(uint16_t*)(in_buf + 15), *(uint16_t*)(in_buf + 12),
                         *(uint16_t*)(in_buf + 9), *(uint16_t*)(in_buf + 6),
                         *(uint16_t*)(in_buf + 3), *(uint16_t*)(in_buf + 0));

    __m512i mask_q = _mm512_set1_epi16(0xfff0);
    __m512i temp_q =
        _mm512_set_epi16(*(uint16_t*)(in_buf + 94), *(uint16_t*)(in_buf + 91),
                         *(uint16_t*)(in_buf + 88), *(uint16_t*)(in_buf + 85),
                         *(uint16_t*)(in_buf + 82), *(uint16_t*)(in_buf + 79),
                         *(uint16_t*)(in_buf + 76), *(uint16_t*)(in_buf + 73),
                         *(uint16_t*)(in_buf + 70), *(uint16_t*)(in_buf + 67),
                         *(uint16_t*)(in_buf + 64), *(uint16_t*)(in_buf + 61),
                         *(uint16_t*)(in_buf + 58), *(uint16_t*)(in_buf + 55),
                         *(uint16_t*)(in_buf + 52), *(uint16_t*)(in_buf + 49),
                         *(uint16_t*)(in_buf + 46), *(uint16_t*)(in_buf + 43),
                         *(uint16_t*)(in_buf + 40), *(uint16_t*)(in_buf + 37),
                         *(uint16_t*)(in_buf + 34), *(uint16_t*)(in_buf + 31),
                         *(uint16_t*)(in_buf + 28), *(uint16_t*)(in_buf + 25),
                         *(uint16_t*)(in_buf + 22), *(uint16_t*)(in_buf + 19),
                         *(uint16_t*)(in_buf + 16), *(uint16_t*)(in_buf + 13),
                         *(uint16_t*)(in_buf + 10), *(uint16_t*)(in_buf + 7),
                         *(uint16_t*)(in_buf + 4), *(uint16_t*)(in_buf + 1));

    temp_q = _mm512_and_si512(temp_q, mask_q);  // Set lower 4 bits to 0
    temp_i = _mm512_slli_epi16(temp_i, 4);      // Shift left by 4 bits

    __m512i iq_0 = _mm512_unpacklo_epi16(temp_i, temp_q);
    __m512i iq_1 = _mm512_unpackhi_epi16(temp_i, temp_q);
    __m512i output_0 = _mm512_permutex2var_epi64(
        iq_0, _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0), iq_1);
    __m512i output_1 = _mm512_permutex2var_epi64(
        iq_0, _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4), iq_1);

    SimdConvert16bitIqToFloat(_mm512_extracti64x4_epi64(output_0, 0),
                              out_buf + i * 2, magic, magic_i);
    SimdConvert16bitIqToFloat(_mm512_extracti64x4_epi64(output_0, 1),
                              out_buf + i * 2 + 16, magic, magic_i);
    SimdConvert16bitIqToFloat(_mm512_extracti64x4_epi64(output_1, 0),
                              out_buf + i * 2 + 32, magic, magic_i);
    SimdConvert16bitIqToFloat(_mm512_extracti64x4_epi64(output_1, 1),
                              out_buf + i * 2 + 48, magic, magic_i);
    in_buf += 96;
  }

#else
  for (size_t i = 0; i < n_elems / 3; i += 16) {
    // Convert 16 IQ samples from 48 uint8_t to 32 shorts
    // convert_12bit_iq_to_16bit_iq(in_buf + i * 3, in_16bits_buf, 16);
    __m256i temp_i =
        _mm256_setr_epi16(*(uint16_t*)in_buf, *(uint16_t*)(in_buf + 3),
                          *(uint16_t*)(in_buf + 6), *(uint16_t*)(in_buf + 9),
                          *(uint16_t*)(in_buf + 12), *(uint16_t*)(in_buf + 15),
                          *(uint16_t*)(in_buf + 18), *(uint16_t*)(in_buf + 21),
                          *(uint16_t*)(in_buf + 24), *(uint16_t*)(in_buf + 27),
                          *(uint16_t*)(in_buf + 30), *(uint16_t*)(in_buf + 33),
                          *(uint16_t*)(in_buf + 36), *(uint16_t*)(in_buf + 39),
                          *(uint16_t*)(in_buf + 42), *(uint16_t*)(in_buf + 45));

    __m256i mask_q = _mm256_set1_epi16(0xfff0);
    __m256i temp_q =
        _mm256_setr_epi16(*(uint16_t*)(in_buf + 1), *(uint16_t*)(in_buf + 4),
                          *(uint16_t*)(in_buf + 7), *(uint16_t*)(in_buf + 10),
                          *(uint16_t*)(in_buf + 13), *(uint16_t*)(in_buf + 16),
                          *(uint16_t*)(in_buf + 19), *(uint16_t*)(in_buf + 22),
                          *(uint16_t*)(in_buf + 25), *(uint16_t*)(in_buf + 28),
                          *(uint16_t*)(in_buf + 31), *(uint16_t*)(in_buf + 34),
                          *(uint16_t*)(in_buf + 37), *(uint16_t*)(in_buf + 40),
                          *(uint16_t*)(in_buf + 43), *(uint16_t*)(in_buf + 46));

    temp_q = _mm256_and_si256(temp_q, mask_q);  // Set lower 4 bits to 0
    temp_i = _mm256_slli_epi16(temp_i, 4);      // Shift left by 4 bits

    __m256i iq_0 = _mm256_unpacklo_epi16(temp_i, temp_q);
    __m256i iq_1 = _mm256_unpackhi_epi16(temp_i, temp_q);
    __m256i output_0 = _mm256_permute2f128_si256(iq_0, iq_1, 0x20);
    __m256i output_1 = _mm256_permute2f128_si256(iq_0, iq_1, 0x31);

    _mm256_store_si256((__m256i*)(in_16bits_buf), output_0);
    _mm256_store_si256((__m256i*)(in_16bits_buf + 16), output_1);

    // Convert short to float
    for (size_t j = 0; j < 2; j++) {
      /* get input */
      __m128i val =
          _mm_load_si128((__m128i*)(in_16bits_buf + j * 16));  // port 2,3

      __m128i val1 = _mm_load_si128((__m128i*)(in_16bits_buf + j * 16 + 8));
      /* interleave with 0x0000 */
      __m256i val_unpacked = _mm256_cvtepu16_epi32(val);  // port 5
      /* convert by xor-ing and subtracting magic value:
       * VPXOR avoids port5 bottlenecks on Intel CPUs before SKL */
      __m256i val_f_int =
          _mm256_xor_si256(val_unpacked, magic_i);  // port 0,1,5
      __m256 val_f = _mm256_castsi256_ps(val_f_int);   // no instruction
      __m256 converted = _mm256_sub_ps(val_f, magic);  // port 1,5 ?
      _mm256_store_ps(out_buf + i * 2 + j * 16, converted);  // port 2,3,4,7

      __m256i val_unpacked1 = _mm256_cvtepu16_epi32(val1);  // port 5
      __m256i val_f_int1 =
          _mm256_xor_si256(val_unpacked1, magic_i);  // port 0,1,5
      __m256 val_f1 = _mm256_castsi256_ps(val_f_int1);   // no instruction
      __m256 converted1 = _mm256_sub_ps(val_f1, magic);  // port 1,5 ?
      _mm256_store_ps(out_buf + i * 2 + j * 16 + 8,
                      converted1);  // port 2,3,4,7
    }
    in_buf += 48;
  }
#endif
}
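
// Usage sketch (sizes hypothetical). n_elems counts input bytes; each loop
// iteration above consumes 96 bytes (AVX512) or 48 bytes (AVX2), so keep it
// a multiple of 96 to satisfy both paths:
//   alignas(64) uint8_t packed[96];     // 32 packed 12-bit IQ samples
//   alignas(64) uint16_t scratch[64];   // scratch for the AVX2 path
//   alignas(64) float iq_f[64];         // n_elems / 3 * 2 outputs
//   SimdConvert12bitIqToFloat(packed, iq_f, scratch, 96);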

// Convert a float16 array [in_buf] to a float32 array [out_buf]. Each array
// must have [n_elems] elements
// in_buf and out_buf must be 64-byte aligned
// n_elems must be a multiple of 16
static inline void SimdConvertFloat16ToFloat32(float* out_buf,
                                               const float* in_buf,
                                               size_t n_elems) {
#if defined(DATATYPE_MEMORY_CHECK)
  RtAssert(((n_elems % 16) == 0) &&
               ((reinterpret_cast<intptr_t>(in_buf) % 64) == 0) &&
               ((reinterpret_cast<intptr_t>(out_buf) % 64) == 0),
           "SimdConvertFloat16ToFloat32: Data Alignment not correct before "
           "calling into AVX optimizations");
#endif
#ifdef __AVX512F__
  for (size_t i = 0; i < n_elems; i += 16) {
    __m256i val_a = _mm256_load_si256((__m256i*)(in_buf + i / 2));
    __m512 val = _mm512_cvtph_ps(val_a);
    _mm512_store_ps(out_buf + i, val);
  }
#else
  for (size_t i = 0; i < n_elems; i += 8) {
    __m128i val_a = _mm_load_si128((__m128i*)(in_buf + i / 2));
    __m256 val = _mm256_cvtph_ps(val_a);
    _mm256_store_ps(out_buf + i, val);
  }
#endif
}
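
// Usage sketch (sizes hypothetical). in_buf is typed float* but carries
// packed float16 data, so it needs only n_elems / 2 float slots:
//   alignas(64) float f16_packed[32];   // 64 half-precision values
//   alignas(64) float f32[64];
//   SimdConvertFloat16ToFloat32(f32, f16_packed, 64);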

// Convert a float32 array [in_buf] to a float16 array [out_buf]. Each array
// must have [n_elems] elements
// in_buf and out_buf must be 64-byte aligned
// n_elems must be a multiple of 16
static inline void SimdConvertFloat32ToFloat16(float* out_buf,
                                               const float* in_buf,
                                               size_t n_elems) {
#if defined(DATATYPE_MEMORY_CHECK)
  RtAssert(((n_elems % 16) == 0) &&
               ((reinterpret_cast<intptr_t>(in_buf) % 64) == 0) &&
               ((reinterpret_cast<intptr_t>(out_buf) % 64) == 0),
           "SimdConvertFloat32ToFloat16: Data Alignment not correct before "
           "calling into AVX optimizations");
#endif

#ifdef __AVX512F__
  for (size_t i = 0; i < n_elems; i += 16) {
    __m512 val_a = _mm512_load_ps(in_buf + i);
    __m256i val = _mm512_cvtps_ph(val_a, _MM_FROUND_NO_EXC);
    _mm256_store_si256(reinterpret_cast<__m256i*>(out_buf + i / 2), val);
  }
#else
  for (size_t i = 0; i < n_elems; i += 8) {
    __m256 val_a = _mm256_load_ps(in_buf + i);
    __m128i val = _mm256_cvtps_ph(val_a, _MM_FROUND_NO_EXC);
    _mm_store_si128(reinterpret_cast<__m128i*>(out_buf + i / 2), val);
  }
#endif
}
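
// Usage sketch (sizes hypothetical; the inverse of the conversion above):
//   alignas(64) float f32[64];
//   alignas(64) float f16_packed[32];
//   SimdConvertFloat32ToFloat16(f16_packed, f32, 64);
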
#endif  // DATATYPE_CONVERSION_H_