15 #include <sys/resource.h>
16 #include <pmmintrin.h>
17 #include <emmintrin.h>
18 #include <xmmintrin.h>
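/* Encoder: RATE output symbols are produced per input bit; the +6 below is the (K-1) tail bits that flush the encoder's shift register. */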
41 int symbol_count = RATE * (data_bits + 6);
42 memset(symbols, 0, symbol_count);
48 for(int i = 0; i < data_bits+(K-1); i++) {
52 int bit = (b >> (7 - j)) & 1;
55 for(int k = 0; k < RATE; k++)
57 int m = sr & polys[k];
58 int par = parity(sr & polys[k]);
59 symbols[index++] = par;
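/* Decoder setup: path metrics are initialized to 63, and Branchtab holds 0 or 255 for each (polynomial, state) pair, i.e. the symbol expected on that branch. */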
73 vp->metrics1.t[i] = 63;
87 for (state=0;state < NUMSTATES/2;state++) {
88 for (i=0; i<RATE; i++) {
89 Branchtab[i*NUMSTATES/2+state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
94 if (posix_memalign((void**)&vp, 16, sizeof(struct v)))
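/* Chainback: walk the stored decision bits backwards from endstate, shifting one decoded bit out per step; ADDSHIFT/SUBSHIFT keep the (K-1)-bit state aligned within a byte. */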
111 unsigned int endstate)
118 #define ADDSHIFT (8-(K-1))
122 #define SUBSHIFT ((K-1)-8)
138 while (nbits-- != 0) {
139 int k = (d[nbits].w[(endstate >> ADDSHIFT)/32] >> ((endstate >> ADDSHIFT)%32)) & 1;
140 endstate = (endstate >> 1) | (k << (K-2+ADDSHIFT));
141 data[nbits >> 3] = endstate >> SUBSHIFT;
193 for (int s = 0; s < nbits; s++)
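/* SPIRAL-generated add-compare-select kernel: each i9 iteration advances two trellis stages, computing new metrics in Y from X and then back in X from Y, while packing decision bits into dec. */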
208 void viterbi::FULL_SPIRAL(int nbits, unsigned char *Y, unsigned char *X, const unsigned char *syms, unsigned char *dec, unsigned char *Branchtab) {
209 for(int i9 = 0; i9 <= (nbits/2-1); i9++) {
210 unsigned char a75, a81;
212 short int s20, s21, s26, s27;
213 const unsigned char *a74, *a80, *b6;
214 short int *a110, *a111, *a91, *a93, *a94;
215 __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83
216 , *a95, *a96, *a97, *a98, *a99;
217 __m128i a105, a106, a86, a87;
218 __m128i a100, a101, a103, a104, a107, a108, a109
219 , a76, a78, a79, a82, a84, a85, a88, a89
220 , a90, d10, d11, d12, d9, m23, m24, m25
221 , m26, m27, m28, m29, m30, s18, s19, s22
222 , s23, s24, s25, s28, s29, t13, t14, t15
224 a71 = ((__m128i *) X);
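/* Branch metrics: broadcast the two received symbols, XOR them with Branchtab, average, and shift right to obtain a 6-bit metric t14 and its complement t15. */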
231 a76 = _mm_set1_epi8(a75);
232 a77 = ((__m128i *) Branchtab);
234 a79 = _mm_xor_si128(a76, a78);
238 a82 = _mm_set1_epi8(a81);
241 a85 = _mm_xor_si128(a82, a84);
242 t13 = _mm_avg_epu8(a79,a85);
243 a86 = ((__m128i ) t13);
244 a87 = _mm_srli_epi16(a86, 2);
245 a88 = ((__m128i ) a87);
246 t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
247 , 63, 63, 63, 63, 63, 63, 63, 63
249 t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
250 , 63, 63, 63, 63, 63, 63, 63, 63
252 m23 = _mm_adds_epu8(s18, t14);
253 m24 = _mm_adds_epu8(s19, t15);
254 m25 = _mm_adds_epu8(s18, t15);
255 m26 = _mm_adds_epu8(s19, t14);
256 a89 = _mm_min_epu8(m24, m23);
257 d9 = _mm_cmpeq_epi8(a89, m24);
258 a90 = _mm_min_epu8(m26, m25);
259 d10 = _mm_cmpeq_epi8(a90, m26);
260 s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10));
261 a91 = ((short int *) dec);
265 s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10));
268 s22 = _mm_unpacklo_epi8(a89, a90);
269 s23 = _mm_unpackhi_epi8(a89, a90);
270 a95 = ((__m128i *) Y);
280 a101 = _mm_xor_si128(a76, a100);
283 a104 = _mm_xor_si128(a82, a103);
284 t16 = _mm_avg_epu8(a101,a104);
285 a105 = ((__m128i ) t16);
286 a106 = _mm_srli_epi16(a105, 2);
287 a107 = ((__m128i ) a106);
288 t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
289 , 63, 63, 63, 63, 63, 63, 63, 63
291 t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
292 , 63, 63, 63, 63, 63, 63, 63, 63
294 m27 = _mm_adds_epu8(s24, t17);
295 m28 = _mm_adds_epu8(s25, t18);
296 m29 = _mm_adds_epu8(s24, t18);
297 m30 = _mm_adds_epu8(s25, t17);
298 a108 = _mm_min_epu8(m28, m27);
299 d11 = _mm_cmpeq_epi8(a108, m28);
300 a109 = _mm_min_epu8(m30, m29);
301 d12 = _mm_cmpeq_epi8(a109, m30);
302 s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12));
305 s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12));
308 s28 = _mm_unpacklo_epi8(a108, a109);
309 s29 = _mm_unpackhi_epi8(a108, a109);
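/* Renormalization: when the metric of state 0 in Y exceeds 210, a shift/min reduction finds the minimum over all 64 state metrics and subtracts it from every metric to avoid unsigned-byte saturation. */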
314 if ((((unsigned char *) Y)[0]>210)) {
316 m5 = ((__m128i *) Y)[0];
317 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
318 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
319 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
321 m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
322 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
323 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
324 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
325 m7 = _mm_unpacklo_epi8(m7, m7);
326 m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
327 m6 = _mm_unpacklo_epi64(m7, m7);
328 ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
329 ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
330 ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
331 ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
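/* Second stage of the unrolled pair: the same butterfly with the roles of X and Y exchanged, metrics read from Y and written back to X, followed by the matching renormalization on X. */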
333 unsigned char a188, a194;
335 short int s48, s49, s54, s55;
336 const unsigned char *a187, *a193, *b15;
337 short int *a204, *a206, *a207, *a223, *a224, *b16;
338 __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210
339 , *a211, *a212, *a215, *a225, *a226;
340 __m128i a199, a200, a218, a219;
341 __m128i a189, a191, a192, a195, a197, a198, a201
342 , a202, a203, a213, a214, a216, a217, a220, a221
343 , a222, d17, d18, d19, d20, m39, m40, m41
344 , m42, m43, m44, m45, m46, s46, s47, s50
345 , s51, s52, s53, s56, s57, t25, t26, t27
347 a184 = ((__m128i *) Y);
355 a189 = _mm_set1_epi8(a188);
356 a190 = ((__m128i *) Branchtab);
358 a192 = _mm_xor_si128(a189, a191);
361 a195 = _mm_set1_epi8(a194);
364 a198 = _mm_xor_si128(a195, a197);
365 t25 = _mm_avg_epu8(a192,a198);
366 a199 = ((__m128i ) t25);
367 a200 = _mm_srli_epi16(a199, 2);
368 a201 = ((__m128i ) a200);
369 t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
370 , 63, 63, 63, 63, 63, 63, 63, 63
372 t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
373 , 63, 63, 63, 63, 63, 63, 63, 63
375 m39 = _mm_adds_epu8(s46, t26);
376 m40 = _mm_adds_epu8(s47, t27);
377 m41 = _mm_adds_epu8(s46, t27);
378 m42 = _mm_adds_epu8(s47, t26);
379 a202 = _mm_min_epu8(m40, m39);
380 d17 = _mm_cmpeq_epi8(a202, m40);
381 a203 = _mm_min_epu8(m42, m41);
382 d18 = _mm_cmpeq_epi8(a203, m42);
383 s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18));
384 a204 = ((short int *) dec);
389 s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18));
392 s50 = _mm_unpacklo_epi8(a202, a203);
393 s51 = _mm_unpackhi_epi8(a202, a203);
394 a208 = ((__m128i *) X);
404 a214 = _mm_xor_si128(a189, a213);
407 a217 = _mm_xor_si128(a195, a216);
408 t28 = _mm_avg_epu8(a214,a217);
409 a218 = ((__m128i ) t28);
410 a219 = _mm_srli_epi16(a218, 2);
411 a220 = ((__m128i ) a219);
412 t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
413 , 63, 63, 63, 63, 63, 63, 63, 63
415 t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
416 , 63, 63, 63, 63, 63, 63, 63, 63
418 m43 = _mm_adds_epu8(s52, t29);
419 m44 = _mm_adds_epu8(s53, t30);
420 m45 = _mm_adds_epu8(s52, t30);
421 m46 = _mm_adds_epu8(s53, t29);
422 a221 = _mm_min_epu8(m44, m43);
423 d19 = _mm_cmpeq_epi8(a221, m44);
424 a222 = _mm_min_epu8(m46, m45);
425 d20 = _mm_cmpeq_epi8(a222, m46);
426 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
429 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
432 s56 = _mm_unpacklo_epi8(a221, a222);
433 s57 = _mm_unpackhi_epi8(a221, a222);
438 if ((((unsigned char *) X)[0]>210)) {
440 m12 = ((__m128i *) X)[0];
441 m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
442 m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
443 m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
445 m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
446 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
447 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
448 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
449 m14 = _mm_unpacklo_epi8(m14, m14);
450 m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
451 m13 = _mm_unpacklo_epi64(m14, m14);
452 ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
453 ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
454 ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
455 ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);