#include <sys/resource.h>
#include <pmmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
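/* <xmmintrin.h>, <emmintrin.h> and <pmmintrin.h> provide the SSE, SSE2 and
   SSE3 intrinsics used by FULL_SPIRAL below; <sys/resource.h> presumably
   serves other parts of the file (e.g. timing or priority calls). */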
    int symbol_count = RATE * (data_bits + 6);   /* RATE coded bits per data bit; the +6 accounts for the encoder tail (flush) bits */

    memset(symbols, 0, symbol_count);
 
    for (int i = 0; i < data_bits + (K - 1); i++) {

        int bit = (b >> (7 - j)) & 1;

        for (int k = 0; k < RATE; k++) {
            int m = sr & polys[k];     /* shift-register bits selected by polynomial k */
            int par = parity(m);       /* coded output bit = parity of the masked register */
            symbols[index++] = par;
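/* parity() is called above but not shown in this excerpt. A minimal sketch,
   assuming it returns the XOR of all bits (even/odd parity) of its argument;
   on GCC/Clang, __builtin_parity() computes the same thing: */
static inline int parity(int x)
{
    x ^= x >> 16;   /* fold the upper 16 bits onto the lower 16 */
    x ^= x >> 8;
    x ^= x >> 4;
    x ^= x >> 2;
    x ^= x >> 1;
    return x & 1;   /* 1 if an odd number of bits were set */
}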
 
    vp->metrics1.t[i] = 63;   /* bias every path metric to 63 */
 
    for (state = 0; state < NUMSTATES / 2; state++) {
        for (i = 0; i < RATE; i++) {
            Branchtab[i * NUMSTATES / 2 + state] =
                (polys[i] < 0) ^ parity((2 * state) & abs(polys[i])) ? 255 : 0;
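/* Each Branchtab entry holds the soft symbol (0 or 255) that the encoder
   would emit for polynomial i on the transition out of a given state; a
   negative polys[i] marks an inverted output bit. The 0/255 scaling lets
   the XORs in FULL_SPIRAL measure distance to the received soft symbols. */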
 
    if (posix_memalign((void **) &vp, 16, sizeof(struct v)))
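/* The 16-byte boundary requested from posix_memalign() ensures the decoder
   state in struct v can be accessed with aligned __m128i loads and stores,
   which the SSE code below relies on. */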
 
              unsigned int endstate)   /* final parameter of the chainback (traceback) routine */
 
#define ADDSHIFT (8 - (K - 1))
#define SUBSHIFT ((K - 1) - 8)
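/* These two definitions are alternatives: in the full source they are
   presumably guarded by preprocessor tests on K, in the usual ka9q style
   ((K-1) < 8 picks ADDSHIFT with SUBSHIFT = 0, (K-1) > 8 the reverse), so
   that the state always maps onto an 8-bit index. */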
    while (nbits-- != 0) {
        /* extract the decision bit recorded for the current state */
        int k = (d[nbits].w[(endstate >> ADDSHIFT) / 32] >>
                 ((endstate >> ADDSHIFT) % 32)) & 1;
        /* step the state one stage back and re-insert the decision bit */
        endstate = (endstate >> 1) | (k << (K - 2 + ADDSHIFT));
        data[nbits >> 3] = endstate >> SUBSHIFT;
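/* Worked example, assuming K = 7 (the DVB-T code): ADDSHIFT = 8-(K-1) = 2
   and SUBSHIFT = 0, so the 6-bit state is shifted up to an 8-bit index, the
   decision bit k re-enters at bit K-2+ADDSHIFT = 7, and the top 8 bits of
   endstate land in data[nbits >> 3] as a decoded byte. */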
 
    for (int s = 0; s < nbits; s++)
 
void viterbi::FULL_SPIRAL(int nbits, unsigned char *Y, unsigned char *X,
                          const unsigned char *syms, unsigned char *dec,
                          unsigned char *Branchtab) {
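    /* Each pass of the loop below decodes two trellis stages: the first half
       reads the path metrics from X and writes the survivors to Y, the second
       half reads them back from Y into X, so the two metric buffers ping-pong
       without copying. Every _mm_movemask_epi8() packs 16 path decisions into
       a 16-bit word destined for dec. */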
 
    for (int i9 = 0; i9 <= (nbits / 2 - 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        const unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83,
                *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109,
                a76, a78, a79, a82, a84, a85, a88, a89,
                a90, d10, d11, d12, d9, m23, m24, m25,
                m26, m27, m28, m29, m30, s18, s19, s22,
                s23, s24, s25, s28, s29, t13, t14, t15,
                t16, t17, t18;

        /* first half-stage: metrics are read from X */
        a71 = ((__m128i *) X);
 
        a76 = _mm_set1_epi8(a75);        /* broadcast the first input symbol */
        a77 = ((__m128i *) Branchtab);

        a79 = _mm_xor_si128(a76, a78);   /* distance from the expected symbol */

        a82 = _mm_set1_epi8(a81);        /* broadcast the second input symbol */

        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);    /* average the two per-symbol distances */
        a86 = ((__m128i) t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i) a87);
        /* mask to 6-bit branch metrics; t15 is the complementary metric 63 - t14 */
        t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                              63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                         63, 63, 63, 63, 63, 63, 63, 63),
                            t14);
 
        /* add-compare-select: extend both candidate paths into each pair of
           successor states and keep the smaller metric */
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);   /* 0xFF in lanes where the second path won */
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int *) dec);

        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));

        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i *) Y);
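        /* The decision words s20/s21 are presumably stored through a91 into
           dec by the lines elided from this excerpt; s22/s23 interleave the
           surviving metrics of the two butterflies so the subsequent
           (elided) stores lay out the new metric array in Y. */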
 
        /* same branch-metric and add-compare-select computation for the
           second half of the states */
        a101 = _mm_xor_si128(a76, a100);

        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i) t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i) a106);
        t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                               63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                         63, 63, 63, 63, 63, 63, 63, 63),
                            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));

        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));

        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
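        /* Renormalization: the metrics are unsigned 8-bit values that
           saturate at 255, so once the smallest metric climbs past the 210
           threshold the code finds the minimum across all 64 state metrics,
           broadcasts it, and subtracts it from every metric, preserving
           path-metric differences while restoring headroom. */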
 
        if (((unsigned char *) Y)[0] > 210) {
            __m128i m5, m6;
            m5 = ((__m128i *) Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
            __m128i m7;
            /* horizontal minimum of the 16 bytes of m5 */
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 = _mm_min_epu8(_mm_srli_epi64(m7, 32), m7);
            m7 = _mm_min_epu8(_mm_srli_epi64(m7, 16), m7);
            m7 = _mm_min_epu8(_mm_srli_epi64(m7, 8), m7);
            /* broadcast the minimum to all 16 lanes */
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
            ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
            ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
            ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
        }
 
        /* second half-iteration: the metrics now flow from Y back into X */
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        const unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210,
                *a211, *a212, *a215, *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201,
                a202, a203, a213, a214, a216, a217, a220, a221,
                a222, d17, d18, d19, d20, m39, m40, m41,
                m42, m43, m44, m45, m46, s46, s47, s50,
                s51, s52, s53, s56, s57, t25, t26, t27,
                t28, t29, t30;

        a184 = ((__m128i *) Y);
 
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i *) Branchtab);

        a192 = _mm_xor_si128(a189, a191);

        a195 = _mm_set1_epi8(a194);

        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i) t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i) a200);
        t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                               63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                         63, 63, 63, 63, 63, 63, 63, 63),
                            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int *) dec);

        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));

        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i *) X);
 
        a214 = _mm_xor_si128(a189, a213);

        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i) t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i) a219);
        t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                               63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                                         63, 63, 63, 63, 63, 63, 63, 63),
                            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));

        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));

        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
 
        /* same renormalization as above, applied to the X metrics */
        if (((unsigned char *) X)[0] > 210) {
            __m128i m12, m13;
            m12 = ((__m128i *) X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = _mm_min_epu8(_mm_srli_epi64(m14, 32), m14);
            m14 = _mm_min_epu8(_mm_srli_epi64(m14, 16), m14);
            m14 = _mm_min_epu8(_mm_srli_epi64(m14, 8), m14);
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
            ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
            ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
            ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
        }