#include <immintrin.h>

/**
 * Control vector for _mm_shuffle_epi8 as used by Load() and Save(): byte i of
 * the result is taken from source byte MASK[i], and the pattern 03 02 01 00,
 * 07 06 05 04, ... reverses the byte order inside every 32-bit lane.
 */
const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

/**
 * Constant state halves that are added back into the working state after the
 * rounds. The eight 32-bit values are the SHA-256 initial hash words; listed
 * from the highest lane to the lowest, INIT0 holds H0, H1, H4, H5 and INIT1
 * holds H2, H3, H6, H7.
 */
const __m128i INIT0 = _mm_set_epi64x(0x6a09e667bb67ae85ull, 0x510e527f9b05688cull);
const __m128i INIT1 = _mm_set_epi64x(0x3c6ef372a54ff53aull, 0x1f83d9ab5be0cd19ull);
    /**
     * Apply four SHA-256 rounds to the state pair, feeding the round
     * instruction with a caller-supplied 128-bit operand only (no message
     * register is added).
     *
     * @param state0  first state register, updated in place
     * @param state1  second state register, updated in place
     * @param k1      upper 64 bits of the 128-bit round operand
     * @param k0      lower 64 bits of the 128-bit round operand
     */
    void inline __attribute__((always_inline)) QuadRound(__m128i& state0, __m128i& state1, uint64_t k1, uint64_t k0)
    {
        const __m128i msg = _mm_set_epi64x(k1, k0);
        /* Two rounds driven by the two low 32-bit lanes of msg. */
        state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
        /* Selector 0x0e moves lanes 2 and 3 down to lanes 0 and 1 so the
         * second pair of rounds consumes the upper half of msg. */
        state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(msg, 0x0e));
    }
    /**
     * Apply four SHA-256 rounds to the state pair, using message register m
     * with the 128-bit constant k1:k0 added lane by lane (32-bit wrapping add).
     *
     * @param state0  first state register, updated in place
     * @param state1  second state register, updated in place
     * @param m       four 32-bit message-schedule words
     * @param k1      upper 64 bits of the constant added to m
     * @param k0      lower 64 bits of the constant added to m
     */
    void inline __attribute__((always_inline)) QuadRound(__m128i& state0, __m128i& state1, __m128i m, uint64_t k1, uint64_t k0)
    {
        const __m128i msg = _mm_add_epi32(m, _mm_set_epi64x(k1, k0));
        /* Two rounds driven by the two low 32-bit lanes of msg. */
        state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
        /* Selector 0x0e moves lanes 2 and 3 down to lanes 0 and 1 so the
         * second pair of rounds consumes the upper half of msg. */
        state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(msg, 0x0e));
    }
    /**
     * First message-schedule step: replace m0 with
     * _mm_sha256msg1_epu32(m0, m1).
     *
     * @param m0  message register updated in place
     * @param m1  following message register (read only)
     */
    void inline __attribute__((always_inline)) ShiftMessageA(__m128i& m0, __m128i m1)
    {
        m0 = _mm_sha256msg1_epu32(m0, m1);
    }
    /**
     * Second message-schedule step: add to m2 the four lanes
     * {m0[1], m0[2], m0[3], m1[0]} and finish with
     * _mm_sha256msg2_epu32(..., m1). Only m2 is written; m0 is taken by
     * reference but left unmodified here.
     *
     * @param m0  message register supplying lanes 1..3 of the shifted window
     * @param m1  message register supplying lane 0 of the shifted window and
     *            the second operand of the msg2 step
     * @param m2  message register updated in place
     */
    void inline __attribute__((always_inline)) ShiftMessageC(__m128i& m0, __m128i m1, __m128i& m2)
    {
        /* alignr by 4 bytes slides the m1:m0 pair down by one 32-bit lane. */
        const __m128i window = _mm_alignr_epi8(m1, m0, 4);
        m2 = _mm_sha256msg2_epu32(_mm_add_epi32(m2, window), m1);
    }
    47 void inline __attribute__((always_inline)) ShiftMessageB(__m128i& m0, __m128i m1, __m128i& m2)
    49     ShiftMessageC(m0, m1, m2);
    50     ShiftMessageA(m0, m1);
    /**
     * Permute the two state registers in place. Writing 32-bit lanes from
     * lowest to highest, an input of s0 = {w0, w1, w2, w3} and
     * s1 = {w4, w5, w6, w7} becomes s0 = {w5, w4, w1, w0} and
     * s1 = {w7, w6, w3, w2}. Unshuffle() is the inverse.
     */
    void inline __attribute__((always_inline)) Shuffle(__m128i& s0, __m128i& s1)
    {
        /* 0xB1 swaps the lanes of each adjacent pair; 0x1B reverses all four lanes. */
        const __m128i t1 = _mm_shuffle_epi32(s0, 0xB1);
        const __m128i t2 = _mm_shuffle_epi32(s1, 0x1B);
        /* New s0: upper half of t2 in the low 64 bits, lower half of t1 above it. */
        s0 = _mm_alignr_epi8(t1, t2, 0x08);
        /* New s1: lower half of t2 in the low 64 bits, upper half of t1 above it. */
        s1 = _mm_blend_epi16(t2, t1, 0xF0);
    }
    /**
     * Inverse of Shuffle(): writing 32-bit lanes from lowest to highest, an
     * input of s0 = {w5, w4, w1, w0} and s1 = {w7, w6, w3, w2} becomes
     * s0 = {w0, w1, w2, w3} and s1 = {w4, w5, w6, w7}.
     */
    void inline __attribute__((always_inline)) Unshuffle(__m128i& s0, __m128i& s1)
    {
        /* 0x1B reverses all four lanes; 0xB1 swaps the lanes of each adjacent pair. */
        const __m128i t1 = _mm_shuffle_epi32(s0, 0x1B);
        const __m128i t2 = _mm_shuffle_epi32(s1, 0xB1);
        /* New s0: lower half of t1 in the low 64 bits, upper half of t2 above it. */
        s0 = _mm_blend_epi16(t1, t2, 0xF0);
        /* New s1: upper half of t1 in the low 64 bits, lower half of t2 above it. */
        s1 = _mm_alignr_epi8(t2, t1, 0x08);
    }
    69 __m128i 
inline  __attribute__((always_inline)) Load(
const unsigned char* in)
    71     return _mm_shuffle_epi8(_mm_loadu_si128((
const __m128i*)in), MASK);
    74 void inline  __attribute__((always_inline)) Save(
unsigned char* out, __m128i s)
    76     _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(s, MASK));
    /*
     * Transform: in the statements shown, the eight 32-bit words at `s` are
     * loaded into two 128-bit registers, sixteen QuadRound steps (four rounds
     * each, every step given its own pair of 64-bit constants) are run over
     * the message registers m0..m3, so0/so1 are added into the state, and the
     * state is written back to `s`.
     *
     * NOTE(review): this excerpt is incomplete. The embedded listing numbers
     * jump (81 to 83, 87 to 97, 129 to 132, 133 to 140), and in the lines that
     * are shown m0, so0 and so1 are read without being assigned, `blocks` is
     * never referenced and no braces are present. Presumably the dropped lines
     * hold the function braces, a per-block loop, the first message load, the
     * copy of the previous state and the advance of `chunk` - restore them
     * from the upstream file before relying on this text.
     */
    81 void Transform(uint32_t* s, 
const unsigned char* chunk, 
size_t blocks)
    83     __m128i m0, m1, m2, m3, s0, s1, so0, so1;
    /* s0 receives s[0..3] and s1 receives s[4..7]; both loads are unaligned. */
    86     s0 = _mm_loadu_si128((
const __m128i*)s);
    87     s1 = _mm_loadu_si128((
const __m128i*)(s + 4));
    /* First four steps: m1, m2 and m3 are loaded from chunk offsets 16, 32
     * and 48 (the load of m0 is not among the visible lines). */
    97         QuadRound(s0, s1, m0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull);
    98         m1 = Load(chunk + 16);
    99         QuadRound(s0, s1, m1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
   100         ShiftMessageA(m0, m1);
   101         m2 = Load(chunk + 32);
   102         QuadRound(s0, s1, m2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
   103         ShiftMessageA(m1, m2);
   104         m3 = Load(chunk + 48);
   105         QuadRound(s0, s1, m3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull);
    /* From here on every message register is regenerated by a ShiftMessageB
     * call immediately before the QuadRound step that consumes it. */
   106         ShiftMessageB(m2, m3, m0);
   107         QuadRound(s0, s1, m0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull);
   108         ShiftMessageB(m3, m0, m1);
   109         QuadRound(s0, s1, m1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
   110         ShiftMessageB(m0, m1, m2);
   111         QuadRound(s0, s1, m2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
   112         ShiftMessageB(m1, m2, m3);
   113         QuadRound(s0, s1, m3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
   114         ShiftMessageB(m2, m3, m0);
   115         QuadRound(s0, s1, m0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
   116         ShiftMessageB(m3, m0, m1);
   117         QuadRound(s0, s1, m1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
   118         ShiftMessageB(m0, m1, m2);
   119         QuadRound(s0, s1, m2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull);
   120         ShiftMessageB(m1, m2, m3);
   121         QuadRound(s0, s1, m3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
   122         ShiftMessageB(m2, m3, m0);
   123         QuadRound(s0, s1, m0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
   124         ShiftMessageB(m3, m0, m1);
   125         QuadRound(s0, s1, m1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
    /* The last two schedule updates use ShiftMessageC alone; no further
     * ShiftMessageA step is issued. */
   126         ShiftMessageC(m0, m1, m2);
   127         QuadRound(s0, s1, m2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
   128         ShiftMessageC(m1, m2, m3);
   129         QuadRound(s0, s1, m3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull);
    /* Add so0/so1 into the state lane by lane (their assignment is not among
     * the visible lines). */
   132         s0 = _mm_add_epi32(s0, so0);
   133         s1 = _mm_add_epi32(s1, so1);
    /* Write both state registers back to s[0..3] and s[4..7]. */
   140     _mm_storeu_si128((__m128i*)s, s0);
   141     _mm_storeu_si128((__m128i*)(s + 4), s1);
   /*
    * Body fragment of a routine that processes two independent lanes side by
    * side: every statement for lane "a" (am*, as*, aso*) is followed by the
    * same statement for lane "b" (bm*, bs*, bso*) with identical constants.
    *
    * NOTE(review): the enclosing signature is not part of this excerpt -
    * presumably Transform_2way(unsigned char* out, const unsigned char* in);
    * confirm against the upstream file. The embedded listing numbers also
    * jump in several places (for example 150 to 157, 226 to 233, 268 to 281),
    * so the statements that initialise as0/as1/bs0/bs1, most of the am and bm
    * loads, the aso and bso copies and any final store are not shown here.
    */
   149     __m128i am0, am1, am2, am3, as0, as1, aso0, aso1;
   150     __m128i bm0, bm1, bm2, bm3, bs0, bs1, bso0, bso1;
   /* First pass: sixteen QuadRound steps per lane over message registers. */
   157     QuadRound(as0, as1, am0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull);
   158     QuadRound(bs0, bs1, bm0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull);
   161     QuadRound(as0, as1, am1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
   162     QuadRound(bs0, bs1, bm1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
   163     ShiftMessageA(am0, am1);
   164     ShiftMessageA(bm0, bm1);
   167     QuadRound(as0, as1, am2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
   168     QuadRound(bs0, bs1, bm2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
   169     ShiftMessageA(am1, am2);
   170     ShiftMessageA(bm1, bm2);
   /* The only load visible in this pass: bm3 comes from offset 112 of `in`. */
   172     bm3 = Load(in + 112);
   173     QuadRound(as0, as1, am3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull);
   174     QuadRound(bs0, bs1, bm3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull);
   175     ShiftMessageB(am2, am3, am0);
   176     ShiftMessageB(bm2, bm3, bm0);
   177     QuadRound(as0, as1, am0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull);
   178     QuadRound(bs0, bs1, bm0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull);
   179     ShiftMessageB(am3, am0, am1);
   180     ShiftMessageB(bm3, bm0, bm1);
   181     QuadRound(as0, as1, am1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
   182     QuadRound(bs0, bs1, bm1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
   183     ShiftMessageB(am0, am1, am2);
   184     ShiftMessageB(bm0, bm1, bm2);
   185     QuadRound(as0, as1, am2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
   186     QuadRound(bs0, bs1, bm2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
   187     ShiftMessageB(am1, am2, am3);
   188     ShiftMessageB(bm1, bm2, bm3);
   189     QuadRound(as0, as1, am3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
   190     QuadRound(bs0, bs1, bm3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
   191     ShiftMessageB(am2, am3, am0);
   192     ShiftMessageB(bm2, bm3, bm0);
   193     QuadRound(as0, as1, am0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
   194     QuadRound(bs0, bs1, bm0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
   195     ShiftMessageB(am3, am0, am1);
   196     ShiftMessageB(bm3, bm0, bm1);
   197     QuadRound(as0, as1, am1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
   198     QuadRound(bs0, bs1, bm1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
   199     ShiftMessageB(am0, am1, am2);
   200     ShiftMessageB(bm0, bm1, bm2);
   201     QuadRound(as0, as1, am2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull);
   202     QuadRound(bs0, bs1, bm2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull);
   203     ShiftMessageB(am1, am2, am3);
   204     ShiftMessageB(bm1, bm2, bm3);
   205     QuadRound(as0, as1, am3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
   206     QuadRound(bs0, bs1, bm3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
   207     ShiftMessageB(am2, am3, am0);
   208     ShiftMessageB(bm2, bm3, bm0);
   209     QuadRound(as0, as1, am0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
   210     QuadRound(bs0, bs1, bm0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
   211     ShiftMessageB(am3, am0, am1);
   212     ShiftMessageB(bm3, bm0, bm1);
   213     QuadRound(as0, as1, am1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
   214     QuadRound(bs0, bs1, bm1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
   215     ShiftMessageC(am0, am1, am2);
   216     ShiftMessageC(bm0, bm1, bm2);
   217     QuadRound(as0, as1, am2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
   218     QuadRound(bs0, bs1, bm2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
   219     ShiftMessageC(am1, am2, am3);
   220     ShiftMessageC(bm1, bm2, bm3);
   221     QuadRound(as0, as1, am3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull);
   222     QuadRound(bs0, bs1, bm3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull);
   /* Add INIT0/INIT1 into the state of both lanes. */
   223     as0 = _mm_add_epi32(as0, INIT0);
   224     bs0 = _mm_add_epi32(bs0, INIT0);
   225     as1 = _mm_add_epi32(as1, INIT1);
   226     bs1 = _mm_add_epi32(bs1, INIT1);
   /*
    * Second pass: sixteen steps per lane with the QuadRound overload that
    * takes no message register, so each constant pair is the complete round
    * input. The first step's low word is the first-pass constant plus
    * 0x80000000 and the fourth step's high word is the first-pass constant
    * plus 0x200. NOTE(review): presumably every constant here has the words
    * of a fixed message block folded in - confirm against upstream.
    */
   233     QuadRound(as0, as1, 0xe9b5dba5b5c0fbcfull, 0x71374491c28a2f98ull);
   234     QuadRound(bs0, bs1, 0xe9b5dba5b5c0fbcfull, 0x71374491c28a2f98ull);
   235     QuadRound(as0, as1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
   236     QuadRound(bs0, bs1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
   237     QuadRound(as0, as1, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
   238     QuadRound(bs0, bs1, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
   239     QuadRound(as0, as1, 0xc19bf3749bdc06a7ull, 0x80deb1fe72be5d74ull);
   240     QuadRound(bs0, bs1, 0xc19bf3749bdc06a7ull, 0x80deb1fe72be5d74ull);
   241     QuadRound(as0, as1, 0x240cf2540fe1edc6ull, 0xf0fe4786649b69c1ull);
   242     QuadRound(bs0, bs1, 0x240cf2540fe1edc6ull, 0xf0fe4786649b69c1ull);
   243     QuadRound(as0, as1, 0x16f988fa61b9411eull, 0x6cc984be4fe9346full);
   244     QuadRound(bs0, bs1, 0x16f988fa61b9411eull, 0x6cc984be4fe9346full);
   245     QuadRound(as0, as1, 0xb9d99ec7b019fc65ull, 0xa88e5a6df2c65152ull);
   246     QuadRound(bs0, bs1, 0xb9d99ec7b019fc65ull, 0xa88e5a6df2c65152ull);
   247     QuadRound(as0, as1, 0xc7353eb0fdb1232bull, 0xe70eeaa09a1231c3ull);
   248     QuadRound(bs0, bs1, 0xc7353eb0fdb1232bull, 0xe70eeaa09a1231c3ull);
   249     QuadRound(as0, as1, 0xdc1eeefd5a0f118full, 0xcb976d5f3069bad5ull);
   250     QuadRound(bs0, bs1, 0xdc1eeefd5a0f118full, 0xcb976d5f3069bad5ull);
   251     QuadRound(as0, as1, 0xe15d5b1658f4ca9dull, 0xde0b7a040a35b689ull);
   252     QuadRound(bs0, bs1, 0xe15d5b1658f4ca9dull, 0xde0b7a040a35b689ull);
   253     QuadRound(as0, as1, 0x6fab9537a507ea32ull, 0x37088980007f3e86ull);
   254     QuadRound(bs0, bs1, 0x6fab9537a507ea32ull, 0x37088980007f3e86ull);
   255     QuadRound(as0, as1, 0xc0bbbe37cdaa3b6dull, 0x0d8cd6f117406110ull);
   256     QuadRound(bs0, bs1, 0xc0bbbe37cdaa3b6dull, 0x0d8cd6f117406110ull);
   257     QuadRound(as0, as1, 0x6fd15ca70b02e931ull, 0xdb48a36383613bdaull);
   258     QuadRound(bs0, bs1, 0x6fd15ca70b02e931ull, 0xdb48a36383613bdaull);
   259     QuadRound(as0, as1, 0x6d4378906ed41a95ull, 0x31338431521afacaull);
   260     QuadRound(bs0, bs1, 0x6d4378906ed41a95ull, 0x31338431521afacaull);
   261     QuadRound(as0, as1, 0x532fb63cb5c9a0e6ull, 0x9eccabbdc39c91f2ull);
   262     QuadRound(bs0, bs1, 0x532fb63cb5c9a0e6ull, 0x9eccabbdc39c91f2ull);
   263     QuadRound(as0, as1, 0x4c191d76a4954b68ull, 0x07237ea3d2c741c6ull);
   264     QuadRound(bs0, bs1, 0x4c191d76a4954b68ull, 0x07237ea3d2c741c6ull);
   /* Add aso/bso into the state of each lane (their assignment is not among
    * the visible lines). */
   265     as0 = _mm_add_epi32(as0, aso0);
   266     bs0 = _mm_add_epi32(bs0, bso0);
   267     as1 = _mm_add_epi32(as1, aso1);
   268     bs1 = _mm_add_epi32(bs1, bso1);
   /*
    * Third pass: am0/am1 and bm0/bm1 are set by lines not shown in this
    * excerpt; the m2 and m3 registers are fixed constants assigned below.
    */
   281     QuadRound(as0, as1, am0, 0xe9b5dba5B5c0fbcfull, 0x71374491428a2f98ull);
   282     QuadRound(bs0, bs1, bm0, 0xe9b5dba5B5c0fbcfull, 0x71374491428a2f98ull);
   283     QuadRound(as0, as1, am1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
   284     QuadRound(bs0, bs1, bm1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
   285     ShiftMessageA(am0, am1);
   286     ShiftMessageA(bm0, bm1);
   /* m2 = {0x80000000, 0, 0, 0} (lowest lane first). The message-free
    * QuadRound that follows already carries it: its low word is the
    * first-pass constant plus 0x80000000. */
   287     bm2 = am2 = _mm_set_epi64x(0x0ull, 0x80000000ull);
   288     QuadRound(as0, as1, 0x550c7dc3243185beull, 0x12835b015807aa98ull);
   289     QuadRound(bs0, bs1, 0x550c7dc3243185beull, 0x12835b015807aa98ull);
   290     ShiftMessageA(am1, am2);
   291     ShiftMessageA(bm1, bm2);
   /* m3 = {0, 0, 0, 0x100} (lowest lane first). The message-free QuadRound
    * that follows already carries it: its high word is the first-pass
    * constant plus 0x100. */
   292     bm3 = am3 = _mm_set_epi64x(0x10000000000ull, 0x0ull);
   293     QuadRound(as0, as1, 0xc19bf2749bdc06a7ull, 0x80deb1fe72be5d74ull);
   294     QuadRound(bs0, bs1, 0xc19bf2749bdc06a7ull, 0x80deb1fe72be5d74ull);
   295     ShiftMessageB(am2, am3, am0);
   296     ShiftMessageB(bm2, bm3, bm0);
   297     QuadRound(as0, as1, am0, 0x240ca1cc0fc19dc6ull, 0xefbe4786e49b69c1ull);
   298     QuadRound(bs0, bs1, bm0, 0x240ca1cc0fc19dc6ull, 0xefbe4786e49b69c1ull);
   299     ShiftMessageB(am3, am0, am1);
   300     ShiftMessageB(bm3, bm0, bm1);
   301     QuadRound(as0, as1, am1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
   302     QuadRound(bs0, bs1, bm1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
   303     ShiftMessageB(am0, am1, am2);
   304     ShiftMessageB(bm0, bm1, bm2);
   305     QuadRound(as0, as1, am2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
   306     QuadRound(bs0, bs1, bm2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
   307     ShiftMessageB(am1, am2, am3);
   308     ShiftMessageB(bm1, bm2, bm3);
   309     QuadRound(as0, as1, am3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
   310     QuadRound(bs0, bs1, bm3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
   311     ShiftMessageB(am2, am3, am0);
   312     ShiftMessageB(bm2, bm3, bm0);
   313     QuadRound(as0, as1, am0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
   314     QuadRound(bs0, bs1, bm0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
   315     ShiftMessageB(am3, am0, am1);
   316     ShiftMessageB(bm3, bm0, bm1);
   317     QuadRound(as0, as1, am1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
   318     QuadRound(bs0, bs1, bm1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
   319     ShiftMessageB(am0, am1, am2);
   320     ShiftMessageB(bm0, bm1, bm2);
   321     QuadRound(as0, as1, am2, 0xc76c51a3c24b8b70ull, 0xa81a664ba2bfe8A1ull);
   322     QuadRound(bs0, bs1, bm2, 0xc76c51a3c24b8b70ull, 0xa81a664ba2bfe8A1ull);
   323     ShiftMessageB(am1, am2, am3);
   324     ShiftMessageB(bm1, bm2, bm3);
   325     QuadRound(as0, as1, am3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
   326     QuadRound(bs0, bs1, bm3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
   327     ShiftMessageB(am2, am3, am0);
   328     ShiftMessageB(bm2, bm3, bm0);
   329     QuadRound(as0, as1, am0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
   330     QuadRound(bs0, bs1, bm0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
   331     ShiftMessageB(am3, am0, am1);
   332     ShiftMessageB(bm3, bm0, bm1);
   333     QuadRound(as0, as1, am1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
   334     QuadRound(bs0, bs1, bm1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
   335     ShiftMessageC(am0, am1, am2);
   336     ShiftMessageC(bm0, bm1, bm2);
   337     QuadRound(as0, as1, am2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
   338     QuadRound(bs0, bs1, bm2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
   339     ShiftMessageC(am1, am2, am3);
   340     ShiftMessageC(bm1, bm2, bm3);
   341     QuadRound(as0, as1, am3, 0xc67178f2bef9a3f7ull, 0xa4506ceb90befffaull);
   342     QuadRound(bs0, bs1, bm3, 0xc67178f2bef9a3f7ull, 0xa4506ceb90befffaull);
   /* Add INIT0/INIT1 into the state of both lanes; whatever follows (such as
    * writing the result out) is not part of this excerpt. */
   343     as0 = _mm_add_epi32(as0, INIT0);
   344     bs0 = _mm_add_epi32(bs0, INIT0);
   345     as1 = _mm_add_epi32(as1, INIT1);
   346     bs1 = _mm_add_epi32(bs1, INIT1);
 void Transform_2way(unsigned char *out, const unsigned char *in)
 
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)