> 文章列表 > FIPS202:AVX2 版本的 C/C++ 实现

FIPS202:AVX2 版本的 C/C++ 实现

FIPS202:AVX2 版本的 C/C++ 实现

文章目录

  • Keccak4x
    • align.h
    • brg_endian.h
    • SIMD256-config.h
    • KeccakP-1600-times4-SnP.h
    • KeccakP-1600-unrolling.macros.h
    • KeccakP-1600-times4-SIMD256.c
  • fips202
    • fips202.h
    • fips202.c
    • fips202x4.h
    • fips202x4.c

花了好大力气,终于把 AVX2 版本的 FIPS202 改好了。兼容 VS2022 以及 GCC,另外在 fips202x4.c 中添加了一些函数。

原始版本的源码是从 NIST PQC 实现中扒出来的,其中有很多强制类型转换,与 VS2022 不兼容。

Keccak4x

align.h

/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".

For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/

To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/

#ifndef _align_h_
#define _align_h_

/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
#ifdef ALIGN
#undef ALIGN
#endif

/*
 * ALIGN(x) expands to the compiler-specific spelling of "align this object
 * on an x-byte boundary". On compilers that are not recognised below it
 * expands to nothing, i.e. no alignment is requested.
 */
#if defined(__GNUC__)
#define ALIGN(x) __attribute__ ((aligned(x)))
#elif defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#elif defined(__ARMCC_VERSION)
#define ALIGN(x) __align(x)
#else
#define ALIGN(x)
#endif

#endif

brg_endian.h

/*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.

 LICENSE TERMS

 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:

  1. source code distributions include the above copyright notice, this
     list of conditions and the following disclaimer;

  2. binary distributions include the above copyright notice, this list
     of conditions and the following disclaimer in their documentation;

  3. the name of the copyright holder is not used to endorse products
     built using this software without specific written permission.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 20/12/2007
 Changes for ARM 9/9/2010
*/

#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H

/*
 * This header defines PLATFORM_BYTE_ORDER as either IS_BIG_ENDIAN or
 * IS_LITTLE_ENDIAN. It first looks at the usual *_ENDIAN / *BYTE_ORDER
 * macro families and then falls back to a list of machine-specific macros.
 */
#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */

/* The system-header probing below is deliberately compiled out. */
#if 0
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
#  include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
#  include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
#  include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
#  if !defined( __MINGW32__ ) && !defined( _AIX )
#    include <endian.h>
#    if !defined( __BEOS__ )
#      include <byteswap.h>
#    endif
#  endif
#endif
#endif

/* Now attempt to set the define for platform byte order using any  */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
/* seem to encompass most endian symbol definitions                 */

#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( _BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( __BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( __BIG_ENDIAN__ )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

/*  if the platform byte order could not be determined, then try to */
/*  set this define using common machine defines                    */
#if !defined(PLATFORM_BYTE_ORDER)

#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
      defined( vax )       || defined( vms )     || defined( VMS )        || \
      defined( __VMS )     || defined( _M_X64 )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN

#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN

#elif defined(__arm__)
# ifdef __BIG_ENDIAN
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# else
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif 1     /* **** EDIT HERE IF NECESSARY **** */
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0     /* **** EDIT HERE IF NECESSARY **** */
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
#  error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
#endif

#endif

#endif

SIMD256-config.h

#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled"
#define KeccakP1600times4_fullUnrolling
#define KeccakP1600times4_useAVX2

KeccakP-1600-times4-SnP.h

/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/#ifndef _KeccakP_1600_times4_SnP_h_
#define _KeccakP_1600_times4_SnP_h_/** For the documentation, see PlSnP-documentation.h.*/#include "SIMD256-config.h"#define KeccakP1600times4_implementation        "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
#define KeccakP1600times4_statesSizeInBytes     800
#define KeccakP1600times4_statesAlignment       32
#define KeccakF1600times4_FastLoop_supported
#define KeccakP1600times4_12rounds_FastLoop_supported#include <stddef.h>#define KeccakP1600times4_StaticInitialize()
void KeccakP1600times4_InitializeAll(void *states);
#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \\((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte)
void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount);
void KeccakP1600times4_PermuteAll_12rounds(void *states);
void KeccakP1600times4_PermuteAll_24rounds(void *states);
void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length);
void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex,  const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset);
size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);#endif

KeccakP-1600-unrolling.macros.h

/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".

For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/

To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/

/*
 * Round-sequencing macros. The including file must define either
 * FullUnrolling or Unrolling (1, 2, 3, 4, 6 or 12) beforehand, and must
 * provide prepareTheta, thetaRhoPiChiIotaPrepareTheta, thetaRhoPiChiIota
 * and (for the variants that use it) copyStateVariables. The loop variants
 * also expect an integer variable named i in scope.
 *
 * rounds24 runs round indices 0..23, rounds12 runs 12..23, and
 * roundsN(n) runs the last n rounds (indices 24-n .. 23).
 */
#if (defined(FullUnrolling))
#define rounds24 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
    thetaRhoPiChiIotaPrepareTheta(10, A, E) \
    thetaRhoPiChiIotaPrepareTheta(11, E, A) \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A)

#define rounds12 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A)

#elif (Unrolling == 12)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=12) { \
        thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
    }

#define rounds12 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A)

#elif (Unrolling == 6)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=6) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=6) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    }

#elif (Unrolling == 4)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=4) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=4) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
    }

#elif (Unrolling == 3)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=3) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        copyStateVariables(A, E) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=3) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        copyStateVariables(A, E) \
    }

#elif (Unrolling == 2)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    }

#elif (Unrolling == 1)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i++) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        copyStateVariables(A, E) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i++) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        copyStateVariables(A, E) \
    }

#else
#error "Unrolling is not correctly specified!"
#endif

/*
 * Runs the last __nrounds rounds. When the starting index is odd, one round
 * is executed A -> E and copied back first, so that the remaining rounds
 * can be processed in A -> E / E -> A pairs.
 */
#define roundsN(__nrounds) \
    prepareTheta \
    i = 24 - (__nrounds); \
    if ((i&1) != 0) { \
        thetaRhoPiChiIotaPrepareTheta(i, A, E) \
        copyStateVariables(A, E) \
        ++i; \
    } \
    for( /* empty */; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    }

KeccakP-1600-times4-SIMD256.c

/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".

For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/

To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "align.h"
#include "KeccakP-1600-times4-SnP.h"
#include "SIMD256-config.h"

#include "brg_endian.h"
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif

typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;

/*
 * Index, in 64-bit words, of lane "lanePosition" of instance "instanceIndex":
 * the four instances are interleaved, one lane each per 256-bit row.
 */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + (instanceIndex))

#if defined(KeccakP1600times4_useAVX2)
    #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
    /* The GCC-style vector casts "(V256)..." of the upstream code are replaced
     * by cast intrinsics here and below so that MSVC accepts them as well. */
    #define CONST256_64(a)          _mm256_castpd_si256(_mm256_broadcast_sd((const double*)(&a)))
    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
    #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
    #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
    #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
    #define ROL64in256_8(d, a)      d = _mm256_shuffle_epi8(a, CONST256(rho8))
    #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
/* Byte-shuffle masks for rotations by 8 and 56 bits. CONST256 reads them with
 * an aligned 256-bit load, hence the explicit 32-byte alignment. */
static ALIGN(32) const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static ALIGN(32) const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
    #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
    /* Integer variant: the operands are V128 pointers and a V256 value. */
    #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i((V128*)&(ah), (V128*)&(al), v)
    #define XOR256(a, b)            _mm256_xor_si256(a, b)
    #define XOReq256(a, b)          a = _mm256_xor_si256(a, b)
    #define UNPACKL( a, b )         _mm256_unpacklo_epi64((a), (b))
    #define UNPACKH( a, b )         _mm256_unpackhi_epi64((a), (b))
    #define PERM128( a, b, c )      _mm256_permute2x128_si256(a, b, c)
    #define SHUFFLE64( a, b, c )    _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c))

    /* Converts four interleaved state rows (lanes0..3) into four per-instance
     * runs of four consecutive lanes. Uses lanesL01/H01/L23/H23 as scratch. */
    #define UNINTLEAVE()            lanesL01 = UNPACKL( lanes0, lanes1 ),                   \
                                    lanesH01 = UNPACKH( lanes0, lanes1 ),                   \
                                    lanesL23 = UNPACKL( lanes2, lanes3 ),                   \
                                    lanesH23 = UNPACKH( lanes2, lanes3 ),                   \
                                    lanes0 = PERM128( lanesL01, lanesL23, 0x20 ),           \
                                    lanes2 = PERM128( lanesL01, lanesL23, 0x31 ),           \
                                    lanes1 = PERM128( lanesH01, lanesH23, 0x20 ),           \
                                    lanes3 = PERM128( lanesH01, lanesH23, 0x31 )

    /* Inverse of UNINTLEAVE: four per-instance runs become four state rows. */
    #define INTLEAVE()              lanesL01 = PERM128( lanes0, lanes2, 0x20 ),             \
                                    lanesH01 = PERM128( lanes1, lanes3, 0x20 ),             \
                                    lanesL23 = PERM128( lanes0, lanes2, 0x31 ),             \
                                    lanesH23 = PERM128( lanes1, lanes3, 0x31 ),             \
                                    lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ),         \
                                    lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ),         \
                                    lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ),         \
                                    lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )

#endif

#define SnP_laneLengthInBytes 8

/* Sets all KeccakP1600times4_statesSizeInBytes bytes of the four states to zero. */
void KeccakP1600times4_InitializeAll(void *states)
{
    memset(states, 0, KeccakP1600times4_statesSizeInBytes);
}

/*
 * XORs "length" bytes from "data" into the state of instance "instanceIndex",
 * starting at byte position "offset" of that instance.
 * The input is copied with memcpy, so "data" needs no particular alignment.
 */
void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    /* Leading partial lane. */
    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = 0;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    /* Whole lanes. */
    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane;
        memcpy(&lane, curData, sizeof lane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    /* Trailing partial lane. */
    if (sizeLeft > 0) {
        UINT64 lane = 0;
        memcpy(&lane, curData, sizeLeft);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
    }
}

/*
 * XORs "laneCount" lanes into each of the four states. The input of instance
 * k starts k*laneOffset lanes after "data". The state is accessed as V256
 * rows, so "states" has to satisfy KeccakP1600times4_statesAlignment.
 * NOTE(review): the scalar path reads "data" through UINT64 pointers, which
 * presumably relies on the target tolerating unaligned 64-bit loads - confirm.
 */
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256    lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    /* One lane position at a time. */
    #define Xor_In( argIndex )  XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    /* Four lane positions at a time. */
    #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
                                lanes1 = LOAD256u( curData1[argIndex]),\
                                lanes2 = LOAD256u( curData2[argIndex]),\
                                lanes3 = LOAD256u( curData3[argIndex]),\
                                INTLEAVE(),\
                                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                                XOReq256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 )  {
        Xor_In4( 0 );
        Xor_In4( 4 );
        Xor_In4( 8 );
        Xor_In4( 12 );
        if ( laneCount >= 20 )  {
            Xor_In4( 16 );
            for(i=20; i<laneCount; i++)
                Xor_In( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Xor_In( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Xor_In( i );
    }
    #undef  Xor_In
    #undef  Xor_In4
}

/*
 * Overwrites "length" bytes of the state of instance "instanceIndex",
 * starting at byte position "offset", with the bytes from "data".
 */
void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, SnP_laneLengthInBytes);
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
    }
}

/*
 * Overwrites the first "laneCount" lanes of each of the four states. The
 * input of instance k starts k*laneOffset lanes after "data".
 */
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256    lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    #define OverWr( argIndex )  STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    #define OverWr4( argIndex )     lanes0 = LOAD256u( curData0[argIndex]),\
                                    lanes1 = LOAD256u( curData1[argIndex]),\
                                    lanes2 = LOAD256u( curData2[argIndex]),\
                                    lanes3 = LOAD256u( curData3[argIndex]),\
                                    INTLEAVE(),\
                                    STORE256( stateAsLanes[argIndex+0], lanes0 ),\
                                    STORE256( stateAsLanes[argIndex+1], lanes1 ),\
                                    STORE256( stateAsLanes[argIndex+2], lanes2 ),\
                                    STORE256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 )  {
        OverWr4( 0 );
        OverWr4( 4 );
        OverWr4( 8 );
        OverWr4( 12 );
        if ( laneCount >= 20 )  {
            OverWr4( 16 );
            for(i=20; i<laneCount; i++)
                OverWr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                OverWr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            OverWr( i );
    }
    #undef  OverWr
    #undef  OverWr4
}

/* Zeroes the first "byteCount" bytes of the state of instance "instanceIndex". */
void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
{
    unsigned int sizeLeft = byteCount;
    unsigned int lanePosition = 0;
    UINT64 *statesAsLanes = (UINT64 *)states;

    while(sizeLeft >= SnP_laneLengthInBytes) {
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
    }

    if (sizeLeft > 0) {
        memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
    }
}

/*
 * Copies "length" bytes of the state of instance "instanceIndex", starting
 * at byte position "offset", into "data". The output is written with memcpy,
 * so "data" needs no particular alignment.
 */
void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    unsigned char *curData = data;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        memcpy(curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], SnP_laneLengthInBytes);
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
    }
}

/*
 * Copies the first "laneCount" lanes of each of the four states to "data".
 * The output of instance k starts k*laneOffset lanes after "data".
 */
void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    UINT64 *curData0 = (UINT64 *)data;
    UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);

    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256    lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define Extr( argIndex )    curData0[argIndex] = stateAsLanes64[4*(argIndex)],      \
                                curData1[argIndex] = stateAsLanes64[4*(argIndex)+1],    \
                                curData2[argIndex] = stateAsLanes64[4*(argIndex)+2],    \
                                curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]

    #define Extr4( argIndex )   lanes0 = LOAD256( stateAsLanes[argIndex+0] ),           \
                                lanes1 = LOAD256( stateAsLanes[argIndex+1] ),           \
                                lanes2 = LOAD256( stateAsLanes[argIndex+2] ),           \
                                lanes3 = LOAD256( stateAsLanes[argIndex+3] ),           \
                                UNINTLEAVE(),                                           \
                                STORE256u( curData0[argIndex], lanes0 ),                \
                                STORE256u( curData1[argIndex], lanes1 ),                \
                                STORE256u( curData2[argIndex], lanes2 ),                \
                                STORE256u( curData3[argIndex], lanes3 )

    if ( laneCount >= 16 )  {
        Extr4( 0 );
        Extr4( 4 );
        Extr4( 8 );
        Extr4( 12 );
        if ( laneCount >= 20 )  {
            Extr4( 16 );
            for(i=20; i<laneCount; i++)
                Extr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Extr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Extr( i );
    }
    #undef  Extr
    #undef  Extr4
}

/*
 * Writes input[k] XOR (state byte offset+k of instance "instanceIndex") to
 * output[k] for k in [0, length). Whole lanes go through a local temporary,
 * so the buffers need no particular alignment.
 */
void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curInput = input;
    unsigned char *curOutput = output;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        /* Little-endian platform (checked above): shifting right by whole
         * bytes brings the byte at offsetInLane to the low end. */
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        sizeLeft -= bytesInLane;
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --bytesInLane != 0);
        lanePosition++;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane;
        memcpy(&lane, curInput, sizeof lane);
        lane ^= statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        memcpy(curOutput, &lane, sizeof lane);
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curInput += SnP_laneLengthInBytes;
        curOutput += SnP_laneLengthInBytes;
    }

    if (sizeLeft != 0) {
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --sizeLeft != 0);
    }
}

/*
 * For each of the four instances, XORs the first "laneCount" state lanes
 * with the corresponding input lanes and writes the result to "output".
 * Instance k uses the input and output regions starting k*laneOffset lanes
 * after "input" and "output" respectively.
 */
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
    const UINT64 *curInput0 = (const UINT64 *)input;
    const UINT64 *curInput1 = (const UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
    const UINT64 *curInput2 = (const UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curInput3 = (const UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
    UINT64 *curOutput0 = (UINT64 *)output;
    UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);

    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256    lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define ExtrXor( argIndex ) \
                                curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
                                curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
                                curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
                                curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]

    #define ExtrXor4( argIndex ) \
                                lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
                                lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
                                lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
                                lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
                                UNINTLEAVE(),\
                                lanesL01 = LOAD256u( curInput0[argIndex]),\
                                lanesH01 = LOAD256u( curInput1[argIndex]),\
                                lanesL23 = LOAD256u( curInput2[argIndex]),\
                                lanesH23 = LOAD256u( curInput3[argIndex]),\
                                XOReq256( lanes0, lanesL01 ),\
                                XOReq256( lanes1, lanesH01 ),\
                                XOReq256( lanes2, lanesL23 ),\
                                XOReq256( lanes3, lanesH23 ),\
                                STORE256u( curOutput0[argIndex], lanes0 ),\
                                STORE256u( curOutput1[argIndex], lanes1 ),\
                                STORE256u( curOutput2[argIndex], lanes2 ),\
                                STORE256u( curOutput3[argIndex], lanes3 )

    if ( laneCount >= 16 )  {
        ExtrXor4( 0 );
        ExtrXor4( 4 );
        ExtrXor4( 8 );
        ExtrXor4( 12 );
        if ( laneCount >= 20 )  {
            ExtrXor4( 16 );
            for(i=20; i<laneCount; i++)
                ExtrXor( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                ExtrXor( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            ExtrXor( i );
    }
    #undef  ExtrXor
    #undef  ExtrXor4
}

/* Declares the V256 working variables used by the round macros. */
#define declareABCDE \
    V256 Aba, Abe, Abi, Abo, Abu; \
    V256 Aga, Age, Agi, Ago, Agu; \
    V256 Aka, Ake, Aki, Ako, Aku; \
    V256 Ama, Ame, Ami, Amo, Amu; \
    V256 Asa, Ase, Asi, Aso, Asu; \
    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
    V256 Bga, Bge, Bgi, Bgo, Bgu; \
    V256 Bka, Bke, Bki, Bko, Bku; \
    V256 Bma, Bme, Bmi, Bmo, Bmu; \
    V256 Bsa, Bse, Bsi, Bso, Bsu; \
    V256 Ca, Ce, Ci, Co, Cu; \
    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
    V256 Da, De, Di, Do, Du; \
    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
    V256 Ega, Ege, Egi, Ego, Egu; \
    V256 Eka, Eke, Eki, Eko, Eku; \
    V256 Ema, Eme, Emi, Emo, Emu; \
    V256 Esa, Ese, Esi, Eso, Esu;

/*
 * Compute the five theta column parities Ca..Cu as the XOR of the five
 * A-lanes in each column (b, g, k, m, s rows). Expects the A?? lanes and
 * Ca..Cu declared by declareABCDE to be in scope.
 */
#define prepareTheta \
    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu))));

/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
/*
 * One round that reads (and theta-updates in place) the lanes A##xx and
 * writes the result to the lanes E##xx:
 *   - Da..Du are derived from the column parities Ca..Cu,
 *   - each A lane is XORed with its D value and rotated into a B lane,
 *   - each output row is E = B0 ^ ANDnu256(B1, B2) and so on around the row,
 *   - round constant number i is XORed into E##ba.
 * While the E lanes are produced, Ca..Cu are rebuilt as the XOR of each
 * output column, so the next round can start without a separate prepareTheta.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    ROL64in256(Ce1, Ce, 1); Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); Du = XOR256(Co, Ca1); \
    \
    XOReq256(A##ba, Da); Bba = A##ba; \
    XOReq256(A##ge, De); ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq256(A##mo, Do); ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); Ce = E##be; \
    XOReq256(A##su, Du); ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); Ci = E##bi; \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); Co = E##bo; \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); Cu = E##bu; \
    \
    XOReq256(A##bo, Do); ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); XOReq256(Ca, E##ga); \
    XOReq256(A##me, De); ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); XOReq256(Ce, E##ge); \
    XOReq256(A##si, Di); ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); XOReq256(Ci, E##gi); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); XOReq256(Co, E##go); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); XOReq256(Cu, E##gu); \
    \
    XOReq256(A##be, De); ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); XOReq256(Ca, E##ka); \
    XOReq256(A##mu, Du); ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); XOReq256(Ce, E##ke); \
    XOReq256(A##sa, Da); ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); XOReq256(Ci, E##ki); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); XOReq256(Co, E##ko); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); XOReq256(Cu, E##ku); \
    \
    XOReq256(A##bu, Du); ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); XOReq256(Ca, E##ma); \
    XOReq256(A##mi, Di); ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); XOReq256(Ce, E##me); \
    XOReq256(A##so, Do); ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); XOReq256(Ci, E##mi); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); XOReq256(Co, E##mo); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); XOReq256(Cu, E##mu); \
    \
    XOReq256(A##bi, Di); ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); XOReq256(Ca, E##sa); \
    XOReq256(A##ma, Da); ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); XOReq256(Ce, E##se); \
    XOReq256(A##se, De); ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); XOReq256(Ci, E##si); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); XOReq256(Co, E##so); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); XOReq256(Cu, E##su);

/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
/*
 * Same round as thetaRhoPiChiIotaPrepareTheta, reading (and theta-updating
 * in place) the lanes A##xx and writing the lanes E##xx, but without
 * rebuilding the column parities Ca..Cu from the output.
 * Round constant number i is XORed into E##ba.
 */
#define thetaRhoPiChiIota(i, A, E) \
    ROL64in256(Ce1, Ce, 1); Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); Du = XOR256(Co, Ca1); \
    \
    XOReq256(A##ba, Da); Bba = A##ba; \
    XOReq256(A##ge, De); ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    XOReq256(A##mo, Do); ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    XOReq256(A##su, Du); ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
    \
    XOReq256(A##bo, Do); ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(A##me, De); ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(A##si, Di); ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
    \
    XOReq256(A##be, De); ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(A##mu, Du); ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(A##sa, Da); ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
    \
    XOReq256(A##bu, Du); ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(A##mi, Di); ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(A##so, Do); ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
    \
    XOReq256(A##bi, Di); ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(A##ma, Da); ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(A##se, De); ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));

static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {0x0000000000000001ULL,0x0000000000008082ULL,0x800000000000808aULL,0x8000000080008000ULL,0x000000000000808bULL,0x0000000080000001ULL,0x8000000080008081ULL,0x8000000000008009ULL,0x000000000000008aULL,0x0000000000000088ULL,0x0000000080008009ULL,0x000000008000000aULL,0x000000008000808bULL,0x800000000000008bULL,0x8000000000008089ULL,0x8000000000008003ULL,0x8000000000008002ULL,0x8000000000000080ULL,0x000000000000800aULL,0x800000008000000aULL,0x8000000080008081ULL,0x8000000000008080ULL,0x0000000080000001ULL,0x8000000080008008ULL};#define copyFromState(X, state) \\X##ba = LOAD256(state[ 0]); \\X##be = LOAD256(state[ 1]); \\X##bi = LOAD256(state[ 2]); \\X##bo = LOAD256(state[ 3]); \\X##bu = LOAD256(state[ 4]); \\X##ga = LOAD256(state[ 5]); \\X##ge = LOAD256(state[ 6]); \\X##gi = LOAD256(state[ 7]); \\X##go = LOAD256(state[ 8]); \\X##gu = LOAD256(state[ 9]); \\X##ka = LOAD256(state[10]); \\X##ke = LOAD256(state[11]); \\X##ki = LOAD256(state[12]); \\X##ko = LOAD256(state[13]); \\X##ku = LOAD256(state[14]); \\X##ma = LOAD256(state[15]); \\X##me = LOAD256(state[16]); \\X##mi = LOAD256(state[17]); \\X##mo = LOAD256(state[18]); \\X##mu = LOAD256(state[19]); \\X##sa = LOAD256(state[20]); \\X##se = LOAD256(state[21]); \\X##si = LOAD256(state[22]); \\X##so = LOAD256(state[23]); \\X##su = LOAD256(state[24]); \\

/*
 * Store the 25 local lane variables X##ba .. X##su into state[0..24],
 * in the same order copyFromState reads them.
 */
#define copyToState(state, X) \
    STORE256(state[ 0], X##ba); \
    STORE256(state[ 1], X##be); \
    STORE256(state[ 2], X##bi); \
    STORE256(state[ 3], X##bo); \
    STORE256(state[ 4], X##bu); \
    STORE256(state[ 5], X##ga); \
    STORE256(state[ 6], X##ge); \
    STORE256(state[ 7], X##gi); \
    STORE256(state[ 8], X##go); \
    STORE256(state[ 9], X##gu); \
    STORE256(state[10], X##ka); \
    STORE256(state[11], X##ke); \
    STORE256(state[12], X##ki); \
    STORE256(state[13], X##ko); \
    STORE256(state[14], X##ku); \
    STORE256(state[15], X##ma); \
    STORE256(state[16], X##me); \
    STORE256(state[17], X##mi); \
    STORE256(state[18], X##mo); \
    STORE256(state[19], X##mu); \
    STORE256(state[20], X##sa); \
    STORE256(state[21], X##se); \
    STORE256(state[22], X##si); \
    STORE256(state[23], X##so); \
    STORE256(state[24], X##su);

/* Copy all 25 lane variables Y##ba .. Y##su into X##ba .. X##su. */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su;
#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros.h"

/*
 * Run the rounds24 macro over the interleaved states: load all 25 lanes
 * into locals, apply the rounds, store them back.
 */
void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
    unsigned int i; /* presumably the loop counter of the partially unrolled round macros */
#endif

    copyFromState(A, statesAsLanes)
    rounds24
    copyToState(statesAsLanes, A)
}

/* Same as KeccakP1600times4_PermuteAll_24rounds but using the rounds12 macro. */
void KeccakP1600times4_PermuteAll_12rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
    unsigned int i; /* presumably the loop counter of the partially unrolled round macros */
#endif

    copyFromState(A, statesAsLanes)
    rounds12
    copyToState(statesAsLanes, A)
}

/*
 * Absorb-and-permute loop (24 rounds).
 *
 * While at least (laneOffsetParallel*3 + laneCount)*8 bytes remain, XOR
 * laneCount lanes from each of the four input streams (stream k starts
 * laneOffsetParallel*k lanes after the first) into the states, permute,
 * and advance by laneOffsetSerial lanes.
 *
 * laneCount == 21 has a dedicated path that keeps the state in local
 * variables across iterations; other values go through
 * KeccakP1600times4_AddLanesAll and KeccakP1600times4_PermuteAll_24rounds.
 *
 * Returns the number of input bytes consumed.
 */
size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
    const unsigned char *dataStart = data;

    if (laneCount == 21) {
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        V256 *statesAsLanes = (V256 *)states;
        declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
        unsigned int i; /* declared under the same guard as in the PermuteAll functions */
#endif

        copyFromState(A, statesAsLanes)

        /* XOR lane argIndex of the four input streams into one state variable. */
#define XOR_In( Xxx, argIndex ) \
    XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

        while (dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            XOR_In( Aba, 0 );
            XOR_In( Abe, 1 );
            XOR_In( Abi, 2 );
            XOR_In( Abo, 3 );
            XOR_In( Abu, 4 );
            XOR_In( Aga, 5 );
            XOR_In( Age, 6 );
            XOR_In( Agi, 7 );
            XOR_In( Ago, 8 );
            XOR_In( Agu, 9 );
            XOR_In( Aka, 10 );
            XOR_In( Ake, 11 );
            XOR_In( Aki, 12 );
            XOR_In( Ako, 13 );
            XOR_In( Aku, 14 );
            XOR_In( Ama, 15 );
            XOR_In( Ame, 16 );
            XOR_In( Ami, 17 );
            XOR_In( Amo, 18 );
            XOR_In( Amu, 19 );
            XOR_In( Asa, 20 );
            rounds24
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
#undef XOR_In

        copyToState(statesAsLanes, A)
        return (size_t)((const unsigned char *)curData0 - dataStart);
    }

    while (dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
        KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
        KeccakP1600times4_PermuteAll_24rounds(states);
        data += laneOffsetSerial*8;
        dataByteLen -= laneOffsetSerial*8;
    }
    return (size_t)(data - dataStart);
}

/*
 * Absorb-and-permute loop (12 rounds). Identical in structure to
 * KeccakF1600times4_FastLoop_Absorb, using rounds12 and
 * KeccakP1600times4_PermuteAll_12rounds instead of the 24-round variants.
 *
 * Returns the number of input bytes consumed.
 */
size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
    const unsigned char *dataStart = data;

    if (laneCount == 21) {
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        /* Explicit cast, matching the 24-round variant; required when built as C++. */
        V256 *statesAsLanes = (V256 *)states;
        declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
        unsigned int i; /* declared under the same guard as in the PermuteAll functions */
#endif

        copyFromState(A, statesAsLanes)

        /* XOR lane argIndex of the four input streams into one state variable. */
#define XOR_In( Xxx, argIndex ) \
    XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

        while (dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            XOR_In( Aba, 0 );
            XOR_In( Abe, 1 );
            XOR_In( Abi, 2 );
            XOR_In( Abo, 3 );
            XOR_In( Abu, 4 );
            XOR_In( Aga, 5 );
            XOR_In( Age, 6 );
            XOR_In( Agi, 7 );
            XOR_In( Ago, 8 );
            XOR_In( Agu, 9 );
            XOR_In( Aka, 10 );
            XOR_In( Ake, 11 );
            XOR_In( Aki, 12 );
            XOR_In( Ako, 13 );
            XOR_In( Aku, 14 );
            XOR_In( Ama, 15 );
            XOR_In( Ame, 16 );
            XOR_In( Ami, 17 );
            XOR_In( Amo, 18 );
            XOR_In( Amu, 19 );
            XOR_In( Asa, 20 );
            rounds12
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
#undef XOR_In

        copyToState(statesAsLanes, A)
        return (size_t)((const unsigned char *)curData0 - dataStart);
    }

    while (dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
        KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
        KeccakP1600times4_PermuteAll_12rounds(states);
        data += laneOffsetSerial*8;
        dataByteLen -= laneOffsetSerial*8;
    }
    return (size_t)(data - dataStart);
}

fips202

fips202.h

#ifndef FIPS202_H
#define FIPS202_H

#include <stddef.h>
#include <stdint.h>

/* Block sizes ("rates") in bytes for each function. */
#define SHAKE128_RATE 168
#define SHAKE256_RATE 136
#define SHA3_256_RATE 136
#define SHA3_512_RATE 72

/* Every public symbol is renamed with this prefix. */
#define FIPS202_NAMESPACE(s) fips202_ref_##s

/* Incremental sponge state: 25 64-bit lanes plus a byte position within the current block. */
typedef struct {
  uint64_t s[25];
  unsigned int pos;
} keccak_state;

/* SHAKE128: incremental interface (init / absorb / finalize / squeeze) */
#define shake128_init FIPS202_NAMESPACE(shake128_init)
void shake128_init(keccak_state *state);
#define shake128_absorb FIPS202_NAMESPACE(shake128_absorb)
void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake128_finalize FIPS202_NAMESPACE(shake128_finalize)
void shake128_finalize(keccak_state *state);
#define shake128_squeeze FIPS202_NAMESPACE(shake128_squeeze)
void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
/* SHAKE128: one-shot absorb and whole-block squeeze */
#define shake128_absorb_once FIPS202_NAMESPACE(shake128_absorb_once)
void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake128_squeezeblocks FIPS202_NAMESPACE(shake128_squeezeblocks)
void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state);

/* SHAKE256: incremental interface (init / absorb / finalize / squeeze) */
#define shake256_init FIPS202_NAMESPACE(shake256_init)
void shake256_init(keccak_state *state);
#define shake256_absorb FIPS202_NAMESPACE(shake256_absorb)
void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake256_finalize FIPS202_NAMESPACE(shake256_finalize)
void shake256_finalize(keccak_state *state);
#define shake256_squeeze FIPS202_NAMESPACE(shake256_squeeze)
void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
/* SHAKE256: one-shot absorb and whole-block squeeze */
#define shake256_absorb_once FIPS202_NAMESPACE(shake256_absorb_once)
void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake256_squeezeblocks FIPS202_NAMESPACE(shake256_squeezeblocks)
void shake256_squeezeblocks(uint8_t *out, size_t nblocks,  keccak_state *state);

/* One-call functions: input in, digest / output stream out */
#define shake128 FIPS202_NAMESPACE(shake128)
void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
#define shake256 FIPS202_NAMESPACE(shake256)
void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
#define sha3_256 FIPS202_NAMESPACE(sha3_256)
void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen);
#define sha3_512 FIPS202_NAMESPACE(sha3_512)
void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen);

#endif

fips202.c

/* Based on the public domain implementation in crypto_hash/keccakc512/simple/ from
 * http://bench.cr.yp.to/supercop.html by Ronny Van Keer and the public domain "TweetFips202"
 * implementation from https://twitter.com/tweetfips202 by Gilles Van Assche, Daniel J. Bernstein,
 * and Peter Schwabe */

#include <stddef.h>
#include <stdint.h>
#include "fips202.h"

/* Number of rounds of the permutation. */
#define NROUNDS 24

/* Rotate the 64-bit value a left by offset bits; offset must be in 1..63. */
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))

/*************************************************
* Name:        load64
*
* Description: Load 8 bytes into uint64_t in little-endian order
*
* Arguments:   - const uint8_t *x: pointer to input byte array
*
* Returns the loaded 64-bit unsigned integer
**************************************************/
static uint64_t load64(const uint8_t x[8])
{
  uint64_t word = 0;
  int k;

  /* Walk from the most significant byte (x[7]) down to x[0]. */
  for (k = 7; k >= 0; k--) {
    word = (word << 8) | (uint64_t)x[k];
  }
  return word;
}

/*************************************************
* Name:        store64
*
* Description: Store a 64-bit integer to array of 8 bytes in little-endian order
*
* Arguments:   - uint8_t *x: pointer to the output byte array (allocated)
*              - uint64_t u: input 64-bit unsigned integer
**************************************************/
static void store64(uint8_t x[8], uint64_t u)
{
  uint8_t *dst = x;
  uint8_t *const end = x + 8;

  /* Emit the low byte, then shift the next one into place. */
  while (dst != end) {
    *dst++ = (uint8_t)u;
    u >>= 8;
  }
}

/* Keccak round constants */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
  0x0000000000000001ULL, 0x0000000000008082ULL,
  0x800000000000808aULL, 0x8000000080008000ULL,
  0x000000000000808bULL, 0x0000000080000001ULL,
  0x8000000080008081ULL, 0x8000000000008009ULL,
  0x000000000000008aULL, 0x0000000000000088ULL,
  0x0000000080008009ULL, 0x000000008000000aULL,
  0x000000008000808bULL, 0x800000000000008bULL,
  0x8000000000008089ULL, 0x8000000000008003ULL,
  0x8000000000008002ULL, 0x8000000000000080ULL,
  0x000000000000800aULL, 0x800000008000000aULL,
  0x8000000080008081ULL, 0x8000000000008080ULL,
  0x0000000080000001ULL, 0x8000000080008008ULL
};

/*************************************************
* Name:        KeccakF1600_StatePermute
*
* Description: The Keccak F1600 Permutation
*
* Arguments:   - uint64_t *state: pointer to input/output Keccak state
**************************************************/
static void KeccakF1600_StatePermute(uint64_t state[25])
{int round;uint64_t Aba, Abe, Abi, Abo, Abu;uint64_t Aga, Age, Agi, Ago, Agu;uint64_t Aka, Ake, Aki, Ako, Aku;uint64_t Ama, Ame, Ami, Amo, Amu;uint64_t Asa, Ase, Asi, Aso, Asu;uint64_t BCa, BCe, BCi, BCo, BCu;uint64_t Da, De, Di, Do, Du;uint64_t Eba, Ebe, Ebi, Ebo, Ebu;uint64_t Ega, Ege, Egi, Ego, Egu;uint64_t Eka, Eke, Eki, Eko, Eku;uint64_t Ema, Eme, Emi, Emo, Emu;uint64_t Esa, Ese, Esi, Eso, Esu;//copyFromState(A, state)Aba = state[ 0];Abe = state[ 1];Abi = state[ 2];Abo = state[ 3];Abu = state[ 4];Aga = state[ 5];Age = state[ 6];Agi = state[ 7];Ago = state[ 8];Agu = state[ 9];Aka = state[10];Ake = state[11];Aki = state[12];Ako = state[13];Aku = state[14];Ama = state[15];Ame = state[16];Ami = state[17];Amo = state[18];Amu = state[19];Asa = state[20];Ase = state[21];Asi = state[22];Aso = state[23];Asu = state[24];for(round = 0; round < NROUNDS; round += 2) {//    prepareThetaBCa = Aba^Aga^Aka^Ama^Asa;BCe = Abe^Age^Ake^Ame^Ase;BCi = Abi^Agi^Aki^Ami^Asi;BCo = Abo^Ago^Ako^Amo^Aso;BCu = Abu^Agu^Aku^Amu^Asu;//thetaRhoPiChiIotaPrepareTheta(round, A, E)Da = BCu^ROL(BCe, 1);De = BCa^ROL(BCi, 1);Di = BCe^ROL(BCo, 1);Do = BCi^ROL(BCu, 1);Du = BCo^ROL(BCa, 1);Aba ^= Da;BCa = Aba;Age ^= De;BCe = ROL(Age, 44);Aki ^= Di;BCi = ROL(Aki, 43);Amo ^= Do;BCo = ROL(Amo, 21);Asu ^= Du;BCu = ROL(Asu, 14);Eba =   BCa ^((~BCe)&  BCi );Eba ^= (uint64_t)KeccakF_RoundConstants[round];Ebe =   BCe ^((~BCi)&  BCo );Ebi =   BCi ^((~BCo)&  BCu );Ebo =   BCo ^((~BCu)&  BCa );Ebu =   BCu ^((~BCa)&  BCe );Abo ^= Do;BCa = ROL(Abo, 28);Agu ^= Du;BCe = ROL(Agu, 20);Aka ^= Da;BCi = ROL(Aka,  3);Ame ^= De;BCo = ROL(Ame, 45);Asi ^= Di;BCu = ROL(Asi, 61);Ega =   BCa ^((~BCe)&  BCi );Ege =   BCe ^((~BCi)&  BCo );Egi =   BCi ^((~BCo)&  BCu );Ego =   BCo ^((~BCu)&  BCa );Egu =   BCu ^((~BCa)&  BCe );Abe ^= De;BCa = ROL(Abe,  1);Agi ^= Di;BCe = ROL(Agi,  6);Ako ^= Do;BCi = ROL(Ako, 25);Amu ^= Du;BCo = ROL(Amu,  8);Asa ^= Da;BCu = ROL(Asa, 18);Eka =   BCa ^((~BCe)&  BCi );Eke =   BCe ^((~BCi)&  BCo );Eki =   
BCi ^((~BCo)&  BCu );Eko =   BCo ^((~BCu)&  BCa );Eku =   BCu ^((~BCa)&  BCe );Abu ^= Du;BCa = ROL(Abu, 27);Aga ^= Da;BCe = ROL(Aga, 36);Ake ^= De;BCi = ROL(Ake, 10);Ami ^= Di;BCo = ROL(Ami, 15);Aso ^= Do;BCu = ROL(Aso, 56);Ema =   BCa ^((~BCe)&  BCi );Eme =   BCe ^((~BCi)&  BCo );Emi =   BCi ^((~BCo)&  BCu );Emo =   BCo ^((~BCu)&  BCa );Emu =   BCu ^((~BCa)&  BCe );Abi ^= Di;BCa = ROL(Abi, 62);Ago ^= Do;BCe = ROL(Ago, 55);Aku ^= Du;BCi = ROL(Aku, 39);Ama ^= Da;BCo = ROL(Ama, 41);Ase ^= De;BCu = ROL(Ase,  2);Esa =   BCa ^((~BCe)&  BCi );Ese =   BCe ^((~BCi)&  BCo );Esi =   BCi ^((~BCo)&  BCu );Eso =   BCo ^((~BCu)&  BCa );Esu =   BCu ^((~BCa)&  BCe );//    prepareThetaBCa = Eba^Ega^Eka^Ema^Esa;BCe = Ebe^Ege^Eke^Eme^Ese;BCi = Ebi^Egi^Eki^Emi^Esi;BCo = Ebo^Ego^Eko^Emo^Eso;BCu = Ebu^Egu^Eku^Emu^Esu;//thetaRhoPiChiIotaPrepareTheta(round+1, E, A)Da = BCu^ROL(BCe, 1);De = BCa^ROL(BCi, 1);Di = BCe^ROL(BCo, 1);Do = BCi^ROL(BCu, 1);Du = BCo^ROL(BCa, 1);Eba ^= Da;BCa = Eba;Ege ^= De;BCe = ROL(Ege, 44);Eki ^= Di;BCi = ROL(Eki, 43);Emo ^= Do;BCo = ROL(Emo, 21);Esu ^= Du;BCu = ROL(Esu, 14);Aba =   BCa ^((~BCe)&  BCi );Aba ^= (uint64_t)KeccakF_RoundConstants[round+1];Abe =   BCe ^((~BCi)&  BCo );Abi =   BCi ^((~BCo)&  BCu );Abo =   BCo ^((~BCu)&  BCa );Abu =   BCu ^((~BCa)&  BCe );Ebo ^= Do;BCa = ROL(Ebo, 28);Egu ^= Du;BCe = ROL(Egu, 20);Eka ^= Da;BCi = ROL(Eka, 3);Eme ^= De;BCo = ROL(Eme, 45);Esi ^= Di;BCu = ROL(Esi, 61);Aga =   BCa ^((~BCe)&  BCi );Age =   BCe ^((~BCi)&  BCo );Agi =   BCi ^((~BCo)&  BCu );Ago =   BCo ^((~BCu)&  BCa );Agu =   BCu ^((~BCa)&  BCe );Ebe ^= De;BCa = ROL(Ebe, 1);Egi ^= Di;BCe = ROL(Egi, 6);Eko ^= Do;BCi = ROL(Eko, 25);Emu ^= Du;BCo = ROL(Emu, 8);Esa ^= Da;BCu = ROL(Esa, 18);Aka =   BCa ^((~BCe)&  BCi );Ake =   BCe ^((~BCi)&  BCo );Aki =   BCi ^((~BCo)&  BCu );Ako =   BCo ^((~BCu)&  BCa );Aku =   BCu ^((~BCa)&  BCe );Ebu ^= Du;BCa = ROL(Ebu, 27);Ega ^= Da;BCe = ROL(Ega, 36);Eke ^= De;BCi = ROL(Eke, 10);Emi ^= Di;BCo = ROL(Emi, 15);Eso ^= Do;BCu = 
ROL(Eso, 56);Ama =   BCa ^((~BCe)&  BCi );Ame =   BCe ^((~BCi)&  BCo );Ami =   BCi ^((~BCo)&  BCu );Amo =   BCo ^((~BCu)&  BCa );Amu =   BCu ^((~BCa)&  BCe );Ebi ^= Di;BCa = ROL(Ebi, 62);Ego ^= Do;BCe = ROL(Ego, 55);Eku ^= Du;BCi = ROL(Eku, 39);Ema ^= Da;BCo = ROL(Ema, 41);Ese ^= De;BCu = ROL(Ese, 2);Asa =   BCa ^((~BCe)&  BCi );Ase =   BCe ^((~BCi)&  BCo );Asi =   BCi ^((~BCo)&  BCu );Aso =   BCo ^((~BCu)&  BCa );Asu =   BCu ^((~BCa)&  BCe );}//copyToState(state, A)state[ 0] = Aba;state[ 1] = Abe;state[ 2] = Abi;state[ 3] = Abo;state[ 4] = Abu;state[ 5] = Aga;state[ 6] = Age;state[ 7] = Agi;state[ 8] = Ago;state[ 9] = Agu;state[10] = Aka;state[11] = Ake;state[12] = Aki;state[13] = Ako;state[14] = Aku;state[15] = Ama;state[16] = Ame;state[17] = Ami;state[18] = Amo;state[19] = Amu;state[20] = Asa;state[21] = Ase;state[22] = Asi;state[23] = Aso;state[24] = Asu;
}/*************************************************
* Name:        keccak_init
*
* Description: Initializes the Keccak state.
*
* Arguments:   - uint64_t *s: pointer to Keccak state
**************************************************/
static void keccak_init(uint64_t s[25])
{
  uint64_t *lane = s;
  uint64_t *const end = s + 25;

  /* Clear every lane of the state. */
  while (lane != end) {
    *lane++ = 0;
  }
}

/*************************************************
* Name:        keccak_absorb
*
* Description: Absorb step of Keccak; incremental.
*
* Arguments:   - uint64_t *s: pointer to Keccak state
*              - unsigned int pos: position in current block to be absorbed
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*              - const uint8_t *in: pointer to input to be absorbed into s
*              - size_t inlen: length of input in bytes
*
* Returns new position pos in current block
**************************************************/
static unsigned int keccak_absorb(uint64_t s[25],
                                  unsigned int pos,
                                  unsigned int r,
                                  const uint8_t *in,
                                  size_t inlen)
{
  unsigned int idx;

  /* As long as the pending input completes a block: fill it, permute, restart at 0. */
  while (pos + inlen >= r) {
    for (idx = pos; idx < r; idx++) {
      s[idx / 8] ^= (uint64_t)*in++ << 8 * (idx % 8);
    }
    inlen -= r - pos;
    KeccakF1600_StatePermute(s);
    pos = 0;
  }

  /* Leftover bytes are XORed in without permuting. */
  for (idx = pos; idx < pos + inlen; idx++) {
    s[idx / 8] ^= (uint64_t)*in++ << 8 * (idx % 8);
  }

  return idx;
}

/*************************************************
* Name:        keccak_finalize
*
* Description: Finalize absorb step.
*
* Arguments:   - uint64_t *s: pointer to Keccak state
*              - unsigned int pos: position in current block to be absorbed
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*              - uint8_t p: domain separation byte
**************************************************/
static void keccak_finalize(uint64_t s[25], unsigned int pos, unsigned int r, uint8_t p)
{
  const unsigned int lane = pos / 8;
  const unsigned int shift = 8 * (pos % 8);
  const unsigned int last_lane = r / 8 - 1;

  /* Domain separation byte at the current byte position. */
  s[lane] ^= (uint64_t)p << shift;
  /* Top bit of the last lane of the block. */
  s[last_lane] ^= 1ULL << 63;
}

/*************************************************
* Name:        keccak_squeeze
*
* Description: Squeeze step of Keccak. Squeezes arbitrarily many bytes.
*              Modifies the state. Can be called multiple times to keep
*              squeezing, i.e., is incremental.
*
* Arguments:   - uint8_t *out: pointer to output
*              - size_t outlen: number of bytes to be squeezed (written to out)
*              - uint64_t *s: pointer to input/output Keccak state
*              - unsigned int pos: number of bytes in current block already squeezed
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*
* Returns new position pos in current block
**************************************************/
static unsigned int keccak_squeeze(uint8_t *out,
                                   size_t outlen,
                                   uint64_t s[25],
                                   unsigned int pos,
                                   unsigned int r)
{
  unsigned int idx;

  while (outlen) {
    /* Current block exhausted: produce a fresh one. */
    if (pos == r) {
      KeccakF1600_StatePermute(s);
      pos = 0;
    }

    /* Copy bytes until the block ends or the request is satisfied. */
    for (idx = pos; idx < r && idx < pos + outlen; idx++) {
      *out++ = (uint8_t)(s[idx / 8] >> 8 * (idx % 8));
    }

    outlen -= idx - pos;
    pos = idx;
  }

  return pos;
}

/*************************************************
* Name:        keccak_absorb_once
*
* Description: Absorb step of Keccak;
*              non-incremental, starts by zeroing the state.
*
* Arguments:   - uint64_t *s: pointer to (uninitialized) output Keccak state
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*              - const uint8_t *in: pointer to input to be absorbed into s
*              - size_t inlen: length of input in bytes
*              - uint8_t p: domain-separation byte for different Keccak-derived functions
**************************************************/
static void keccak_absorb_once(uint64_t s[25],
                               unsigned int r,
                               const uint8_t *in,
                               size_t inlen,
                               uint8_t p)
{
  unsigned int k;

  /* Start from the all-zero state. */
  for (k = 0; k < 25; k++) {
    s[k] = 0;
  }

  /* Absorb every complete r-byte block, one 64-bit lane at a time. */
  for (; inlen >= r; in += r, inlen -= r) {
    for (k = 0; k < r / 8; k++) {
      s[k] ^= load64(in + 8 * k);
    }
    KeccakF1600_StatePermute(s);
  }

  /* Remaining bytes (fewer than r). */
  for (k = 0; k < inlen; k++) {
    s[k / 8] ^= (uint64_t)in[k] << 8 * (k % 8);
  }

  /* k is now the number of trailing bytes: the domain byte goes right after them. */
  s[k / 8] ^= (uint64_t)p << 8 * (k % 8);
  s[(r - 1) / 8] ^= 1ULL << 63;
}

/*************************************************
* Name:        keccak_squeezeblocks
*
* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each.
*              Modifies the state. Can be called multiple times to keep
*              squeezing, i.e., is incremental. Assumes zero bytes of current
*              block have already been squeezed.
*
* Arguments:   - uint8_t *out: pointer to output blocks
*              - size_t nblocks: number of blocks to be squeezed (written to out)
*              - uint64_t *s: pointer to input/output Keccak state
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
**************************************************/
static void keccak_squeezeblocks(uint8_t *out,
                                 size_t nblocks,
                                 uint64_t s[25],
                                 unsigned int r)
{
  unsigned int lane;

  /* Each block: permute first, then write out the first r/8 lanes. */
  for (; nblocks != 0; nblocks--) {
    KeccakF1600_StatePermute(s);
    for (lane = 0; lane < r / 8; lane++) {
      store64(out + 8 * lane, s[lane]);
    }
    out += r;
  }
}

/*************************************************
* Name:        shake128_init
*
* Description: Initializes Keccak state for use as SHAKE128 XOF
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) Keccak state
**************************************************/
void shake128_init(keccak_state *state)
{
  /* Nothing absorbed yet: position 0 in a cleared state. */
  state->pos = 0;
  keccak_init(state->s);
}

/*************************************************
* Name:        shake128_absorb
*
* Description: Absorb step of the SHAKE128 XOF; incremental.
*
* Arguments:   - keccak_state *state: pointer to (initialized) output Keccak state
*              - const uint8_t *in: pointer to input to be absorbed into s
*              - size_t inlen: length of input in bytes
**************************************************/
void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen)
{
  /* Continue from the stored position and remember where absorption stopped. */
  const unsigned int next = keccak_absorb(state->s, state->pos, SHAKE128_RATE, in, inlen);

  state->pos = next;
}

/*************************************************
* Name:        shake128_finalize
*
* Description: Finalize absorb step of the SHAKE128 XOF.
*
* Arguments:   - keccak_state *state: pointer to Keccak state
**************************************************/
void shake128_finalize(keccak_state *state)
{
  uint64_t *const lanes = state->s;

  keccak_finalize(lanes, state->pos, SHAKE128_RATE, 0x1F);
  /* pos == rate marks "no output block started yet" for the squeeze calls. */
  state->pos = SHAKE128_RATE;
}

/*************************************************
* Name:        shake128_squeeze
*
* Description: Squeeze step of SHAKE128 XOF. Squeezes arbitrarily many
*              bytes. Can be called multiple times to keep squeezing.
*
* Arguments:   - uint8_t *out: pointer to output blocks
*              - size_t outlen : number of bytes to be squeezed (written to output)
*              - keccak_state *state: pointer to input/output Keccak state
**************************************************/
void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state)
{
  uint64_t *const lanes = state->s;

  /* Keep the value returned by keccak_squeeze as the position for the next call. */
  state->pos = keccak_squeeze(out, outlen, lanes, state->pos, SHAKE128_RATE);
}

/*************************************************
* Name:        shake128_absorb_once
*
* Description: Initialize, absorb into and finalize SHAKE128 XOF; non-incremental.
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
*              - const uint8_t *in: pointer to input to be absorbed into the state
*              - size_t inlen: length of input in bytes
**************************************************/
void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen)
{
  uint64_t *const lanes = state->s;

  keccak_absorb_once(lanes, SHAKE128_RATE, in, inlen, 0x1F);
  /* pos == rate marks "no output block started yet" for the squeeze calls. */
  state->pos = SHAKE128_RATE;
}

/*************************************************
* Name:        shake128_squeezeblocks
*
* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of
*              SHAKE128_RATE bytes each. Can be called multiple times
*              to keep squeezing. Assumes new block has not yet been
*              started (state->pos = SHAKE128_RATE).
*
* Arguments:   - uint8_t *out: pointer to output blocks
*              - size_t nblocks: number of blocks to be squeezed (written to output)
*              - keccak_state *state: pointer to input/output Keccak state
**************************************************/
void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state)
{
  uint64_t *const lanes = state->s;

  /* Whole blocks only; state->pos is neither read nor written here. */
  keccak_squeezeblocks(out, nblocks, lanes, SHAKE128_RATE);
}

/*************************************************
* Name:        shake256_init
*
* Description: Initializes Keccak state for use as SHAKE256 XOF
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) Keccak state
**************************************************/
void shake256_init(keccak_state *state)
{
  uint64_t *const lanes = state->s;

  keccak_init(lanes);
  /* Nothing has been absorbed yet, so the byte offset starts at 0. */
  state->pos = 0;
}

/*************************************************
* Name:        shake256_absorb
*
* Description: Absorb step of the SHAKE256 XOF; incremental.
*
* Arguments:   - keccak_state *state: pointer to (initialized) output Keccak state
*              - const uint8_t *in: pointer to input to be absorbed into the state
*              - size_t inlen: length of input in bytes
**************************************************/
void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen)
{
  uint64_t *const lanes = state->s;

  /* Keep the value returned by keccak_absorb as the position for the next call. */
  state->pos = keccak_absorb(lanes, state->pos, SHAKE256_RATE, in, inlen);
}

/*************************************************
* Name:        shake256_finalize
*
* Description: Finalize absorb step of the SHAKE256 XOF.
*
* Arguments:   - keccak_state *state: pointer to Keccak state
**************************************************/
void shake256_finalize(keccak_state *state)
{
  uint64_t *const lanes = state->s;

  keccak_finalize(lanes, state->pos, SHAKE256_RATE, 0x1F);
  /* pos == rate marks "no output block started yet" for the squeeze calls. */
  state->pos = SHAKE256_RATE;
}

/*************************************************
* Name:        shake256_squeeze
*
* Description: Squeeze step of SHAKE256 XOF. Squeezes arbitrarily many
*              bytes. Can be called multiple times to keep squeezing.
*
* Arguments:   - uint8_t *out: pointer to output blocks
*              - size_t outlen : number of bytes to be squeezed (written to output)
*              - keccak_state *state: pointer to input/output Keccak state
**************************************************/
void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state)
{
  uint64_t *const lanes = state->s;

  /* Keep the value returned by keccak_squeeze as the position for the next call. */
  state->pos = keccak_squeeze(out, outlen, lanes, state->pos, SHAKE256_RATE);
}

/*************************************************
* Name:        shake256_absorb_once
*
* Description: Initialize, absorb into and finalize SHAKE256 XOF; non-incremental.
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
*              - const uint8_t *in: pointer to input to be absorbed into the state
*              - size_t inlen: length of input in bytes
**************************************************/
void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen)
{
  uint64_t *const lanes = state->s;

  keccak_absorb_once(lanes, SHAKE256_RATE, in, inlen, 0x1F);
  /* pos == rate marks "no output block started yet" for the squeeze calls. */
  state->pos = SHAKE256_RATE;
}

/*************************************************
* Name:        shake256_squeezeblocks
*
* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of
*              SHAKE256_RATE bytes each. Can be called multiple times
*              to keep squeezing. Assumes next block has not yet been
*              started (state->pos = SHAKE256_RATE).
*
* Arguments:   - uint8_t *out: pointer to output blocks
*              - size_t nblocks: number of blocks to be squeezed (written to output)
*              - keccak_state *state: pointer to input/output Keccak state
**************************************************/
void shake256_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state)
{
  uint64_t *const lanes = state->s;

  /* Whole blocks only; state->pos is neither read nor written here. */
  keccak_squeezeblocks(out, nblocks, lanes, SHAKE256_RATE);
}

/*************************************************
* Name:        shake128
*
* Description: SHAKE128 XOF with non-incremental API
*
* Arguments:   - uint8_t *out: pointer to output
*              - size_t outlen: requested output length in bytes
*              - const uint8_t *in: pointer to input
*              - size_t inlen: length of input in bytes
**************************************************/
void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen)
{
  keccak_state state;
  /* Split the request into whole rate-sized blocks plus a shorter remainder. */
  const size_t nblocks = outlen / SHAKE128_RATE;
  const size_t bulk = nblocks * SHAKE128_RATE;

  shake128_absorb_once(&state, in, inlen);

  /* Whole blocks go straight to the caller's buffer. */
  shake128_squeezeblocks(out, nblocks, &state);

  /* The byte-granular squeeze handles whatever is left (possibly 0 bytes). */
  shake128_squeeze(out + bulk, outlen - bulk, &state);
}

/*************************************************
* Name:        shake256
*
* Description: SHAKE256 XOF with non-incremental API
*
* Arguments:   - uint8_t *out: pointer to output
*              - size_t outlen: requested output length in bytes
*              - const uint8_t *in: pointer to input
*              - size_t inlen: length of input in bytes
**************************************************/
void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen)
{
  keccak_state state;
  /* Split the request into whole rate-sized blocks plus a shorter remainder. */
  const size_t nblocks = outlen / SHAKE256_RATE;
  const size_t bulk = nblocks * SHAKE256_RATE;

  shake256_absorb_once(&state, in, inlen);

  /* Whole blocks go straight to the caller's buffer. */
  shake256_squeezeblocks(out, nblocks, &state);

  /* The byte-granular squeeze handles whatever is left (possibly 0 bytes). */
  shake256_squeeze(out + bulk, outlen - bulk, &state);
}

/*************************************************
* Name:        sha3_256
*
* Description: SHA3-256 with non-incremental API
*
* Arguments:   - uint8_t *h: pointer to output (32 bytes)
*              - const uint8_t *in: pointer to input
*              - size_t inlen: length of input in bytes
**************************************************/
void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen)
{
  uint64_t s[25];
  unsigned int w = 0;

  keccak_absorb_once(s, SHA3_256_RATE, in, inlen, 0x06);
  KeccakF1600_StatePermute(s);

  /* The digest is the first 4 state words: 4 * 8 = 32 bytes. */
  while (w < 4) {
    store64(h + 8 * w, s[w]);
    ++w;
  }
}

/*************************************************
* Name:        sha3_512
*
* Description: SHA3-512 with non-incremental API
*
* Arguments:   - uint8_t *h: pointer to output (64 bytes)
*              - const uint8_t *in: pointer to input
*              - size_t inlen: length of input in bytes
**************************************************/
void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen)
{
  uint64_t s[25];
  unsigned int w = 0;

  keccak_absorb_once(s, SHA3_512_RATE, in, inlen, 0x06);
  KeccakF1600_StatePermute(s);

  /* The digest is the first 8 state words: 8 * 8 = 64 bytes. */
  while (w < 8) {
    store64(h + 8 * w, s[w]);
    ++w;
  }
}

fips202x4.h

#ifndef FIPS202X4_H
#define FIPS202X4_H

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

/* Every exported function name is prefixed with fips202x4_avx2_ through the
 * #define placed directly above its prototype. */
#define FIPS202X4_NAMESPACE(s) fips202x4_avx2_##s

/* Four Keccak states processed side by side: each of the 25 vectors carries
 * one 64-bit state word for each of the four instances. */
typedef struct {
  __m256i s[25];
} keccakx4_state;

/* Zero the state, absorb four inputs of inlen bytes each and apply the SHAKE128
 * padding. Non-incremental: call once, then squeeze. */
#define shake128x4_absorb_once FIPS202X4_NAMESPACE(shake128x4_absorb_once)
void shake128x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen);

/* Write nblocks full SHAKE128_RATE-byte blocks to each of the four outputs.
 * May be called repeatedly to keep squeezing. */
#define shake128x4_squeezeblocks FIPS202X4_NAMESPACE(shake128x4_squeezeblocks)
void shake128x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state);

/* Write outlen bytes to each of the four outputs. keccakx4_state keeps no byte
 * offset, so when outlen is not a multiple of SHAKE128_RATE the unused rest of
 * the last block is discarded and a later call starts with a fresh block. */
#define shake128x4_squeeze FIPS202X4_NAMESPACE(shake128x4_squeeze)
void shake128x4_squeeze(uint8_t *out0,
                        uint8_t *out1,
                        uint8_t *out2,
                        uint8_t *out3,
                        size_t outlen,
                        keccakx4_state *state);

/* SHAKE256 counterpart of shake128x4_absorb_once. */
#define shake256x4_absorb_once FIPS202X4_NAMESPACE(shake256x4_absorb_once)
void shake256x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen);

/* SHAKE256 counterpart of shake128x4_squeezeblocks (SHAKE256_RATE-byte blocks). */
#define shake256x4_squeezeblocks FIPS202X4_NAMESPACE(shake256x4_squeezeblocks)
void shake256x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state);

/* SHAKE256 counterpart of shake128x4_squeeze; the same discard-on-partial-block
 * behaviour applies. */
#define shake256x4_squeeze FIPS202X4_NAMESPACE(shake256x4_squeeze)
void shake256x4_squeeze(uint8_t *out0,
                        uint8_t *out1,
                        uint8_t *out2,
                        uint8_t *out3,
                        size_t outlen,
                        keccakx4_state *state);

/* One-shot 4-way SHAKE128: four inputs of inlen bytes in, four outputs of
 * outlen bytes out. */
#define shake128x4 FIPS202X4_NAMESPACE(shake128x4)
void shake128x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen);

/* One-shot 4-way SHAKE256: four inputs of inlen bytes in, four outputs of
 * outlen bytes out. */
#define shake256x4 FIPS202X4_NAMESPACE(shake256x4)
void shake256x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen);

#endif

fips202x4.c

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
#include <string.h>
#include "fips202.h"
#include "fips202x4.h"/* Use implementation from the Keccak Code Package */
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds
extern void KeccakF1600_StatePermute4x(__m256i *s);

/* Upper bound on any rate: the whole state is 25 words of 8 bytes. */
enum { KECCAKX4_MAX_RATE = 25 * 8 };

/*************************************************
* Name:        keccakx4_absorb_once
*
* Description: Zero four interleaved Keccak states, absorb four inputs of
*              equal length and apply the padding. Non-incremental.
*              Element k of every vector belongs to input k (in0 is the
*              lowest 64-bit element).
*
* Arguments:   - __m256i s[25]: (uninitialized) output state
*              - unsigned int r: rate in bytes; the code divides it by 8 to
*                index whole 64-bit words, so it is expected to be a
*                multiple of 8
*              - const uint8_t *in0..in3: the four inputs
*              - size_t inlen: length of EACH input in bytes
*              - uint8_t p: byte XORed into the state directly after the
*                last message byte (0x1F is passed for SHAKE)
**************************************************/
static void keccakx4_absorb_once(__m256i s[25],
                                 unsigned int r,
                                 const uint8_t *in0,
                                 const uint8_t *in1,
                                 const uint8_t *in2,
                                 const uint8_t *in3,
                                 size_t inlen,
                                 uint8_t p)
{
    size_t i;
    uint64_t pos = 0;
    __m256i t, idx;

    for (i = 0; i < 25; ++i)
        s[i] = _mm256_setzero_si256();

    /* The gather computes base + index * scale. Here the four input pointers
     * are the indices (scale 1) and the running byte offset is the "base",
     * so one instruction fetches the same word from all four inputs. */
    idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);

    /* Full blocks: XOR r bytes into the state, then permute. */
    while (inlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
            s[i] = _mm256_xor_si256(s[i], t);
            pos += 8;
        }
        inlen -= r;
        KeccakF1600_StatePermute4x(s);
    }

    /* Remaining whole 64-bit words of the last (partial) block. */
    for (i = 0; i < inlen / 8; ++i) {
        t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
        s[i] = _mm256_xor_si256(s[i], t);
        pos += 8;
    }
    inlen -= 8 * i;

    /* Trailing 1..7 bytes. Copy exactly inlen bytes into zeroed words rather
     * than gathering 8 bytes and masking: the gather would read up to 7
     * bytes beyond the end of every input buffer. AVX2 implies a
     * little-endian x86 target, so the copied bytes land in the low-order
     * part of each word, matching the masked-gather result. */
    if (inlen) {
        uint64_t tail[4] = {0, 0, 0, 0};

        memcpy(&tail[0], in0 + pos, inlen);
        memcpy(&tail[1], in1 + pos, inlen);
        memcpy(&tail[2], in2 + pos, inlen);
        memcpy(&tail[3], in3 + pos, inlen);
        t = _mm256_set_epi64x((long long)tail[3], (long long)tail[2],
                              (long long)tail[1], (long long)tail[0]);
        s[i] = _mm256_xor_si256(s[i], t);
    }

    /* Padding: p directly after the message, and the top bit of the last
     * byte of the rate. */
    t = _mm256_set1_epi64x((long long)((uint64_t)p << (8 * inlen)));
    s[i] = _mm256_xor_si256(s[i], t);
    t = _mm256_set1_epi64x((long long)(1ULL << 63));
    s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t);
}

/*************************************************
* Name:        keccakx4_squeezeblocks
*
* Description: Squeeze nblocks full blocks of r bytes from each of the four
*              interleaved states. The state is permuted before each block.
*              Can be called multiple times to keep squeezing.
*
* Arguments:   - uint8_t *out0..out3: the four outputs (nblocks*r bytes each)
*              - size_t nblocks: number of blocks to write per output
*              - unsigned int r: rate in bytes
*              - __m256i s[25]: input/output state
**************************************************/
static void keccakx4_squeezeblocks(uint8_t *out0,
                                   uint8_t *out1,
                                   uint8_t *out2,
                                   uint8_t *out3,
                                   size_t nblocks,
                                   unsigned int r,
                                   __m256i s[25])
{
    unsigned int i;
    __m128d t;

    while (nblocks > 0) {
        KeccakF1600_StatePermute4x(s);
        for (i = 0; i < r / 8; ++i) {
            /* Low 128 bits hold the words of instances 0 and 1 ... */
            t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
            _mm_storel_pd((double *)&out0[8 * i], t);
            _mm_storeh_pd((double *)&out1[8 * i], t);
            /* ... high 128 bits those of instances 2 and 3. */
            t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1));
            _mm_storel_pd((double *)&out2[8 * i], t);
            _mm_storeh_pd((double *)&out3[8 * i], t);
        }
        out0 += r;
        out1 += r;
        out2 += r;
        out3 += r;
        --nblocks;
    }
}

/*************************************************
* Name:        keccakx4_squeeze
*
* Description: Squeeze outlen bytes from each of the four states: whole
*              blocks go straight to the outputs, a final partial block is
*              squeezed into a scratch buffer and truncated. The unused rest
*              of that block is discarded (no byte offset is kept).
*
* Arguments:   - uint8_t *out0..out3: the four outputs (outlen bytes each)
*              - size_t outlen: number of bytes to write per output
*              - unsigned int r: rate in bytes
*              - __m256i s[25]: input/output state
**************************************************/
static void keccakx4_squeeze(uint8_t *out0,
                             uint8_t *out1,
                             uint8_t *out2,
                             uint8_t *out3,
                             size_t outlen,
                             unsigned int r,
                             __m256i s[25])
{
    size_t nblocks = outlen / r;
    size_t bulk = nblocks * r;
    uint8_t t[4][KECCAKX4_MAX_RATE];

    keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, r, s);
    outlen -= bulk;

    if (outlen) {
        keccakx4_squeezeblocks(t[0], t[1], t[2], t[3], 1, r, s);
        memcpy(out0 + bulk, t[0], outlen);
        memcpy(out1 + bulk, t[1], outlen);
        memcpy(out2 + bulk, t[2], outlen);
        memcpy(out3 + bulk, t[3], outlen);
    }
}

/* 4-way SHAKE128: zero the state, absorb four inlen-byte inputs and pad. */
void shake128x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen)
{
    keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

/* 4-way SHAKE128: squeeze nblocks blocks of SHAKE128_RATE bytes per output. */
void shake128x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state)
{
    keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s);
}

/* 4-way SHAKE128: squeeze outlen bytes per output. A trailing partial block
 * is truncated and its remainder dropped. */
void shake128x4_squeeze(uint8_t *out0,
                        uint8_t *out1,
                        uint8_t *out2,
                        uint8_t *out3,
                        size_t outlen,
                        keccakx4_state *state)
{
    keccakx4_squeeze(out0, out1, out2, out3, outlen, SHAKE128_RATE, state->s);
}

/* 4-way SHAKE256: zero the state, absorb four inlen-byte inputs and pad. */
void shake256x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen)
{
    keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

/* 4-way SHAKE256: squeeze nblocks blocks of SHAKE256_RATE bytes per output. */
void shake256x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state)
{
    keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s);
}

/* 4-way SHAKE256: squeeze outlen bytes per output. A trailing partial block
 * is truncated and its remainder dropped. */
void shake256x4_squeeze(uint8_t *out0,
                        uint8_t *out1,
                        uint8_t *out2,
                        uint8_t *out3,
                        size_t outlen,
                        keccakx4_state *state)
{
    keccakx4_squeeze(out0, out1, out2, out3, outlen, SHAKE256_RATE, state->s);
}

/* One-shot 4-way SHAKE128: absorb four inlen-byte inputs, then write outlen
 * bytes to each output. */
void shake128x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen)
{
    keccakx4_state state;

    shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen);
    shake128x4_squeeze(out0, out1, out2, out3, outlen, &state);
}

/* One-shot 4-way SHAKE256: absorb four inlen-byte inputs, then write outlen
 * bytes to each output. */
void shake256x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen)
{
    keccakx4_state state;

    shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen);
    shake256x4_squeeze(out0, out1, out2, out3, outlen, &state);
}