/search.css" rel="stylesheet" type="text/css"/> /search.js">
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
threefry.h
Go to the documentation of this file.
1 /*
2 Copyright 2010-2011, D. E. Shaw Research.
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 
9 * Redistributions of source code must retain the above copyright
10  notice, this list of conditions, and the following disclaimer.
11 
12 * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions, and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16 * Neither the name of D. E. Shaw Research nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #ifndef _threefry_dot_h_
33 #define _threefry_dot_h_
35 #include "array.h"
36 
38 /* Significant parts of this file were copied
39  from:
40  Skein_FinalRnd/ReferenceImplementation/skein.h
41  Skein_FinalRnd/ReferenceImplementation/skein_block.c
42 
43  in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
44 
45  This file has been modified so that it may no longer perform its originally
46  intended function. If you're looking for a Skein or Threefish source code,
47  please consult the original file.
48 
49  The original file had the following header:
50 **************************************************************************
51 **
52 ** Interface declarations and internal definitions for Skein hashing.
53 **
54 ** Source code author: Doug Whiting, 2008.
55 **
56 ** This algorithm and source code is released to the public domain.
57 **
58 ***************************************************************************
59 
60 */
61 
62 /* See comment at the top of philox.h for the macro pre-process
63  strategy. */
64 
65 /* Rotation constants: */
66 enum r123_enum_threefry64x4 {
67  /* These are the R_256 constants from the Threefish reference sources
68  with names changed to R_64x4... */
69  R_64x4_0_0=14, R_64x4_0_1=16,
70  R_64x4_1_0=52, R_64x4_1_1=57,
71  R_64x4_2_0=23, R_64x4_2_1=40,
72  R_64x4_3_0= 5, R_64x4_3_1=37,
73  R_64x4_4_0=25, R_64x4_4_1=33,
74  R_64x4_5_0=46, R_64x4_5_1=12,
75  R_64x4_6_0=58, R_64x4_6_1=22,
76  R_64x4_7_0=32, R_64x4_7_1=32
77 };
78 
79 enum r123_enum_threefry64x2 {
80  /*
81  // Output from skein_rot_search: (srs64_B64-X1000)
82  // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
83  // Start: Tue Mar 1 10:07:48 2011
84  // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
85  */
86  R_64x2_0_0=16,
87  R_64x2_1_0=42,
88  R_64x2_2_0=12,
89  R_64x2_3_0=31,
90  R_64x2_4_0=16,
91  R_64x2_5_0=32,
92  R_64x2_6_0=24,
93  R_64x2_7_0=21
94  /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
95  // 5 rounds: minHW = 8 [ 8 8 8 8 ]
96  // 6 rounds: minHW = 16 [ 16 16 16 16 ]
97  // 7 rounds: minHW = 32 [ 32 32 32 32 ]
98  // 8 rounds: minHW = 64 [ 64 64 64 64 ]
99  // 9 rounds: minHW = 64 [ 64 64 64 64 ]
100  //10 rounds: minHW = 64 [ 64 64 64 64 ]
101  //11 rounds: minHW = 64 [ 64 64 64 64 ] */
102 };
103 
104 enum r123_enum_threefry32x4 {
105  /* Output from skein_rot_search: (srs-B128-X5000.out)
106  // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
107  // Start: Mon Aug 24 22:41:36 2009
108  // ...
109  // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
110  R_32x4_0_0=10, R_32x4_0_1=26,
111  R_32x4_1_0=11, R_32x4_1_1=21,
112  R_32x4_2_0=13, R_32x4_2_1=27,
113  R_32x4_3_0=23, R_32x4_3_1= 5,
114  R_32x4_4_0= 6, R_32x4_4_1=20,
115  R_32x4_5_0=17, R_32x4_5_1=11,
116  R_32x4_6_0=25, R_32x4_6_1=10,
117  R_32x4_7_0=18, R_32x4_7_1=20
118 
119  /* 4 rounds: minHW = 3 [ 3 3 3 3 ]
120  // 5 rounds: minHW = 7 [ 7 7 7 7 ]
121  // 6 rounds: minHW = 12 [ 13 12 13 12 ]
122  // 7 rounds: minHW = 22 [ 22 23 22 23 ]
123  // 8 rounds: minHW = 31 [ 31 31 31 31 ]
124  // 9 rounds: minHW = 32 [ 32 32 32 32 ]
125  //10 rounds: minHW = 32 [ 32 32 32 32 ]
126  //11 rounds: minHW = 32 [ 32 32 32 32 ] */
127 
128 };
129 
130 enum r123_enum_threefry32x2 {
131  /* Output from skein_rot_search (srs32x2-X5000.out)
132  // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
133  // Start: Tue Jul 12 11:11:33 2011
134  // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
135  R_32x2_0_0=13,
136  R_32x2_1_0=15,
137  R_32x2_2_0=26,
138  R_32x2_3_0= 6,
139  R_32x2_4_0=17,
140  R_32x2_5_0=29,
141  R_32x2_6_0=16,
142  R_32x2_7_0=24
143 
144  /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
145  // 5 rounds: minHW = 6 [ 6 8 6 8 ]
146  // 6 rounds: minHW = 9 [ 9 12 9 12 ]
147  // 7 rounds: minHW = 16 [ 16 24 16 24 ]
148  // 8 rounds: minHW = 32 [ 32 32 32 32 ]
149  // 9 rounds: minHW = 32 [ 32 32 32 32 ]
150  //10 rounds: minHW = 32 [ 32 32 32 32 ]
151  //11 rounds: minHW = 32 [ 32 32 32 32 ] */
152  };
153 
154 enum r123_enum_threefry_wcnt {
155  WCNT2=2,
156  WCNT4=4
157 };
158 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
159 R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
160 {
161  return (x << (N & 63)) | (x >> ((64-N) & 63));
162 }
163 
164 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
165 R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
166 {
167  return (x << (N & 31)) | (x >> ((32-N) & 31));
168 }
169 
170 #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
171 #define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
172 #define SKEIN_KS_PARITY32 0x1BD11BDA
173 
174 #ifndef THREEFRY2x32_DEFAULT_ROUNDS
175 #define THREEFRY2x32_DEFAULT_ROUNDS 20
176 #endif
177 
178 #ifndef THREEFRY2x64_DEFAULT_ROUNDS
179 #define THREEFRY2x64_DEFAULT_ROUNDS 20
180 #endif
181 
182 #ifndef THREEFRY4x32_DEFAULT_ROUNDS
183 #define THREEFRY4x32_DEFAULT_ROUNDS 20
184 #endif
185 
186 #ifndef THREEFRY4x64_DEFAULT_ROUNDS
187 #define THREEFRY4x64_DEFAULT_ROUNDS 20
188 #endif
189 
190 #define _threefry2x_tpl(W) \
191 typedef struct r123array2x##W threefry2x##W##_ctr_t; \
192 typedef struct r123array2x##W threefry2x##W##_key_t; \
193 typedef struct r123array2x##W threefry2x##W##_ukey_t; \
194 R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
195 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
196 R123_CUDA_DEVICE R123_STATIC_INLINE \
197 threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
198  threefry2x##W##_ctr_t X; \
199  uint##W##_t ks[2+1]; \
200  int i; /* avoid size_t to avoid need for stddef.h */ \
201  R123_ASSERT(Nrounds<=32); \
202  ks[2] = SKEIN_KS_PARITY##W; \
203  for (i=0;i < 2; i++) \
204  { \
205  ks[i] = k.v[i]; \
206  X.v[i] = in.v[i]; \
207  ks[2] ^= k.v[i]; \
208  } \
209  \
210  /* Insert initial key before round 0 */ \
211  X.v[0] += ks[0]; X.v[1] += ks[1]; \
212  \
213  if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
214  if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
215  if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
216  if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
217  if(Nrounds>3){ \
218  /* InjectKey(r=1) */ \
219  X.v[0] += ks[1]; X.v[1] += ks[2]; \
220  X.v[1] += 1; /* X.v[2-1] += r */ \
221  } \
222  if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
223  if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
224  if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
225  if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
226  if(Nrounds>7){ \
227  /* InjectKey(r=2) */ \
228  X.v[0] += ks[2]; X.v[1] += ks[0]; \
229  X.v[1] += 2; \
230  } \
231  if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
232  if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
233  if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
234  if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
235  if(Nrounds>11){ \
236  /* InjectKey(r=3) */ \
237  X.v[0] += ks[0]; X.v[1] += ks[1]; \
238  X.v[1] += 3; \
239  } \
240  if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
241  if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
242  if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
243  if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
244  if(Nrounds>15){ \
245  /* InjectKey(r=4) */ \
246  X.v[0] += ks[1]; X.v[1] += ks[2]; \
247  X.v[1] += 4; \
248  } \
249  if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
250  if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
251  if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
252  if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
253  if(Nrounds>19){ \
254  /* InjectKey(r=5) */ \
255  X.v[0] += ks[2]; X.v[1] += ks[0]; \
256  X.v[1] += 5; \
257  } \
258  if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
259  if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
260  if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
261  if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
262  if(Nrounds>23){ \
263  /* InjectKey(r=6) */ \
264  X.v[0] += ks[0]; X.v[1] += ks[1]; \
265  X.v[1] += 6; \
266  } \
267  if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
268  if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
269  if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
270  if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
271  if(Nrounds>27){ \
272  /* InjectKey(r=7) */ \
273  X.v[0] += ks[1]; X.v[1] += ks[2]; \
274  X.v[1] += 7; \
275  } \
276  if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
277  if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
278  if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
279  if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
280  if(Nrounds>31){ \
281  /* InjectKey(r=8) */ \
282  X.v[0] += ks[2]; X.v[1] += ks[0]; \
283  X.v[1] += 8; \
284  } \
285  return X; \
286 } \
287  \
288 enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
289 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
290 R123_CUDA_DEVICE R123_STATIC_INLINE \
291 threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
292  return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
293 }
294 
295 
296 #define _threefry4x_tpl(W) \
297 typedef struct r123array4x##W threefry4x##W##_ctr_t; \
298 typedef struct r123array4x##W threefry4x##W##_key_t; \
299 typedef struct r123array4x##W threefry4x##W##_ukey_t; \
300 R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
301 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
302 R123_CUDA_DEVICE R123_STATIC_INLINE \
303 threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
304  threefry4x##W##_ctr_t X; \
305  uint##W##_t ks[4+1]; \
306  int i; /* avoid size_t to avoid need for stddef.h */ \
307  R123_ASSERT(Nrounds<=72); \
308  ks[4] = SKEIN_KS_PARITY##W; \
309  for (i=0;i < 4; i++) \
310  { \
311  ks[i] = k.v[i]; \
312  X.v[i] = in.v[i]; \
313  ks[4] ^= k.v[i]; \
314  } \
315  \
316  /* Insert initial key before round 0 */ \
317  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
318  \
319  if(Nrounds>0){ \
320  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
321  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
322  } \
323  if(Nrounds>1){ \
324  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
325  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
326  } \
327  if(Nrounds>2){ \
328  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
329  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
330  } \
331  if(Nrounds>3){ \
332  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
333  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
334  } \
335  if(Nrounds>3){ \
336  /* InjectKey(r=1) */ \
337  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
338  X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \
339  } \
340  \
341  if(Nrounds>4){ \
342  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
343  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
344  } \
345  if(Nrounds>5){ \
346  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
347  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
348  } \
349  if(Nrounds>6){ \
350  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
351  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
352  } \
353  if(Nrounds>7){ \
354  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
355  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
356  } \
357  if(Nrounds>7){ \
358  /* InjectKey(r=2) */ \
359  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
360  X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \
361  } \
362  \
363  if(Nrounds>8){ \
364  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
365  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
366  } \
367  if(Nrounds>9){ \
368  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
369  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
370  } \
371  if(Nrounds>10){ \
372  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
373  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
374  } \
375  if(Nrounds>11){ \
376  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
377  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
378  } \
379  if(Nrounds>11){ \
380  /* InjectKey(r=3) */ \
381  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
382  X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \
383  } \
384  \
385  if(Nrounds>12){ \
386  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
387  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
388  } \
389  if(Nrounds>13){ \
390  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
391  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
392  } \
393  if(Nrounds>14){ \
394  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
395  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
396  } \
397  if(Nrounds>15){ \
398  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
399  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
400  } \
401  if(Nrounds>15){ \
402  /* InjectKey(r=1) */ \
403  X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
404  X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \
405  } \
406  \
407  if(Nrounds>16){ \
408  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
409  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
410  } \
411  if(Nrounds>17){ \
412  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
413  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
414  } \
415  if(Nrounds>18){ \
416  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
417  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
418  } \
419  if(Nrounds>19){ \
420  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
421  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
422  } \
423  if(Nrounds>19){ \
424  /* InjectKey(r=1) */ \
425  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
426  X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \
427  } \
428  \
429  if(Nrounds>20){ \
430  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
431  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
432  } \
433  if(Nrounds>21){ \
434  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
435  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
436  } \
437  if(Nrounds>22){ \
438  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
439  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
440  } \
441  if(Nrounds>23){ \
442  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
443  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
444  } \
445  if(Nrounds>23){ \
446  /* InjectKey(r=1) */ \
447  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
448  X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \
449  } \
450  \
451  if(Nrounds>24){ \
452  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
453  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
454  } \
455  if(Nrounds>25){ \
456  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
457  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
458  } \
459  if(Nrounds>26){ \
460  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
461  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
462  } \
463  if(Nrounds>27){ \
464  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
465  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
466  } \
467  if(Nrounds>27){ \
468  /* InjectKey(r=1) */ \
469  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
470  X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \
471  } \
472  \
473  if(Nrounds>28){ \
474  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
475  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
476  } \
477  if(Nrounds>29){ \
478  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
479  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
480  } \
481  if(Nrounds>30){ \
482  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
483  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
484  } \
485  if(Nrounds>31){ \
486  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
487  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
488  } \
489  if(Nrounds>31){ \
490  /* InjectKey(r=1) */ \
491  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
492  X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \
493  } \
494  \
495  if(Nrounds>32){ \
496  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
497  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
498  } \
499  if(Nrounds>33){ \
500  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
501  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
502  } \
503  if(Nrounds>34){ \
504  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
505  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
506  } \
507  if(Nrounds>35){ \
508  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
509  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
510  } \
511  if(Nrounds>35){ \
512  /* InjectKey(r=1) */ \
513  X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
514  X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \
515  } \
516  \
517  if(Nrounds>36){ \
518  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
519  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
520  } \
521  if(Nrounds>37){ \
522  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
523  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
524  } \
525  if(Nrounds>38){ \
526  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
527  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
528  } \
529  if(Nrounds>39){ \
530  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
531  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
532  } \
533  if(Nrounds>39){ \
534  /* InjectKey(r=1) */ \
535  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
536  X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \
537  } \
538  \
539  if(Nrounds>40){ \
540  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
541  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
542  } \
543  if(Nrounds>41){ \
544  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
545  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
546  } \
547  if(Nrounds>42){ \
548  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
549  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
550  } \
551  if(Nrounds>43){ \
552  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
553  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
554  } \
555  if(Nrounds>43){ \
556  /* InjectKey(r=1) */ \
557  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
558  X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \
559  } \
560  \
561  if(Nrounds>44){ \
562  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
563  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
564  } \
565  if(Nrounds>45){ \
566  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
567  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
568  } \
569  if(Nrounds>46){ \
570  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
571  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
572  } \
573  if(Nrounds>47){ \
574  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
575  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
576  } \
577  if(Nrounds>47){ \
578  /* InjectKey(r=1) */ \
579  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
580  X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \
581  } \
582  \
583  if(Nrounds>48){ \
584  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
585  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
586  } \
587  if(Nrounds>49){ \
588  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
589  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
590  } \
591  if(Nrounds>50){ \
592  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
593  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
594  } \
595  if(Nrounds>51){ \
596  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
597  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
598  } \
599  if(Nrounds>51){ \
600  /* InjectKey(r=1) */ \
601  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
602  X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \
603  } \
604  \
605  if(Nrounds>52){ \
606  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
607  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
608  } \
609  if(Nrounds>53){ \
610  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
611  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
612  } \
613  if(Nrounds>54){ \
614  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
615  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
616  } \
617  if(Nrounds>55){ \
618  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
619  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
620  } \
621  if(Nrounds>55){ \
622  /* InjectKey(r=1) */ \
623  X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
624  X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \
625  } \
626  \
627  if(Nrounds>56){ \
628  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
629  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
630  } \
631  if(Nrounds>57){ \
632  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
633  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
634  } \
635  if(Nrounds>58){ \
636  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
637  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
638  } \
639  if(Nrounds>59){ \
640  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
641  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
642  } \
643  if(Nrounds>59){ \
644  /* InjectKey(r=1) */ \
645  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
646  X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \
647  } \
648  \
649  if(Nrounds>60){ \
650  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
651  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
652  } \
653  if(Nrounds>61){ \
654  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
655  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
656  } \
657  if(Nrounds>62){ \
658  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
659  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
660  } \
661  if(Nrounds>63){ \
662  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
663  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
664  } \
665  if(Nrounds>63){ \
666  /* InjectKey(r=1) */ \
667  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
668  X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \
669  } \
670  \
671  if(Nrounds>64){ \
672  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
673  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
674  } \
675  if(Nrounds>65){ \
676  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
677  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
678  } \
679  if(Nrounds>66){ \
680  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
681  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
682  } \
683  if(Nrounds>67){ \
684  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
685  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
686  } \
687  if(Nrounds>67){ \
688  /* InjectKey(r=1) */ \
689  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
690  X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \
691  } \
692  \
693  if(Nrounds>68){ \
694  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
695  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
696  } \
697  if(Nrounds>69){ \
698  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
699  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
700  } \
701  if(Nrounds>70){ \
702  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
703  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
704  } \
705  if(Nrounds>71){ \
706  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
707  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
708  } \
709  if(Nrounds>71){ \
710  /* InjectKey(r=1) */ \
711  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
712  X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \
713  } \
714  \
715  return X; \
716 } \
717  \
718 enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
719 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
720 R123_CUDA_DEVICE R123_STATIC_INLINE \
721 threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
722  return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
723 }
724 
726 _threefry2x_tpl(64)
727 _threefry2x_tpl(32)
728 _threefry4x_tpl(64)
729 _threefry4x_tpl(32)
730 
731 /* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
732  than a static inline function. Why? */
733 #define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
734 #define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
735 #define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
736 #define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
737 
738 #ifdef __cplusplus
739 
740 #define _threefryNxWclass_tpl(NxW) \
741 namespace r123{ \
742 template<unsigned int R> \
743  struct Threefry##NxW##_R{ \
744  typedef threefry##NxW##_ctr_t ctr_type; \
745  typedef threefry##NxW##_key_t key_type; \
746  typedef threefry##NxW##_key_t ukey_type; \
747  static const unsigned int rounds=R; \
748  inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
749  R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \
750  return threefry##NxW##_R(R, ctr, key); \
751  } \
752 }; \
753  typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
754 } // namespace r123
755 
758 _threefryNxWclass_tpl(2x32)
759 _threefryNxWclass_tpl(4x32)
760 _threefryNxWclass_tpl(2x64)
761 _threefryNxWclass_tpl(4x64)
762 
763 /* The _tpl macros don't quite work to do string-pasting inside comments,
764  so we just write out the boilerplate documentation four times... */
765 
862 #endif
863 
864 #endif