00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #ifndef _threefry_dot_h_
00033 #define _threefry_dot_h_
00034 #include "features/compilerfeatures.h"
00035 #include "array.h"
00036
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
/*
 * Rotation distances for the 4-word, 64-bit variant.
 * R_64x4_<r>_<j> is the j-th rotation (j = 0, 1) used in round r;
 * _threefry4x_tpl cycles through r = 0..7 repeatedly.
 */
enum {
    R_64x4_0_0 = 14,
    R_64x4_0_1 = 16,
    R_64x4_1_0 = 52,
    R_64x4_1_1 = 57,
    R_64x4_2_0 = 23,
    R_64x4_2_1 = 40,
    R_64x4_3_0 = 5,
    R_64x4_3_1 = 37,
    R_64x4_4_0 = 25,
    R_64x4_4_1 = 33,
    R_64x4_5_0 = 46,
    R_64x4_5_1 = 12,
    R_64x4_6_0 = 58,
    R_64x4_6_1 = 22,
    R_64x4_7_0 = 32,
    R_64x4_7_1 = 32
};
00078
/*
 * Rotation distances for the 2-word, 64-bit variant.
 * R_64x2_<r>_0 is the rotation used in round r; _threefry2x_tpl cycles
 * through r = 0..7.  Each row below is one group of four rounds, i.e. the
 * rounds executed between two consecutive key injections.
 */
enum {
    R_64x2_0_0 = 16, R_64x2_1_0 = 42, R_64x2_2_0 = 12, R_64x2_3_0 = 31,
    R_64x2_4_0 = 16, R_64x2_5_0 = 32, R_64x2_6_0 = 24, R_64x2_7_0 = 21
};
00103
/*
 * Rotation distances for the 4-word, 32-bit variant.
 * R_32x4_<r>_<j> is the j-th rotation (j = 0, 1) used in round r;
 * _threefry4x_tpl cycles through r = 0..7 repeatedly.
 */
enum {
    R_32x4_0_0 = 10,
    R_32x4_0_1 = 26,
    R_32x4_1_0 = 11,
    R_32x4_1_1 = 21,
    R_32x4_2_0 = 13,
    R_32x4_2_1 = 27,
    R_32x4_3_0 = 23,
    R_32x4_3_1 = 5,
    R_32x4_4_0 = 6,
    R_32x4_4_1 = 20,
    R_32x4_5_0 = 17,
    R_32x4_5_1 = 11,
    R_32x4_6_0 = 25,
    R_32x4_6_1 = 10,
    R_32x4_7_0 = 18,
    R_32x4_7_1 = 20
};
00129
/*
 * Rotation distances for the 2-word, 32-bit variant.
 * R_32x2_<r>_0 is the rotation used in round r; _threefry2x_tpl cycles
 * through r = 0..7.  Each row below is one group of four rounds, i.e. the
 * rounds executed between two consecutive key injections.
 */
enum {
    R_32x2_0_0 = 13, R_32x2_1_0 = 15, R_32x2_2_0 = 26, R_32x2_3_0 = 6,
    R_32x2_4_0 = 17, R_32x2_5_0 = 29, R_32x2_6_0 = 16, R_32x2_7_0 = 24
};
00153
/* Number of words in a counter/key block for the 2x and 4x variants. */
enum { WCNT2 = 2, WCNT4 = 4 };
00158 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
00159 R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
00160 {
00161 return (x << (N & 63)) | (x >> ((64-N) & 63));
00162 }
00163
00164 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
00165 R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
00166 {
00167 return (x << (N & 31)) | (x >> ((32-N) & 31));
00168 }
00169
/* Assemble a 64-bit value from a high and a low 32-bit half. */
#define SKEIN_MK_64(hi32,lo32) ((((uint64_t) (hi32)) << 32) + (lo32))
/*
 * Parity constants: the extra key-schedule word ks[WCNT] starts from this
 * value and is then XORed with every key word (see the _tpl macros).
 * The 32-bit constant equals the high half of the 64-bit one.
 */
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
00173
/*
 * Default round counts used by the threefryNxW() entry points.
 * Each may be overridden by defining the macro before including this header.
 */
#if !defined(THREEFRY2x32_DEFAULT_ROUNDS)
#define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif

#if !defined(THREEFRY2x64_DEFAULT_ROUNDS)
#define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif

#if !defined(THREEFRY4x32_DEFAULT_ROUNDS)
#define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif

#if !defined(THREEFRY4x64_DEFAULT_ROUNDS)
#define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
00189
/*
 * Helper macros for _threefry2x_tpl.  They are expanded inside the body of
 * threefry2x<W>_R and refer to its locals X, ks and Nrounds by name.
 */

/*
 * Round number n (0-based), executed only when Nrounds > n, using rotation
 * constant R_<W>x2_<r>_0.  The braces are essential: without them only the
 * first statement (the add) is conditional and the rotate/xor of every
 * round would run regardless of Nrounds.
 */
#define _threefry2x_round(W, n, r) \
    if(Nrounds>(n)){ \
        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_##r##_0); X.v[1] ^= X.v[0]; \
    }

/*
 * Key injection number s, executed only when Nrounds > n: adds ks[k0] and
 * ks[k1] to the two words and the injection number s to the last word.
 */
#define _threefry2x_inject(n, s, k0, k1) \
    if(Nrounds>(n)){ \
        X.v[0] += ks[k0]; X.v[1] += ks[k1]; \
        X.v[1] += s; \
    }

/*
 * _threefry2x_tpl(W) - generate the 2-word Threefry API for word width W
 * (instantiated in this file with W = 64 and W = 32).
 *
 * Defines:
 *   threefry2x<W>_ctr_t/_key_t/_ukey_t  typedefs of struct r123array2x<W>
 *   threefry2x<W>keyinit(uk)            returns the user key unchanged
 *   threefry2x<W>_R(Nrounds, in, k)     encrypts counter `in` under key `k`
 *                                       with Nrounds rounds (asserted <= 20)
 *   threefry2x<W>_rounds                THREEFRY2x<W>_DEFAULT_ROUNDS
 *   threefry2x<W>(in, k)                _R with the default round count
 *
 * The rounds are fully unrolled; after every fourth executed round a key
 * injection is applied.
 *
 * Fix relative to the previous revision: each round used to be written as
 * `if(Nrounds>n) a; b; c;`, so only the first statement was guarded and the
 * rotate/xor of all 20 rounds ran even for Nrounds < 20.  Results for
 * Nrounds == 20 are unchanged; results for smaller Nrounds now really use
 * only the requested number of rounds.
 */
#define _threefry2x_tpl(W) \
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
typedef struct r123array2x##W threefry2x##W##_key_t; \
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    threefry2x##W##_ctr_t X; \
    uint##W##_t ks[WCNT2+1]; \
    int i; \
    R123_ASSERT(Nrounds<=20); \
    /* ks[0..1] = key words, ks[2] = parity constant XOR all key words */ \
    ks[WCNT2] = SKEIN_KS_PARITY##W; \
    for (i=0;i < WCNT2; i++) \
    { \
        ks[i] = k.v[i]; \
        X.v[i] = in.v[i]; \
        ks[WCNT2] ^= k.v[i]; \
    } \
    \
    /* initial key injection (always performed) */ \
    X.v[0] += ks[0]; X.v[1] += ks[1]; \
    \
    _threefry2x_round(W,  0, 0) \
    _threefry2x_round(W,  1, 1) \
    _threefry2x_round(W,  2, 2) \
    _threefry2x_round(W,  3, 3) \
    _threefry2x_inject(3, 1, 1, 2) \
    \
    _threefry2x_round(W,  4, 4) \
    _threefry2x_round(W,  5, 5) \
    _threefry2x_round(W,  6, 6) \
    _threefry2x_round(W,  7, 7) \
    _threefry2x_inject(7, 2, 2, 0) \
    \
    _threefry2x_round(W,  8, 0) \
    _threefry2x_round(W,  9, 1) \
    _threefry2x_round(W, 10, 2) \
    _threefry2x_round(W, 11, 3) \
    _threefry2x_inject(11, 3, 0, 1) \
    \
    _threefry2x_round(W, 12, 4) \
    _threefry2x_round(W, 13, 5) \
    _threefry2x_round(W, 14, 6) \
    _threefry2x_round(W, 15, 7) \
    _threefry2x_inject(15, 4, 1, 2) \
    \
    _threefry2x_round(W, 16, 0) \
    _threefry2x_round(W, 17, 1) \
    _threefry2x_round(W, 18, 2) \
    _threefry2x_round(W, 19, 3) \
    _threefry2x_inject(19, 5, 2, 0) \
    \
    return X; \
} \
 \
enum { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
}
00268
00269
/*
 * Helper macros for _threefry4x_tpl.  They are expanded inside the body of
 * threefry4x<W>_R and refer to its locals X, ks and Nrounds by name.
 */

/* One mix step on words a and b: add, rotate b left by rot, xor. */
#define _threefry4x_mix(W, a, b, rot) \
    X.v[a] += X.v[b]; X.v[b] = RotL_##W(X.v[b],rot); X.v[b] ^= X.v[a];

/* Round n with rotation row r, pairing words (0,1) and (2,3); runs only if Nrounds > n. */
#define _threefry4x_even(W, n, r) \
    if(Nrounds>(n)){ \
        _threefry4x_mix(W, 0, 1, R_##W##x4_##r##_0) \
        _threefry4x_mix(W, 2, 3, R_##W##x4_##r##_1) \
    }

/* Round n with rotation row r, pairing words (0,3) and (2,1); runs only if Nrounds > n. */
#define _threefry4x_odd(W, n, r) \
    if(Nrounds>(n)){ \
        _threefry4x_mix(W, 0, 3, R_##W##x4_##r##_0) \
        _threefry4x_mix(W, 2, 1, R_##W##x4_##r##_1) \
    }

/*
 * Key injection number s; runs only if Nrounds > n.  Adds ks[k0..k3] to the
 * four words and the injection number s to the last word.
 */
#define _threefry4x_inject(n, s, k0, k1, k2, k3) \
    if(Nrounds>(n)){ \
        X.v[0] += ks[k0]; X.v[1] += ks[k1]; X.v[2] += ks[k2]; X.v[3] += ks[k3]; \
        X.v[WCNT4-1] += s; \
    }

/* Rounds n..n+3 using rotation rows 0..3, followed by key injection s. */
#define _threefry4x_quad_lo(W, n, s, k0, k1, k2, k3) \
    _threefry4x_even(W, (n)+0, 0) \
    _threefry4x_odd (W, (n)+1, 1) \
    _threefry4x_even(W, (n)+2, 2) \
    _threefry4x_odd (W, (n)+3, 3) \
    _threefry4x_inject((n)+3, s, k0, k1, k2, k3)

/* Rounds n..n+3 using rotation rows 4..7, followed by key injection s. */
#define _threefry4x_quad_hi(W, n, s, k0, k1, k2, k3) \
    _threefry4x_even(W, (n)+0, 4) \
    _threefry4x_odd (W, (n)+1, 5) \
    _threefry4x_even(W, (n)+2, 6) \
    _threefry4x_odd (W, (n)+3, 7) \
    _threefry4x_inject((n)+3, s, k0, k1, k2, k3)

/*
 * _threefry4x_tpl(W) - generate the 4-word Threefry API for word width W
 * (instantiated in this file with W = 64 and W = 32).
 *
 * Defines:
 *   threefry4x<W>_ctr_t/_key_t/_ukey_t  typedefs of struct r123array4x<W>
 *   threefry4x<W>keyinit(uk)            returns the user key unchanged
 *   threefry4x<W>_R(Nrounds, in, k)     encrypts counter `in` under key `k`
 *                                       with Nrounds rounds (asserted <= 72)
 *   threefry4x<W>_rounds                THREEFRY4x<W>_DEFAULT_ROUNDS
 *   threefry4x<W>(in, k)                _R with the default round count
 *
 * The rounds are fully unrolled as 18 groups of four; every individual round
 * and every key injection is guarded by its own test of Nrounds.  Injection
 * s adds ks[(s+i) % 5] to word i and s to the last word.
 */
#define _threefry4x_tpl(W) \
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
typedef struct r123array4x##W threefry4x##W##_key_t; \
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
    threefry4x##W##_ctr_t X; \
    uint##W##_t ks[WCNT4+1]; \
    int i; \
    R123_ASSERT(Nrounds<=72); \
    /* ks[0..3] = key words, ks[4] = parity constant XOR all key words */ \
    ks[WCNT4] = SKEIN_KS_PARITY##W; \
    for (i=0;i < WCNT4; i++) \
    { \
        ks[i] = k.v[i]; \
        X.v[i] = in.v[i]; \
        ks[WCNT4] ^= k.v[i]; \
    } \
    \
    /* initial key injection (always performed) */ \
    X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
    \
    /*                     W  first  inject  key words      */ \
    /*                        round  number  ks[...]        */ \
    _threefry4x_quad_lo(W,  0,  1, 1, 2, 3, 4) \
    _threefry4x_quad_hi(W,  4,  2, 2, 3, 4, 0) \
    _threefry4x_quad_lo(W,  8,  3, 3, 4, 0, 1) \
    _threefry4x_quad_hi(W, 12,  4, 4, 0, 1, 2) \
    _threefry4x_quad_lo(W, 16,  5, 0, 1, 2, 3) \
    _threefry4x_quad_hi(W, 20,  6, 1, 2, 3, 4) \
    _threefry4x_quad_lo(W, 24,  7, 2, 3, 4, 0) \
    _threefry4x_quad_hi(W, 28,  8, 3, 4, 0, 1) \
    _threefry4x_quad_lo(W, 32,  9, 4, 0, 1, 2) \
    _threefry4x_quad_hi(W, 36, 10, 0, 1, 2, 3) \
    _threefry4x_quad_lo(W, 40, 11, 1, 2, 3, 4) \
    _threefry4x_quad_hi(W, 44, 12, 2, 3, 4, 0) \
    _threefry4x_quad_lo(W, 48, 13, 3, 4, 0, 1) \
    _threefry4x_quad_hi(W, 52, 14, 4, 0, 1, 2) \
    _threefry4x_quad_lo(W, 56, 15, 0, 1, 2, 3) \
    _threefry4x_quad_hi(W, 60, 16, 1, 2, 3, 4) \
    _threefry4x_quad_lo(W, 64, 17, 2, 3, 4, 0) \
    _threefry4x_quad_hi(W, 68, 18, 3, 4, 0, 1) \
    \
    return X; \
} \
 \
enum { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
    return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
}
00698
00700 _threefry2x_tpl(64)
00701 _threefry2x_tpl(32)
00702 _threefry4x_tpl(64)
00703 _threefry4x_tpl(32)
00704
00705
00706
/*
 * Function-like macros that take precedence over the same-named inline
 * functions: a call threefryNxW(ctr, key) expands directly to the _R
 * function with the default round count for that shape.
 */
#define threefry2x32(ctr,key) threefry2x32_R(threefry2x32_rounds, ctr, key)
#define threefry2x64(ctr,key) threefry2x64_R(threefry2x64_rounds, ctr, key)
#define threefry4x32(ctr,key) threefry4x32_R(threefry4x32_rounds, ctr, key)
#define threefry4x64(ctr,key) threefry4x64_R(threefry4x64_rounds, ctr, key)
00711
00712 #ifdef __cplusplus
00713
/*
 * _threefryNxWclass_tpl(NxW) - generate the C++ wrappers for shape NxW
 * (e.g. 2x32) inside namespace r123:
 *
 *   r123::Threefry<NxW>_R<R>  function object with a compile-time round
 *                             count R; operator()(ctr, key) forwards to
 *                             threefry<NxW>_R(R, ctr, key).
 *   r123::Threefry<NxW>       the same with R = threefry<NxW>_rounds.
 *
 * The static assertion rejects R > 72, the largest count any of the C
 * functions is unrolled for; its message now states that same limit (it
 * previously said 20 while testing 72).
 * NOTE(review): the 2x C functions additionally R123_ASSERT(Nrounds<=20),
 * so for 2x shapes an R in 21..72 passes this compile-time check but not
 * that assertion.
 */
#define _threefryNxWclass_tpl(NxW) \
namespace r123{ \
template<unsigned int R> \
 struct Threefry##NxW##_R{ \
    typedef threefry##NxW##_ctr_t ctr_type; \
    typedef threefry##NxW##_key_t key_type; \
    typedef threefry##NxW##_key_t ukey_type; \
    static const unsigned int rounds=R; \
   inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
        R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \
        return threefry##NxW##_R(R, ctr, key); \
    } \
}; \
 typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
} /* namespace r123 */
00729
00732 _threefryNxWclass_tpl(2x32)
00733 _threefryNxWclass_tpl(4x32)
00734 _threefryNxWclass_tpl(2x64)
00735 _threefryNxWclass_tpl(4x64)
00736
00737
00738
00739
00831 #endif
00832
00833 #endif