00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #ifndef _philox_dot_h_
00033 #define _philox_dot_h_
00034
00037 #include "features/compilerfeatures.h"
00038 #include "array.h"
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
/*
 * _mulhilo_dword_tpl(W, Word, Dword)
 *
 * Defines mulhilo<W>(a, b, hip) on top of a wider integer type.
 *   W     - bit width of Word; also pasted into the function name.
 *   Word  - operand/result type.
 *   Dword - type used for the full product; it must be able to hold 2*W
 *           bits for the result to be meaningful.
 *
 * The generated function multiplies a and b in Dword, writes the bits above
 * the low W through hip, and returns the product truncated to Word.
 */
#define _mulhilo_dword_tpl(W, Word, Dword) \
R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    Dword product = ((Dword)a)*((Dword)b); \
    /* Narrow explicitly, as the return below does, so the Dword -> Word */ \
    /* conversion is visibly intentional and does not trip conversion warnings. */ \
    *hip = (Word)(product>>W); \
    return (Word)product; \
}
00073
00074
00075
00076
00077
00078
00079
/*
 * _mulhilo_asm_tpl(W, Word, INSN)
 *
 * Defines mulhilo<W>(ax, b, hip) around one inline-assembly instruction.
 * INSN is a string literal holding the mnemonic; it is concatenated with the
 * operand template.  Two variants exist, chosen on __powerpc__.
 */
#ifdef __powerpc__
/*
 * PowerPC variant: INSN is issued with three register operands (output,
 * b, ax).  Its output is stored through hip, and the returned word is
 * computed in plain C as ax*b.
 */
#define _mulhilo_asm_tpl(W, Word, INSN) \
R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
    Word high = 0; \
    __asm__("\n\t" INSN " %0,%1,%2\n\t" \
            : "=r"(high) \
            : "r"(b), "r"(ax)); \
    *hip = high; \
    return ax*b; \
}
#else
/*
 * Non-PowerPC variant: INSN is issued with the single operand b.  ax is tied
 * to the "a" constraint register as both input and output, and the "d"
 * constraint register is a second output.  After the instruction the "d"
 * value is stored through hip and the updated ax is returned.
 */
#define _mulhilo_asm_tpl(W, Word, INSN) \
R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
    Word high; \
    __asm__("\n\t" INSN " %2\n\t" \
            : "=a"(ax), "=d"(high) \
            : "r"(b), "0"(ax)); \
    *hip = high; \
    return ax; \
}
#endif
00105
00106
00107
00108
00109
00110
/*
 * _mulhilo_msvc_intrin_tpl(W, Word, INTRIN)
 *
 * Defines mulhilo<W>(a, b, hip) as a thin wrapper over an intrinsic with the
 * calling shape INTRIN(a, b, hip): the intrinsic is handed hip directly and
 * its return value becomes the return value of mulhilo<W>.
 */
#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    const Word low = INTRIN(a, b, hip); \
    return low; \
}
00115
00116
00117
/*
 * _mulhilo_cuda_intrin_tpl(W, Word, INTRIN)
 *
 * Defines mulhilo<W>(a, b, hip) for intrinsics with the calling shape
 * INTRIN(a, b) that yield only one word: that word is stored through hip,
 * and the returned word is computed separately as a*b.
 */
#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    const Word high = INTRIN(a, b); \
    *hip = high; \
    return a*b; \
}
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
/*
 * _mulhilo_c99_tpl(W, Word)
 *
 * Defines mulhilo<W>(a, b, hip) using only arithmetic on Word itself, with
 * no wider type, intrinsic or assembly.  Each operand is split into two
 * W/2-bit halves; the high word is rebuilt from the half-by-half partial
 * products plus the carries that leave the middle column.  The low word is
 * simply a*b in Word.
 */
#define _mulhilo_c99_tpl(W, Word) \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
    const unsigned half = W/2; \
    const Word mask = ((((Word)1)<<half)-1); \
    const Word low = a*b; \
    const Word a_hi = a>>half; \
    const Word a_lo = a&mask; \
    const Word b_hi = b>>half; \
    const Word b_lo = b&mask; \
    /* the two cross products, each weighted by 2^(W/2) in the full product */ \
    const Word cross_ab = a_hi*b_lo; \
    const Word cross_ba = a_lo*b_hi; \
    /* sum of the low halves of both cross products; can spill into bit W/2 */ \
    const Word cross_low_sum = ((cross_ab&mask) + (cross_ba&mask)); \
    Word high = a_hi*b_hi + (cross_ab>>half) + (cross_ba>>half); \
    high += cross_low_sum >> half; \
    /* extra carry: the upper half of low wrapped when the cross sum was added */ \
    high += ((low >> half) < (cross_low_sum&mask)); \
    *hip = high; \
    return low; \
}
00158
00159
00160
00161
00162
00163
/*
 * _mulhilo_fail_tpl(W, Word)
 *
 * Placeholder definition of mulhilo<W>() used when no real implementation
 * was selected.  Its body consists solely of R123_STATIC_ASSERT(0, ...), with
 * a message naming the missing mulhilo<W>; the function computes nothing and
 * has no return statement.
 * NOTE(review): R123_STATIC_ASSERT is defined outside this file; it is
 * presumably a compile-time assertion, so instantiating this template is
 * expected to stop the build - confirm against its definition.
 */
#define _mulhilo_fail_tpl(W, Word) \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
    R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
}
00168
00169
00170
00171
00172
00173
00174 #if R123_USE_MULHILO32_ASM
00175 #ifdef __powerpc__
00176 _mulhilo_asm_tpl(32, uint32_t, "mulhwu")
00177 #else
00178 _mulhilo_asm_tpl(32, uint32_t, "mull")
00179 #endif
00180 #else
00181 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
00182 #endif
00183
00184 #if R123_USE_PHILOX_64BIT
00185 #if R123_USE_MULHILO64_ASM
00186 #ifdef __powerpc64__
00187 _mulhilo_asm_tpl(64, uint64_t, "mulhdu")
00188 #else
00189 _mulhilo_asm_tpl(64, uint64_t, "mulq")
00190 #endif
00191 #elif R123_USE_MULHILO64_MSVC_INTRIN
00192 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
00193 #elif R123_USE_MULHILO64_CUDA_INTRIN
00194 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
00195 #elif R123_USE_MULHILO64_OPENCL_INTRIN
00196 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
00197 #elif R123_USE_MULHILO64_MULHI_INTRIN
00198 _mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
00199 #elif R123_USE_GNU_UINT128
00200 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
00201 #elif R123_USE_MULHILO64_C99
00202 _mulhilo_c99_tpl(64, uint64_t)
00203 #else
00204 _mulhilo_fail_tpl(64, uint64_t)
00205 #endif
00206 #endif
00207
00208
00209
00210
00211
00212
00213
00214
00215
/*
 * Tunable Philox constants.  Every definition is guarded, so a value defined
 * before this point takes precedence over the default given here.
 *
 * PHILOX_M<N>x<W>_<i>       - multipliers handed to mulhilo<W>() by the round
 *                             functions.
 * PHILOX_W<W>_<i>           - increments added to the key words by the
 *                             bumpkey functions.
 * PHILOX<N>x<W>_DEFAULT_ROUNDS - value given to the philox<N>x<W>_rounds
 *                             enumerator.
 *
 * NOTE(review): R123_64BIT is defined outside this file; it presumably turns
 * its argument into a 64-bit integer constant - confirm.
 */

/* Multipliers, 64-bit words. */
#if !defined(PHILOX_M2x64_0)
#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
#endif

#if !defined(PHILOX_M4x64_0)
#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
#endif

#if !defined(PHILOX_M4x64_1)
#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
#endif

/* Multipliers, 32-bit words. */
#if !defined(PHILOX_M2x32_0)
#define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
#endif

#if !defined(PHILOX_M4x32_0)
#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
#endif
#if !defined(PHILOX_M4x32_1)
#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
#endif

/* Key increments, 64-bit words. */
#if !defined(PHILOX_W64_0)
#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15)
#endif
#if !defined(PHILOX_W64_1)
#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B)
#endif

/* Key increments, 32-bit words. */
#if !defined(PHILOX_W32_0)
#define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
#endif
#if !defined(PHILOX_W32_1)
#define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
#endif

/* Default round counts, one per generator flavour. */
#if !defined(PHILOX2x32_DEFAULT_ROUNDS)
#define PHILOX2x32_DEFAULT_ROUNDS 10
#endif

#if !defined(PHILOX2x64_DEFAULT_ROUNDS)
#define PHILOX2x64_DEFAULT_ROUNDS 10
#endif

#if !defined(PHILOX4x32_DEFAULT_ROUNDS)
#define PHILOX4x32_DEFAULT_ROUNDS 10
#endif

#if !defined(PHILOX4x64_DEFAULT_ROUNDS)
#define PHILOX4x64_DEFAULT_ROUNDS 10
#endif
00268
00269
00270
/*
 * _philox2xWround_tpl(W, T)
 *
 * Declares (with R123_FORCE_INLINE) and defines _philox2x<W>round(ctr, key),
 * one round of the two-word generator.  T is the W-bit word type.
 *
 * The round multiplies ctr.v[0] by PHILOX_M2x<W>_0 via mulhilo<W>().  The
 * result holds, in order: the high product word XORed with key.v[0] and
 * ctr.v[1], then the low product word.
 */
#define _philox2xWround_tpl(W, T) \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
    T prod_hi; \
    T prod_lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &prod_hi); \
    T mixed = prod_hi^key.v[0]^ctr.v[1]; \
    struct r123array2x##W out = {{mixed, prod_lo}}; \
    return out; \
}
/*
 * _philox2xWbumpkey_tpl(W)
 *
 * Defines _philox2x<W>bumpkey(key): returns a copy of the one-word key with
 * PHILOX_W<W>_0 added to its single word.  The key is taken by value, so the
 * caller's key is left unchanged.
 */
#define _philox2xWbumpkey_tpl(W) \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
    key.v[0] = key.v[0] + PHILOX_W##W##_0; \
    return key; \
}
00284
/*
 * _philox4xWround_tpl(W, T)
 *
 * Declares (with R123_FORCE_INLINE) and defines _philox4x<W>round(ctr, key),
 * one round of the four-word generator.  T is the W-bit word type.
 *
 * Two multiplications are performed through mulhilo<W>():
 *   PHILOX_M4x<W>_0 * ctr.v[0]  and  PHILOX_M4x<W>_1 * ctr.v[2].
 * The output words are, in order:
 *   high(second) ^ ctr.v[1] ^ key.v[0],  low(second),
 *   high(first)  ^ ctr.v[3] ^ key.v[1],  low(first).
 */
#define _philox4xWround_tpl(W, T) \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
    T first_hi; \
    T second_hi; \
    T first_lo = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &first_hi); \
    T second_lo = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &second_hi); \
    T word0 = second_hi^ctr.v[1]^key.v[0]; \
    T word2 = first_hi^ctr.v[3]^key.v[1]; \
    struct r123array4x##W out = {{word0, second_lo, word2, first_lo}}; \
    return out; \
}
00296
/*
 * _philox4xWbumpkey_tpl(W)
 *
 * Defines _philox4x<W>bumpkey(key): returns a copy of the two-word key with
 * PHILOX_W<W>_0 added to word 0 and PHILOX_W<W>_1 added to word 1.  The key
 * is taken by value, so the caller's key is left unchanged.
 */
#define _philox4xWbumpkey_tpl(W) \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
    key.v[0] = key.v[0] + PHILOX_W##W##_0; \
    key.v[1] = key.v[1] + PHILOX_W##W##_1; \
    return key; \
}
00303
/*
 * _philoxNxW_tpl(N, Nhalf, W, T)
 *
 * Generates the public C interface of one generator flavour:
 *   - enumerator philox<N>x<W>_rounds, set to PHILOX<N>x<W>_DEFAULT_ROUNDS;
 *   - typedefs philox<N>x<W>_ctr_t (N words) and philox<N>x<W>_key_t /
 *     philox<N>x<W>_ukey_t (Nhalf words each);
 *   - philox<N>x<W>keyinit(uk), which returns uk unchanged;
 *   - philox<N>x<W>_R(R, ctr, key), which applies R rounds to ctr.
 *
 * philox<N>x<W>_R is written out as 16 explicit steps rather than a loop.
 * The first step runs the round function with the key as given; every later
 * step first bumps the key and then runs the round function.  R123_ASSERT
 * checks R <= 16; no more than 16 rounds are ever executed.
 * T is accepted for symmetry with the other templates and is not used here.
 */
#define _philoxNxW_tpl(N, Nhalf, W, T) \
 \
enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
typedef philox##N##x##W##_key_t philox##N##x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { \
    return uk; \
} \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
    R123_ASSERT(R<=16); \
    /* round 1 uses the key exactly as supplied */ \
    if(R>=1){ ctr = _philox##N##x##W##round(ctr, key); } \
    /* rounds 2..16: advance the key, then mix */ \
    if(R>=2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>=16){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    return ctr; \
}
00332
00333 _philox2xWbumpkey_tpl(32)
00334 _philox4xWbumpkey_tpl(32)
00335 _philox2xWround_tpl(32, uint32_t)
00336 _philox4xWround_tpl(32, uint32_t)
00338 _philoxNxW_tpl(2, 1, 32, uint32_t)
00339 _philoxNxW_tpl(4, 2, 32, uint32_t)
00340 #if R123_USE_PHILOX_64BIT
00341
00342 _philox2xWbumpkey_tpl(64)
00343 _philox4xWbumpkey_tpl(64)
00344 _philox2xWround_tpl(64, uint64_t)
00345 _philox4xWround_tpl(64, uint64_t)
00347 _philoxNxW_tpl(2, 1, 64, uint64_t)
00348 _philoxNxW_tpl(4, 2, 64, uint64_t)
00349 #endif
00350
/*
 * Convenience macros: philox<N>x<W>(c, k) calls philox<N>x<W>_R with the
 * philox<N>x<W>_rounds enumerator as the round count.
 */
#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)

/* 64-bit flavours exist only when 64-bit Philox is enabled. */
#if R123_USE_PHILOX_64BIT
#  define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
#  define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
#endif
00357
00358 #ifdef __cplusplus
00359 #include <stdexcept>
00360
/*
 * _PhiloxNxW_base_tpl(CType, KType, N, W)
 *
 * Generates, inside namespace r123:
 *   - the class template Philox<N>x<W>_R<ROUNDS>, a function object exposing
 *     ctr_type (CType), key_type and ukey_type (both KType), the constant
 *     `rounds`, and operator()(ctr, key), which forwards to the C function
 *     philox<N>x<W>_R(ROUNDS, ctr, key);
 *   - the typedef Philox<N>x<W>, which is Philox<N>x<W>_R instantiated with
 *     the philox<N>x<W>_rounds enumerator.
 *
 * operator() statically asserts ROUNDS <= 16.
 */
#define _PhiloxNxW_base_tpl(CType, KType, N, W) \
namespace r123{ \
template<unsigned int ROUNDS> \
struct Philox##N##x##W##_R{ \
    typedef CType ctr_type; \
    typedef KType key_type; \
    typedef KType ukey_type; \
    static const unsigned int rounds=ROUNDS; \
    inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
        R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
        return philox##N##x##W##_R(ROUNDS, ctr, key); \
    } \
}; \
typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
} /* namespace r123 */
00378
00380 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32)
00381 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32)
00382 #if R123_USE_PHILOX_64BIT
00383 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64)
00384 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64)
00385 #endif
00386
00387
00388
00389
00484 #endif
00485
00486 #endif