00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #ifndef _philox_dot_h_
00033 #define _philox_dot_h_
00034
00037 #include "features/compilerfeatures.h"
00038 #include "array.h"
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
/*
 * _mulhilo_dword_tpl(W, Word, Dword)
 *
 * Expands to the definition of mulhilo##W(a, b, hip), implemented with a
 * multiplication carried out in the wider type Dword.
 *   a, b  - the two Word operands.
 *   hip   - output pointer; receives the product shifted right by W bits.
 *   return  the product converted back to Word.
 * Assumes Dword can hold the full product of two Word values (it is
 * instantiated in this file as uint32_t/uint64_t and uint64_t/__uint128_t).
 */
00067 #define _mulhilo_dword_tpl(W, Word, Dword) \
00068 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
00069 Dword product = ((Dword)a)*((Dword)b); /* cast both operands first so the multiply is done in Dword */ \
00070 *hip = product>>W; /* upper part of the product */ \
00071 return (Word)product; /* lower part of the product */ \
00072 }
00073
00074
00075
00076
00077
00078
00079
/*
 * _mulhilo_asm_tpl(W, Word, INSN)
 *
 * Expands to the definition of mulhilo##W(ax, b, hip), implemented with a
 * single inline-assembly instruction INSN (instantiated in this file with
 * "mull" and "mulq").
 *   ax, b - the two Word operands; ax is also reused as output operand 0.
 *   hip   - output pointer; receives the value left in the "d" operand.
 *   return  the value left in the "a" operand.
 * NOTE(review): the "a"/"d" constraints and the one-operand multiply form
 * presumably target x86 -- confirm the R123_USE_MULHILO*_ASM feature tests
 * only enable this on such targets.
 */
00080 #define _mulhilo_asm_tpl(W, Word, INSN) \
00081 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
00082 Word dx; \
00083 __asm__("\n\t" \
00084 INSN " %2\n\t" \
00085 : "=a"(ax), "=d"(dx) \
00086 : "r"(b), "0"(ax) \
00087 ); \
00088 *hip = dx; /* operand %2 above is b; "0"(ax) ties the ax input to output operand 0 */ \
00089 return ax; \
00090 }
00091
00092
00093
00094
00095
00096
/*
 * _mulhilo_msvc_intrin_tpl(W, Word, INTRIN)
 *
 * Expands to the definition of mulhilo##W(a, b, hip) as a thin wrapper that
 * forwards all three arguments to INTRIN and returns its result
 * (instantiated in this file with _umul128). INTRIN is expected to write the
 * high part of the product through hip -- confirm against the intrinsic's
 * documentation.
 */
00097 #define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
00098 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
00099 return INTRIN(a, b, hip); \
00100 }
00101
/*
 * _mulhilo_cuda_intrin_tpl(W, Word, INTRIN)
 *
 * Expands to the definition of mulhilo##W(a, b, hip) for platforms that
 * offer a two-argument intrinsic INTRIN(a, b) (instantiated in this file
 * with __umul64hi and mul_hi).
 *   hip   - output pointer; receives INTRIN(a, b).
 *   return  a*b evaluated in Word.
 */
00102 #define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
00103 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
00104 *hip = INTRIN(a, b); \
00105 return a*b; \
00106 }
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
/*
 * _mulhilo_c99_tpl(W, Word)
 *
 * Expands to the definition of mulhilo##W(a, b, hip) using only Word-sized
 * arithmetic: each operand is split into two W/2-bit halves and the high
 * word of the product is assembled from the partial products plus carries.
 *   a, b  - the two Word operands.
 *   hip   - output pointer; receives the assembled high word.
 *   return  a*b evaluated in Word (the low word).
 * Assumes Word is an unsigned type of exactly W bits -- the only
 * instantiation in this file is (64, uint64_t).
 */
00121 #define _mulhilo_c99_tpl(W, Word) \
00122 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
00123 const unsigned WHALF = W/2; \
00124 const Word LOMASK = ((((Word)1)<<WHALF)-1); /* mask selecting the low WHALF bits */ \
00125 Word lo = a*b; /* low word of the product */ \
00126 Word ahi = a>>WHALF; \
00127 Word alo = a& LOMASK; \
00128 Word bhi = b>>WHALF; \
00129 Word blo = b& LOMASK; \
00130 \
00131 Word ahbl = ahi*blo; /* first cross term */ \
00132 Word albh = alo*bhi; /* second cross term */ \
00133 \
00134 Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); /* sum of the low halves of the cross terms */ \
00135 Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \
00136 hi += ahbl_albh >> WHALF; /* carry out of the low-half sum of the cross terms */ \
00137 \
00138 hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); /* carry from adding that sum to alo*blo: the upper half of lo wrapped below the addend */ \
00139 *hip = hi; \
00140 return lo; \
00141 }
00142
00143
00144
00145
00146
00147
/*
 * _mulhilo_fail_tpl(W, Word)
 *
 * Expands to a definition of mulhilo##W whose body is only
 * R123_STATIC_ASSERT(0, ...). It is the last-resort branch of the
 * implementation selection in this file, used when no other mulhilo
 * implementation is enabled. The function body has no return statement.
 * NOTE(review): presumably R123_STATIC_ASSERT turns the constant-false
 * condition into a build failure -- its definition is not in this file.
 */
00148 #define _mulhilo_fail_tpl(W, Word) \
00149 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
00150 R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
00151 }
00152
00153
00154
00155
00156
00157
/*
 * Define mulhilo32: the inline-asm version when R123_USE_MULHILO32_ASM is
 * non-zero, otherwise the version that multiplies in uint64_t.
 */
00158 #if R123_USE_MULHILO32_ASM
00159 _mulhilo_asm_tpl(32, uint32_t, "mull")
00160 #else
00161 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
00162 #endif
00163
/*
 * Define mulhilo64, only when R123_USE_PHILOX_64BIT is non-zero. The first
 * enabled R123_USE_* feature macro in the chain below picks the
 * implementation; if none is enabled, the failing template is used.
 * The OpenCL branch reuses the two-argument intrinsic template with mul_hi.
 */
00164 #if R123_USE_PHILOX_64BIT
00165 #if R123_USE_MULHILO64_ASM
00166 _mulhilo_asm_tpl(64, uint64_t, "mulq")
00167 #elif R123_USE_MULHILO64_MSVC_INTRIN
00168 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
00169 #elif R123_USE_MULHILO64_CUDA_INTRIN
00170 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
00171 #elif R123_USE_MULHILO64_OPENCL_INTRIN
00172 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
00173 #elif R123_USE_GNU_UINT128
00174 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
00175 #elif R123_USE_MULHILO64_C99
00176 _mulhilo_c99_tpl(64, uint64_t)
00177 #else
00178 _mulhilo_fail_tpl(64, uint64_t)
00179 #endif
00180 #endif
00181
00182
00183
00184
00185
00186
00187
00188
00189
/*
 * Multiplier constants. PHILOX_M<N>x<W>_<i> is the first argument passed to
 * mulhilo<W> by the philox<N>x<W> round templates in this file.
 * Each definition is guarded by #ifndef, so a value defined before this
 * point takes precedence.
 */
00190 #ifndef PHILOX_M2x64_0
00191 #define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
00192 #endif
00193
00194 #ifndef PHILOX_M4x64_0
00195 #define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
00196 #endif
00197
00198 #ifndef PHILOX_M4x64_1
00199 #define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
00200 #endif
00201
00202 #ifndef PHILOX_M2x32_0
00203 #define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
00204 #endif
00205
00206 #ifndef PHILOX_M4x32_0
00207 #define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
00208 #endif
00209 #ifndef PHILOX_M4x32_1
00210 #define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
00211 #endif
00212
/*
 * Key increment constants. PHILOX_W<W>_0 is added to key.v[0] and
 * PHILOX_W<W>_1 to key.v[1] by the bumpkey templates in this file.
 * Each definition is guarded by #ifndef, so a value defined before this
 * point takes precedence.
 */
00213 #ifndef PHILOX_W64_0
00214 #define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15)
00215 #endif
00216 #ifndef PHILOX_W64_1
00217 #define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B)
00218 #endif
00219
00220 #ifndef PHILOX_W32_0
00221 #define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
00222 #endif
00223 #ifndef PHILOX_W32_1
00224 #define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
00225 #endif
00226
/*
 * Default round counts. PHILOX<N>x<W>_DEFAULT_ROUNDS becomes the value of
 * the philox<N>x<W>_rounds enumerator, which the philox<N>x<W>(c,k)
 * convenience macros pass as the round count.
 * Each definition is guarded by #ifndef, so a value defined before this
 * point takes precedence; the unrolled round function in this file checks
 * that the round count does not exceed 12.
 */
00227 #ifndef PHILOX2x32_DEFAULT_ROUNDS
00228 #define PHILOX2x32_DEFAULT_ROUNDS 10
00229 #endif
00230
00231 #ifndef PHILOX2x64_DEFAULT_ROUNDS
00232 #define PHILOX2x64_DEFAULT_ROUNDS 10
00233 #endif
00234
00235 #ifndef PHILOX4x32_DEFAULT_ROUNDS
00236 #define PHILOX4x32_DEFAULT_ROUNDS 10
00237 #endif
00238
00239 #ifndef PHILOX4x64_DEFAULT_ROUNDS
00240 #define PHILOX4x64_DEFAULT_ROUNDS 10
00241 #endif
00242
00243
00244
/*
 * _philox2xWround_tpl(W, T)
 *
 * Expands to a prototype (wrapped in R123_FORCE_INLINE) and the definition
 * of _philox2x##W##round(ctr, key): one round on a 2-word counter.
 *   ctr   - 2-word input state.
 *   key   - 1-word round key.
 *   return  { hi ^ key.v[0] ^ ctr.v[1], lo }, where (hi, lo) is the result
 *           of mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0]).
 * T is the word type used for the mulhilo results.
 */
00245 #define _philox2xWround_tpl(W, T) \
00246 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
00247 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
00248 T hi; \
00249 T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \
00250 struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \
00251 return out; \
00252 }
/*
 * _philox2xWbumpkey_tpl(W)
 *
 * Expands to the definition of _philox2x##W##bumpkey(key), which returns a
 * copy of the 1-word key with PHILOX_W##W##_0 added to key.v[0]. The key is
 * passed by value, so the caller's key is not modified.
 */
00253 #define _philox2xWbumpkey_tpl(W) \
00254 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
00255 key.v[0] += PHILOX_W##W##_0; \
00256 return key; \
00257 }
00258
/*
 * _philox4xWround_tpl(W, T)
 *
 * Expands to a prototype (wrapped in R123_FORCE_INLINE) and the definition
 * of _philox4x##W##round(ctr, key): one round on a 4-word counter.
 *   ctr   - 4-word input state.
 *   key   - 2-word round key.
 *   return  { hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0 }
 *           where (hi0, lo0) = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0]) and
 *                 (hi1, lo1) = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2]).
 * T is the word type used for the mulhilo results.
 */
00259 #define _philox4xWround_tpl(W, T) \
00260 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
00261 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
00262 T hi0; \
00263 T hi1; \
00264 T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \
00265 T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \
00266 struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \
00267 hi0^ctr.v[3]^key.v[1], lo0}}; /* the two products swap halves of the state: hi1/lo1 go first, hi0/lo0 second */ \
00268 return out; \
00269 }
00270
/*
 * _philox4xWbumpkey_tpl(W)
 *
 * Expands to the definition of _philox4x##W##bumpkey(key), which returns a
 * copy of the 2-word key with PHILOX_W##W##_0 added to key.v[0] and
 * PHILOX_W##W##_1 added to key.v[1]. The key is passed by value, so the
 * caller's key is not modified.
 */
00271 #define _philox4xWbumpkey_tpl(W) \
00272 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
00273 key.v[0] += PHILOX_W##W##_0; \
00274 key.v[1] += PHILOX_W##W##_1; \
00275 return key; \
00276 }
00277
/*
 * _philoxNxW_tpl(N, Nhalf, W, T)
 *
 * Expands to the C interface of one Philox variant:
 *   - enumerator philox<N>x<W>_rounds (= PHILOX<N>x<W>_DEFAULT_ROUNDS);
 *   - typedefs philox<N>x<W>_ctr_t (N words) and philox<N>x<W>_key_t /
 *     philox<N>x<W>_ukey_t (Nhalf words each);
 *   - philox<N>x<W>keyinit(uk): returns uk unchanged;
 *   - philox<N>x<W>_R(R, ctr, key): applies R rounds to ctr and returns it.
 *     The first round uses key as given; before every later round the key is
 *     advanced with _philox<N>x<W>bumpkey. R123_ASSERT checks R<=12, which
 *     is the number of rounds unrolled below.
 * keyinit carries R123_CUDA_DEVICE like every other function generated by
 * the Philox templates in this file, so it has the same qualifier as _R.
 * The parameter T is not referenced in the expansion.
 */
00278 #define _philoxNxW_tpl(N, Nhalf, W, T) \
00279 \
00280 enum { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
00281 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
00282 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
00283 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
00284 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
00285 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
00286 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
00287 R123_ASSERT(R<=12); \
00288 if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } /* first round: key used as passed in */ \
00289 if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } /* later rounds: bump the key, then round */ \
00290 if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00291 if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00292 if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00293 if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00294 if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00295 if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00296 if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00297 if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00298 if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00299 if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00300 return ctr; \
00301 }
00302
/*
 * Instantiate the bumpkey, round and interface templates for the 32-bit
 * variants (philox2x32, philox4x32). The bumpkey and round templates are
 * expanded first because the functions generated by _philoxNxW_tpl call them.
 */
00303 _philox2xWbumpkey_tpl(32)
00304 _philox4xWbumpkey_tpl(32)
00305 _philox2xWround_tpl(32, uint32_t)
00306 _philox4xWround_tpl(32, uint32_t)
00308 _philoxNxW_tpl(2, 1, 32, uint32_t)
00309 _philoxNxW_tpl(4, 2, 32, uint32_t)
/* Same set for the 64-bit variants, only when R123_USE_PHILOX_64BIT is non-zero. */
00310 #if R123_USE_PHILOX_64BIT
00311
00312 _philox2xWbumpkey_tpl(64)
00313 _philox4xWbumpkey_tpl(64)
00314 _philox2xWround_tpl(64, uint64_t)
00315 _philox4xWround_tpl(64, uint64_t)
00317 _philoxNxW_tpl(2, 1, 64, uint64_t)
00318 _philoxNxW_tpl(4, 2, 64, uint64_t)
00319 #endif
00320
/*
 * Convenience macros: philox<N>x<W>(c,k) calls philox<N>x<W>_R with the
 * default round count philox<N>x<W>_rounds, counter c and key k.
 * The 64-bit forms exist only when R123_USE_PHILOX_64BIT is non-zero.
 */
00321 #define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
00322 #define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
00323 #if R123_USE_PHILOX_64BIT
00324 #define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
00325 #define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
00326 #endif
00327
00328 #ifdef __cplusplus
00329 #include <stdexcept>
00330
/*
 * _PhiloxNxW_base_tpl(CType, KType, N, W)   (C++ only)
 *
 * Expands, inside namespace r123, to:
 *   - class template Philox<N>x<W>_R<ROUNDS> with member typedefs ctr_type
 *     (CType), key_type and ukey_type (both KType), a static constant
 *     `rounds` equal to ROUNDS, and a const operator()(ctr, key) that
 *     returns philox<N>x<W>_R(ROUNDS, ctr, key);
 *   - typedef Philox<N>x<W> for the instantiation with the default round
 *     count philox<N>x<W>_rounds.
 * R123_STATIC_ASSERT checks ROUNDS<=12 inside operator(), matching the
 * limit checked by the C round function in this file.
 */
00333 #define _PhiloxNxW_base_tpl(CType, KType, N, W) \
00334 namespace r123{ \
00335 template<unsigned int ROUNDS> \
00336 struct Philox##N##x##W##_R{ \
00337 typedef CType ctr_type; \
00338 typedef KType key_type; \
00339 typedef KType ukey_type; \
00340 static const unsigned int rounds=ROUNDS; \
00341 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
00342 R123_STATIC_ASSERT(ROUNDS<=12, "philox is only unrolled up to 12 rounds\n"); \
00343 return philox##N##x##W##_R(ROUNDS, ctr, key); \
00344 } \
00345 }; \
00346 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
00347 } // namespace r123
00348
/*
 * Instantiate the C++ wrappers r123::Philox2x32 and r123::Philox4x32
 * (counter type, key type, N, W); the 64-bit wrappers are added only when
 * R123_USE_PHILOX_64BIT is non-zero.
 */
00350 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32)
00351 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32)
00352 #if R123_USE_PHILOX_64BIT
00353 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64)
00354 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64)
00355 #endif
00356
00357
00358
00359
00451 #endif
00452
00453 #endif