32 #ifndef _philox_dot_h_
33 #define _philox_dot_h_
/*
 * Define mulhilo##W using a double-width integer type.
 *   W     - bit width of Word
 *   Word  - unsigned type holding one W-bit operand
 *   Dword - unsigned type wide enough to hold the full 2*W-bit product
 * The generated function returns the low W bits of a*b and stores the
 * high W bits through hip.
 */
#define _mulhilo_dword_tpl(W, Word, Dword)                              \
R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    Dword product = ((Dword)a)*((Dword)b);                              \
    *hip = (Word)(product>>W);   /* high half of the product */         \
    return (Word)product;        /* low half of the product */          \
}
/*
 * _mulhilo_asm_tpl(W, Word, INSN): generates mulhilo##W(ax, b, hip) from an
 * inline-assembly multiply. INSN is a string literal holding the instruction
 * mnemonic; it is concatenated in front of the " %0,%1,%2" operand list.
 * NOTE(review): only part of this definition is visible in this listing, and
 * the same macro name is defined again directly below. Presumably the two
 * definitions are alternatives selected by a preprocessor conditional --
 * confirm against the complete file before editing.
 */
81 #define _mulhilo_asm_tpl(W, Word, INSN) \
82 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
85 INSN " %0,%1,%2\n\t" \
/*
 * Alternative _mulhilo_asm_tpl(W, Word, INSN): same generated signature as
 * the definition above. The visible output-operand list binds ax to the "=a"
 * constraint and a local named dx to the "=d" constraint.
 * NOTE(review): "a" and "d" look like the GCC x86 accumulator/data register
 * constraints, which would make this the x86 widening-multiply variant --
 * confirm. The asm template, input operands, clobbers and the store through
 * hip are not visible in this listing.
 */
93 #define _mulhilo_asm_tpl(W, Word, INSN) \
94 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
98 : "=a"(ax), "=d"(dx) \
/*
 * Define mulhilo##W as a thin wrapper around an intrinsic with the calling
 * convention INTRIN(a, b, hip): the intrinsic's return value is returned
 * unchanged and hip is forwarded to it.
 *   W      - bit width of Word
 *   Word   - unsigned operand type
 *   INTRIN - intrinsic to call (this file passes _umul128)
 */
#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN)               \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){  \
    return INTRIN(a, b, hip);                                   \
}
/*
 * Define mulhilo##W on top of a "multiply high" intrinsic.
 *   W      - bit width of Word
 *   Word   - unsigned operand type
 *   INTRIN - two-argument intrinsic whose result is stored through hip as
 *            the high word (this file passes __umul64hi, mul_hi, or
 *            R123_MULHILO64_MULHI_INTRIN)
 * The low word is obtained with an ordinary W-bit multiply, which wraps
 * modulo 2^W for unsigned Word.
 */
#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN)                       \
R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    *hip = INTRIN(a, b);                                                \
    return a*b;                                                         \
}
/*
 * Define mulhilo##W in portable C using only W-bit unsigned arithmetic.
 *   W    - bit width of Word (must be even)
 *   Word - unsigned operand type
 * Each operand is split into half-words (a = ahi:alo, b = bhi:blo), so
 *   a*b = ahi*bhi * 2^W + (ahi*blo + alo*bhi) * 2^(W/2) + alo*blo.
 * The high word is ahi*bhi plus the upper halves of the two cross terms
 * plus the two carries that can propagate out of the lower W bits.
 * Returns the low W bits of the product and stores the high W bits
 * through hip.
 */
#define _mulhilo_c99_tpl(W, Word)                                       \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){          \
    const unsigned WHALF = W/2;                                         \
    const Word LOMASK = ((((Word)1)<<WHALF)-1);                         \
    Word lo = a*b;               /* low W bits of the full product */   \
    Word ahi = a>>WHALF;                                                \
    Word alo = a& LOMASK;                                               \
    Word bhi = b>>WHALF;                                                \
    Word blo = b& LOMASK;                                               \
                                                                        \
    Word ahbl = ahi*blo;                                                \
    Word albh = alo*bhi;                                                \
                                                                        \
    Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK));                   \
    Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF);                  \
    /* carry out of lo(ahbl) + lo(albh) */                              \
    hi += ahbl_albh >> WHALF;                                           \
    /* carry out of adding that sum (shifted) to alo*blo: it happened */ \
    /* iff the upper half of lo wrapped below the value added to it */  \
    hi += ((lo >> WHALF) < (ahbl_albh&LOMASK));                         \
    *hip = hi;                                                          \
    return lo;                                                          \
}
/*
 * Fallback used when no mulhilo##W implementation has been selected.
 * The generated function consists solely of R123_STATIC_ASSERT(0, ...),
 * so instantiating this template is expected to stop compilation with the
 * message below rather than produce a working function.
 *   W    - bit width named in the diagnostic
 *   Word - operand type (only used to form the signature)
 */
#define _mulhilo_fail_tpl(W, Word)                                      \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){          \
    R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
}
174 #if R123_USE_MULHILO32_ASM
176 _mulhilo_asm_tpl(32, uint32_t,
"mulhwu")
178 _mulhilo_asm_tpl(32, uint32_t,
"mull")
181 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
/*
 * Select exactly one implementation of mulhilo64 (only when 64-bit Philox is
 * enabled). The R123_USE_* feature macros are tested in priority order; if
 * none is set, _mulhilo_fail_tpl turns any use into a compile-time error.
 * NOTE(review): the __powerpc64__ test distinguishes the PowerPC mnemonic
 * ("mulhdu") from the x86-64 one ("mulq"); confirm it is consistent with the
 * condition that selects the _mulhilo_asm_tpl definition.
 */
#if R123_USE_PHILOX_64BIT
#if R123_USE_MULHILO64_ASM
#ifdef __powerpc64__
_mulhilo_asm_tpl(64, uint64_t, "mulhdu")
#else
_mulhilo_asm_tpl(64, uint64_t, "mulq")
#endif /* __powerpc64__ */
#elif R123_USE_MULHILO64_MSVC_INTRIN
_mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
#elif R123_USE_MULHILO64_CUDA_INTRIN
_mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
#elif R123_USE_MULHILO64_OPENCL_INTRIN
_mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
#elif R123_USE_MULHILO64_MULHI_INTRIN
_mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
#elif R123_USE_GNU_UINT128
_mulhilo_dword_tpl(64, uint64_t, __uint128_t)
#elif R123_USE_MULHILO64_C99
_mulhilo_c99_tpl(64, uint64_t)
#else
_mulhilo_fail_tpl(64, uint64_t)
#endif
#endif /* R123_USE_PHILOX_64BIT */
/*
 * Multipliers passed to mulhilo in each Philox round.
 * PHILOX_M<N>x<W>_<i> is the i-th multiplier of the N-word, W-bit variant.
 * Each one may be overridden by defining it before this point.
 */
#ifndef PHILOX_M2x64_0
#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
#endif

#ifndef PHILOX_M4x64_0
#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
#endif

#ifndef PHILOX_M4x64_1
#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
#endif

#ifndef PHILOX_M2x32_0
#define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
#endif

#ifndef PHILOX_M4x32_0
#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
#endif
#ifndef PHILOX_M4x32_1
#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
#endif
/*
 * Per-round key increments added by the bumpkey helpers
 * (key.v[0] += PHILOX_W<W>_0, key.v[1] += PHILOX_W<W>_1).
 * The _0 values equal floor(2^W / golden ratio); the _1 values equal
 * floor(2^W * (sqrt(3) - 1)).
 * Guarded like the multipliers so they can be overridden before this point.
 */
#ifndef PHILOX_W64_0
#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15)
#endif
#ifndef PHILOX_W64_1
#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B)
#endif

#ifndef PHILOX_W32_0
#define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
#endif
#ifndef PHILOX_W32_1
#define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
#endif
/*
 * Default round counts, used to initialise the philox<N>x<W>_rounds enum
 * constants. Each may be overridden by defining it before this point; the
 * round functions in this file accept at most 16 rounds.
 */
#ifndef PHILOX2x32_DEFAULT_ROUNDS
#define PHILOX2x32_DEFAULT_ROUNDS 10
#endif

#ifndef PHILOX2x64_DEFAULT_ROUNDS
#define PHILOX2x64_DEFAULT_ROUNDS 10
#endif

#ifndef PHILOX4x32_DEFAULT_ROUNDS
#define PHILOX4x32_DEFAULT_ROUNDS 10
#endif

#ifndef PHILOX4x64_DEFAULT_ROUNDS
#define PHILOX4x64_DEFAULT_ROUNDS 10
#endif
/*
 * Generate _philox2x<W>round: one round of the two-word Philox function.
 *   W - word width in bits; T - the matching unsigned integer type.
 * The round multiplies ctr.v[0] by PHILOX_M2x<W>_0, then returns
 *   { hi ^ key.v[0] ^ ctr.v[1], lo }
 * where hi/lo are the high and low words of that product.
 * A forward declaration wrapped in R123_FORCE_INLINE precedes the definition.
 */
#define _philox2xWround_tpl(W, T)                                       \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
    T hi;                                                               \
    T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi);                \
    struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}};           \
    return out;                                                         \
}
/*
 * Generate _philox2x<W>bumpkey: advance the one-word round key by adding
 * PHILOX_W<W>_0 to key.v[0]. The key is taken and returned by value.
 */
#define _philox2xWbumpkey_tpl(W)                                        \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
    key.v[0] += PHILOX_W##W##_0;                                        \
    return key;                                                         \
}
/*
 * Generate _philox4x<W>round: one round of the four-word Philox function.
 *   W - word width in bits; T - the matching unsigned integer type.
 * Two products are formed, PHILOX_M4x<W>_0 * ctr.v[0] -> (hi0, lo0) and
 * PHILOX_M4x<W>_1 * ctr.v[2] -> (hi1, lo1), and the result is
 *   { hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0 }.
 * A forward declaration wrapped in R123_FORCE_INLINE precedes the definition.
 */
#define _philox4xWround_tpl(W, T)                                       \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
    T hi0;                                                              \
    T hi1;                                                              \
    T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0);              \
    T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1);              \
    struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1,           \
                                  hi0^ctr.v[3]^key.v[1], lo0}};         \
    return out;                                                         \
}
/*
 * Generate _philox4x<W>bumpkey: advance the two-word round key by adding
 * PHILOX_W<W>_0 to key.v[0] and PHILOX_W<W>_1 to key.v[1].
 * The key is taken and returned by value.
 */
#define _philox4xWbumpkey_tpl(W)                                        \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
    key.v[0] += PHILOX_W##W##_0;                                        \
    key.v[1] += PHILOX_W##W##_1;                                        \
    return key;                                                         \
}
/*
 * Generate the C-level API for philox<N>x<W>.
 *   N     - number of counter words; Nhalf - number of key words
 *   W     - word width in bits;      T     - matching unsigned type
 * Produces:
 *   - enum constant philox<N>x<W>_rounds (= PHILOX<N>x<W>_DEFAULT_ROUNDS)
 *   - typedefs philox<N>x<W>_ctr_t, _key_t and _ukey_t
 *   - philox<N>x<W>keyinit(uk): returns the user key unchanged
 *   - philox<N>x<W>_R(R, ctr, key): applies R rounds to ctr and returns it.
 *     The key is bumped before every round except the first. R must not
 *     exceed 16 (checked with R123_ASSERT); the rounds are written out one
 *     per line rather than as a loop.
 */
#define _philoxNxW_tpl(N, Nhalf, W, T)                                  \
enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
typedef struct r123array##N##x##W philox##N##x##W##_ctr_t;              \
typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t;          \
typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t;         \
R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
    R123_ASSERT(R<=16);                                                 \
    if(R>0){                                       ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    return ctr;                                                         \
}
333 _philox2xWbumpkey_tpl(32)
334 _philox4xWbumpkey_tpl(32)
335 _philox2xWround_tpl(32, uint32_t)
336 _philox4xWround_tpl(32, uint32_t)
339 _philoxNxW_tpl(4, 2, 32, uint32_t)
340 #if R123_USE_PHILOX_64BIT
342 _philox2xWbumpkey_tpl(64)
343 _philox4xWbumpkey_tpl(64)
344 _philox2xWround_tpl(64, uint64_t)
345 _philox4xWround_tpl(64, uint64_t)
347 _philoxNxW_tpl(2, 1, 64, uint64_t)
348 _philoxNxW_tpl(4, 2, 64, uint64_t)
/*
 * Convenience macros: philox<N>x<W>(c, k) calls philox<N>x<W>_R with the
 * default round count philox<N>x<W>_rounds. The 64-bit forms exist only
 * when R123_USE_PHILOX_64BIT is set.
 */
#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
#if R123_USE_PHILOX_64BIT
#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
#endif /* R123_USE_PHILOX_64BIT */
/*
 * _PhiloxNxW_base_tpl(CType, KType, N, W): generates the C++ functor wrapper
 * for philox<N>x<W>.
 *   - Philox<N>x<W>_R<ROUNDS> exposes ctr_type (= CType), key_type and
 *     ukey_type (both = KType) and a static constant `rounds` (= ROUNDS).
 *   - operator()(ctr, key) statically asserts ROUNDS <= 16 and forwards to
 *     the C function philox<N>x<W>_R(ROUNDS, ctr, key).
 *   - Philox<N>x<W> is a typedef for Philox<N>x<W>_R<philox<N>x<W>_rounds>,
 *     i.e. the default round count.
 * NOTE(review): the line between the #define and `template<...>` and the
 * closing lines of operator() and of the struct are not visible in this
 * listing; check the complete file for any enclosing scope (e.g. a
 * namespace) before editing this macro.
 */
363 #define _PhiloxNxW_base_tpl(CType, KType, N, W) \
365 template<unsigned int ROUNDS> \
366 struct Philox##N##x##W##_R{ \
367 typedef CType ctr_type; \
368 typedef KType key_type; \
369 typedef KType ukey_type; \
370 static const unsigned int rounds=ROUNDS; \
371 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
372 R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
373 return philox##N##x##W##_R(ROUNDS, ctr, key); \
376 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
382 #if R123_USE_PHILOX_64BIT
r123array2x32
Definition: philox.h:381
_PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) _PhiloxNxW_base_tpl(r123array4x32
r123array2x64
Definition: philox.h:384
_philoxNxW_tpl(2, 1, 32, uint32_t) _philoxNxW_tpl(4