00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #ifndef _Random123_sse_dot_h__
00033 #define _Random123_sse_dot_h__
00034
00035 #if R123_USE_SSE
00036
00037 #if R123_USE_X86INTRIN_H
00038 #include <x86intrin.h>
00039 #endif
00040 #if R123_USE_IA32INTRIN_H
00041 #include <ia32intrin.h>
00042 #endif
00043 #if R123_USE_EMMINTRIN_H
00044 #include <emmintrin.h>
00045 #endif
00046 #if R123_USE_SMMINTRIN_H
00047 #include <smmintrin.h>
00048 #endif
00049 #if R123_USE_WMMINTRIN_H
00050 #include <wmmintrin.h>
00051 #endif
00052 #if R123_USE_INTRIN_H
00053 #include <intrin.h>
00054 #endif
00055 #ifdef __cplusplus
00056 #include <iostream>
00057 #include <limits>
00058 #include <stdexcept>
00059 #endif
00060
00061 #if R123_USE_ASM_GNU
00062
00063
00064 R123_STATIC_INLINE int haveAESNI(){
00065 unsigned int eax, ebx, ecx, edx;
00066 __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
00067 "a" (1));
00068 return (ecx>>25) & 1;
00069 }
00070 #elif R123_USE_CPUID_MSVC
00071 R123_STATIC_INLINE int haveAESNI(){
00072 int CPUInfo[4];
00073 __cpuid(CPUInfo, 1);
00074 return (CPUInfo[2]>>25)&1;
00075 }
00076 #else
00077 #warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
00078 R123_STATIC_INLINE int haveAESNI(){
00079 return 0;
00080 }
00081 #endif
00082
00083
00084
00085
00086
00087
00088
00089
00090 #if defined(__ICC) || (defined(_MSC_VER) && !defined(_WIN64))
00091
00092
00093 R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
00094 union{
00095 uint64_t u64;
00096 uint32_t u32[2];
00097 } u1, u0;
00098 u1.u64 = v1;
00099 u0.u64 = v0;
00100 return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
00101 }
00102 #endif
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113 #if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
00114 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00115 union{
00116 uint64_t u64[2];
00117 __m128i m;
00118 }u;
00119 _mm_store_si128(&u.m, si);
00120 return u.u64[0];
00121 }
00122 #elif defined(__llvm__) || defined(__ICC)
00123 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00124 return (uint64_t)_mm_cvtsi128_si64(si);
00125 }
00126 #else
00127
00128
00129
00130 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00131 return (uint64_t)_mm_cvtsi128_si64x(si);
00132 }
00133 #endif
00134 #if defined(__GNUC__) && __GNUC__ < 4
00135
00136 R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
00137 return (__m128)si;
00138 }
00139 #endif
00140
00141 #ifdef __cplusplus
00142
00143 struct r123m128i{
00144 __m128i m;
00145 #if R123_USE_CXX0X
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155 r123m128i() = default;
00156 r123m128i(__m128i _m): m(_m){}
00157 #endif
00158 r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
00159 #if R123_USE_CXX0X
00160
00161
00162
00163 explicit operator bool() const {return _bool();}
00164 #else
00165
00166
00167 operator const void*() const{return _bool()?this:0;}
00168 #endif
00169 operator __m128i() const {return m;}
00170
00171 private:
00172 #if R123_USE_SSE4_1
00173 bool _bool() const{ return !_mm_testz_si128(m,m); }
00174 #else
00175 bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
00176 #endif
00177 };
00178
00179 R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
00180 __m128i& c = v.m;
00181 __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
00182 c = _mm_add_epi64(c, zeroone);
00183
00184 #if R123_USE_SSE4_1
00185 __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
00186 if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
00187 __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
00188 c = _mm_add_epi64(c, onezero);
00189 }
00190 #else
00191 unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
00192
00193
00194 if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
00195 __m128i onezero = _mm_set_epi64x(1,0);
00196 c = _mm_add_epi64(c, onezero);
00197 }
00198 #endif
00199 return v;
00200 }
00201
00202 R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
00203 __m128i c = lhs.m;
00204 __m128i incr128 = _mm_set_epi64x(0, n);
00205 c = _mm_add_epi64(c, incr128);
00206
00207
00208 int64_t lo64 = _mm_extract_lo64(c);
00209 if((uint64_t)lo64 < n)
00210 c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
00211 lhs.m = c;
00212 return lhs;
00213 }
00214
00215
00216 R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG lhs, const r123m128i &rhs){
00217 throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
00218
00219
00220
00221
00222 R123_STATIC_INLINE bool operator<(const r123m128i& lhs, const r123m128i& rhs){
00223 throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
00224 R123_STATIC_INLINE bool operator<=(const r123m128i& lhs, const r123m128i& rhs){
00225 throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
00226 R123_STATIC_INLINE bool operator>(const r123m128i& lhs, const r123m128i& rhs){
00227 throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
00228 R123_STATIC_INLINE bool operator>=(const r123m128i& lhs, const r123m128i& rhs){
00229 throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
00230
00231 R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
00232 return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
00233 R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
00234 return !(lhs==rhs);}
00235 R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
00236 r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
00237 R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
00238 return !(lhs==rhs);}
00239 R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
00240 union{
00241 uint64_t u64[2];
00242 __m128i m;
00243 }u;
00244 _mm_storeu_si128(&u.m, m.m);
00245 return os << u.u64[0] << " " << u.u64[1];
00246 }
00247
00248 R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
00249 uint64_t u64[2];
00250 is >> u64[0] >> u64[1];
00251 m.m = _mm_set_epi64x(u64[1], u64[0]);
00252 return is;
00253 }
00254
00255 template<typename T> inline T assemble_from_u32(uint32_t *p32);
00256
00257 template <>
00258 inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
00259 r123m128i ret;
00260 ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
00261 return ret;
00262 }
00263
00264 #else
00265
00266 typedef struct {
00267 __m128i m;
00268 } r123m128i;
00269
00270 #endif
00271
00272 #else
00273 R123_STATIC_INLINE int haveAESNI(){
00274 return 0;
00275 }
00276 #endif
00277
00278 #endif