00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #ifndef _Random123_sse_dot_h__
00033 #define _Random123_sse_dot_h__
00034
00035 #if R123_USE_SSE
00036
00037 #if R123_USE_X86INTRIN_H
00038 #include <x86intrin.h>
00039 #endif
00040 #if R123_USE_IA32INTRIN_H
00041 #include <ia32intrin.h>
00042 #endif
00043 #if R123_USE_EMMINTRIN_H
00044 #include <emmintrin.h>
00045 #endif
00046 #if R123_USE_SMMINTRIN_H
00047 #include <smmintrin.h>
00048 #endif
00049 #if R123_USE_WMMINTRIN_H
00050 #include <wmmintrin.h>
00051 #endif
00052 #if R123_USE_INTRIN_H
00053 #include <intrin.h>
00054 #endif
00055 #ifdef __cplusplus
00056 #include <iostream>
00057 #include <limits>
00058 #include <stdexcept>
00059 #endif
00060
00061 #if R123_USE_ASM_GNU
00062
00063
00064 R123_STATIC_INLINE int haveAESNI(){
00065 unsigned int eax, ebx, ecx, edx;
00066 __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
00067 "a" (1));
00068 return (ecx>>25) & 1;
00069 }
00070 #elif R123_USE_CPUID_MSVC
00071 R123_STATIC_INLINE int haveAESNI(){
00072 int CPUInfo[4];
00073 __cpuid(CPUInfo, 1);
00074 return (CPUInfo[2]>>25)&1;
00075 }
00076 #else
00077 #warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
00078 R123_STATIC_INLINE int haveAESNI(){
00079 return 0;
00080 }
00081 #endif
00082
00083
00084
00085
00086
00087
00088
00089
00090 #if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
00091
00092
00093
00094
00095 R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
00096 union{
00097 uint64_t u64;
00098 uint32_t u32[2];
00099 } u1, u0;
00100 u1.u64 = v1;
00101 u0.u64 = v0;
00102 return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
00103 }
00104 #endif
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115 #if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
00116 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00117 union{
00118 uint64_t u64[2];
00119 __m128i m;
00120 }u;
00121 _mm_store_si128(&u.m, si);
00122 return u.u64[0];
00123 }
00124 #elif defined(__llvm__) || defined(__ICC)
00125 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00126 return (uint64_t)_mm_cvtsi128_si64(si);
00127 }
00128 #else
00129
00130
00131
00132 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00133 return (uint64_t)_mm_cvtsi128_si64x(si);
00134 }
00135 #endif
00136 #if defined(__GNUC__) && __GNUC__ < 4
00137
00138 R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
00139 return (__m128)si;
00140 }
00141 #endif
00142
00143 #ifdef __cplusplus
00144
00145 struct r123m128i{
00146 __m128i m;
00147 #if R123_USE_CXX0X
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157 r123m128i() = default;
00158 r123m128i(__m128i _m): m(_m){}
00159 #endif
00160 r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
00161 r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
00162 #if R123_USE_CXX0X
00163
00164
00165
00166 explicit operator bool() const {return _bool();}
00167 #else
00168
00169
00170 operator const void*() const{return _bool()?this:0;}
00171 #endif
00172 operator __m128i() const {return m;}
00173
00174 private:
00175 #if R123_USE_SSE4_1
00176 bool _bool() const{ return !_mm_testz_si128(m,m); }
00177 #else
00178 bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
00179 #endif
00180 };
00181
00182 R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
00183 __m128i& c = v.m;
00184 __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
00185 c = _mm_add_epi64(c, zeroone);
00186
00187 #if R123_USE_SSE4_1
00188 __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
00189 if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
00190 __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
00191 c = _mm_add_epi64(c, onezero);
00192 }
00193 #else
00194 unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
00195
00196
00197 if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
00198 __m128i onezero = _mm_set_epi64x(1,0);
00199 c = _mm_add_epi64(c, onezero);
00200 }
00201 #endif
00202 return v;
00203 }
00204
00205 R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
00206 __m128i c = lhs.m;
00207 __m128i incr128 = _mm_set_epi64x(0, n);
00208 c = _mm_add_epi64(c, incr128);
00209
00210
00211 int64_t lo64 = _mm_extract_lo64(c);
00212 if((uint64_t)lo64 < n)
00213 c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
00214 lhs.m = c;
00215 return lhs;
00216 }
00217
00218
00219 R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG lhs, const r123m128i &rhs){
00220 throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
00221
00222
00223
00224
00225 R123_STATIC_INLINE bool operator<(const r123m128i& lhs, const r123m128i& rhs){
00226 throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
00227 R123_STATIC_INLINE bool operator<=(const r123m128i& lhs, const r123m128i& rhs){
00228 throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
00229 R123_STATIC_INLINE bool operator>(const r123m128i& lhs, const r123m128i& rhs){
00230 throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
00231 R123_STATIC_INLINE bool operator>=(const r123m128i& lhs, const r123m128i& rhs){
00232 throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
00233
00234 R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
00235 return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
00236 R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
00237 return !(lhs==rhs);}
00238 R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
00239 r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
00240 R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
00241 return !(lhs==rhs);}
00242 R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
00243 union{
00244 uint64_t u64[2];
00245 __m128i m;
00246 }u;
00247 _mm_storeu_si128(&u.m, m.m);
00248 return os << u.u64[0] << " " << u.u64[1];
00249 }
00250
00251 R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
00252 uint64_t u64[2];
00253 is >> u64[0] >> u64[1];
00254 m.m = _mm_set_epi64x(u64[1], u64[0]);
00255 return is;
00256 }
00257
00258 template<typename T> inline T assemble_from_u32(uint32_t *p32);
00259
00260 template <>
00261 inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
00262 r123m128i ret;
00263 ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
00264 return ret;
00265 }
00266
00267 #else
00268
00269 typedef struct {
00270 __m128i m;
00271 } r123m128i;
00272
00273 #endif
00274
00275 #else
00276 R123_STATIC_INLINE int haveAESNI(){
00277 return 0;
00278 }
00279 #endif
00280
00281 #endif