我有以下代码(普通,SSE和AVX):
- int testSSE(const aligned_vector & ghs,const aligned_vector & lhs) {
- int result[4] __attribute__((aligned(16))) = {0};
- __m128i vresult = _mm_set1_epi32(0);
- __m128i v1,v2,vmax;
- for (int k = 0; k < ghs.size(); k += 4) {
- v1 = _mm_load_si128((__m128i *) & lhs[k]);
- v2 = _mm_load_si128((__m128i *) & ghs[k]);
- vmax = _mm_add_epi32(v1,v2);
- vresult = _mm_max_epi32(vresult,vmax);
- }
- _mm_store_si128((__m128i *) result,vresult);
- int mymax = result[0];
- for (int k = 1; k < 4; k++) {
- if (result[k] > mymax) {
- mymax = result[k];
- }
- }
- return mymax;
- }
- int testAVX(const aligned_vector & ghs,const aligned_vector & lhs) {
- int result[8] __attribute__((aligned(32))) = {0};
- __m256i vresult = _mm256_set1_epi32(0);
- __m256i v1,vmax;
- for (int k = 0; k < ghs.size(); k += 8) {
- v1 = _mm256_load_si256((__m256i *) & ghs[ k]);
- v2 = _mm256_load_si256((__m256i *) & lhs[k]);
- vmax = _mm256_add_epi32(v1,v2);
- vresult = _mm256_max_epi32(vresult,vmax);
- }
- _mm256_store_si256((__m256i *) result,vresult);
- int mymax = result[0];
- for (int k = 1; k < 8; k++) {
- if (result[k] > mymax) {
- mymax = result[k];
- }
- }
- return mymax;
- }
- int testNormal(const aligned_vector & ghs,const aligned_vector & lhs) {
- int max = 0;
- int tempMax;
- for (int k = 0; k < ghs.size(); k++) {
- tempMax = lhs[k] + ghs[k];
- if (max < tempMax) {
- max = tempMax;
- }
- }
- return max;
- }
- void alignTestSSE() {
- aligned_vector lhs;
- aligned_vector ghs;
- int mySize = 4096;
- int FinalResult;
- int nofTestCases = 1000;
- double time,time1,time2,time3;
- vector<int> lhs2;
- vector<int> ghs2;
- lhs.resize(mySize);
- ghs.resize(mySize);
- lhs2.resize(mySize);
- ghs2.resize(mySize);
- srand(1);
- for (int k = 0; k < mySize; k++) {
- lhs[k] = randomNodeID(1000000);
- lhs2[k] = lhs[k];
- ghs[k] = randomNodeID(1000000);
- ghs2[k] = ghs[k];
- }
- /* Warming UP */
- for (int k = 0; k < nofTestCases; k++) {
- FinalResult = testNormal(lhs,ghs);
- }
- for (int k = 0; k < nofTestCases; k++) {
- FinalResult = testSSE(lhs,ghs);
- }
- for (int k = 0; k < nofTestCases; k++) {
- FinalResult = testAVX(lhs,ghs);
- }
- cout << "===========================" << endl;
- time = timestamp();
- for (int k = 0; k < nofTestCases; k++) {
- FinalResult = testSSE(lhs,ghs);
- }
- time = timestamp() - time;
- time1 = time;
- cout << "SSE took " << time << " s" << endl;
- cout << "SSE Result: " << FinalResult << endl;
- time = timestamp();
- for (int k = 0; k < nofTestCases; k++) {
- FinalResult = testAVX(lhs,ghs);
- }
- time = timestamp() - time;
- time3 = time;
- cout << "AVX took " << time << " s" << endl;
- cout << "AVX Result: " << FinalResult << endl;
- time = timestamp();
- for (int k = 0; k < nofTestCases; k++) {
- FinalResult = testNormal(lhs,ghs);
- }
- time = timestamp() - time;
- cout << "Normal took " << time << " s" << endl;
- cout << "Normal Result: " << FinalResult << endl;
- cout << "SpeedUP SSE= " << time / time1 << " s" << endl;
- cout << "SpeedUP AVX= " << time / time3 << " s" << endl;
- cout << "===========================" << endl;
- ghs.clear();
- lhs.clear();
- }
其中
/// Returns the current time reported by gettimeofday() as a double in
/// seconds: the whole seconds plus the microsecond field as a fraction.
inline double timestamp() {
    struct timeval now;
    gettimeofday(&now,NULL);
    const double wholeSeconds = double(now.tv_sec);
    const double fraction = now.tv_usec / 1000000.;
    return wholeSeconds + fraction;
}
和
- typedef vector<int,aligned_allocator<int,sizeof (int)> > aligned_vector;
是一个对齐的 vector,它使用了 https://gist.github.com/donny-dont/1471329 中的 AlignedAllocator。
我的机器是 Intel i7-4771(Haswell),系统是最新的 64 位 Ubuntu 14.04,编译器是 gcc 4.8.2,所有软件都是最新的。我用 -march=native -mtune=native -O3 -m64 编译。
结果是:
- SSE took 0.000375986 s
- SSE Result: 1982689
- AVX took 0.000459909 s
- AVX Result: 1982689
- Normal took 0.00315714 s
- Normal Result: 1982689
- SpeedUP SSE= 8.39696 s
- SpeedUP AVX= 6.8647 s
这表明同样的算法用 AVX2 实现比用 SSE 实现慢约 22%。是我哪里做错了,还是这属于正常现象?
解决方法
我把你的代码改写成了更接近普通 C 的形式(使用普通数组,不用 vector 等),做了一些清理,并在禁用自动向量化的情况下进行了测试,得到了合理的结果:
- #include <iostream>
- using namespace std;
- #include <sys/time.h>
- #include <cstdlib>
- #include <cstdint>
- #include <immintrin.h>
/// Returns the current time reported by gettimeofday() as a double in
/// seconds: the whole seconds plus the microsecond field as a fraction.
inline double timestamp() {
    struct timeval now;
    gettimeofday(&now, nullptr);
    const double microsAsSeconds = now.tv_usec / 1000000.;
    return double(now.tv_sec) + microsAsSeconds;
}
/// Returns the largest 32-bit sum lhs[k] + ghs[k] for k in [0, n), or 0 when
/// n is 0 or no sum is positive (the running maximum starts at 0). Sums wrap
/// around on overflow, as _mm_add_epi32 does.
///
/// Full groups of four elements are processed with SSE intrinsics; any 1-3
/// leftover elements are handled by a scalar loop. Both arrays must be
/// 16-byte aligned because aligned loads are used.
///
/// _mm_max_epi32 is an SSE4.1 instruction; the target attribute states that
/// requirement on the function itself, so it also builds when SSE4.1 is not
/// enabled for the whole translation unit.
__attribute__((target("sse4.1")))
int testSSE(const int32_t * ghs, const int32_t * lhs, size_t n) {
    const size_t vectorEnd = n - n % 4;  // last index covered by full 4-wide groups

    __m128i vresult = _mm_set1_epi32(0);
    for (size_t k = 0; k < vectorEnd; k += 4) {
        const __m128i v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&lhs[k]));
        const __m128i v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&ghs[k]));
        const __m128i vsum = _mm_add_epi32(v1, v2);
        vresult = _mm_max_epi32(vresult, vsum);
    }

    // Spill the four per-lane maxima and reduce them to one value.
    alignas(16) int32_t result[4] = {0};
    _mm_store_si128(reinterpret_cast<__m128i *>(result), vresult);
    int mymax = result[0];
    for (int lane = 1; lane < 4; lane++) {
        if (result[lane] > mymax) {
            mymax = result[lane];
        }
    }

    // Scalar tail; the sum is formed in unsigned arithmetic so it wraps the
    // same way the vector add does instead of overflowing a signed int.
    for (size_t k = vectorEnd; k < n; k++) {
        const int32_t sum = static_cast<int32_t>(
            static_cast<uint32_t>(lhs[k]) + static_cast<uint32_t>(ghs[k]));
        if (sum > mymax) {
            mymax = sum;
        }
    }
    return mymax;
}
/// Returns the largest 32-bit sum lhs[k] + ghs[k] for k in [0, n), or 0 when
/// n is 0 or no sum is positive (the running maximum starts at 0). Sums wrap
/// around on overflow, as _mm256_add_epi32 does.
///
/// Full groups of eight elements are processed with AVX2 intrinsics; any 1-7
/// leftover elements are handled by a scalar loop. Both arrays must be
/// 32-byte aligned because aligned loads are used.
///
/// The 256-bit integer add/max are AVX2 instructions; the target attribute
/// states that requirement on the function itself, so it also builds when
/// AVX2 is not enabled for the whole translation unit.
__attribute__((target("avx2")))
int testAVX(const int32_t * ghs, const int32_t * lhs, size_t n) {
    const size_t vectorEnd = n - n % 8;  // last index covered by full 8-wide groups

    __m256i vresult = _mm256_set1_epi32(0);
    for (size_t k = 0; k < vectorEnd; k += 8) {
        const __m256i v1 = _mm256_load_si256(reinterpret_cast<const __m256i *>(&ghs[k]));
        const __m256i v2 = _mm256_load_si256(reinterpret_cast<const __m256i *>(&lhs[k]));
        const __m256i vsum = _mm256_add_epi32(v1, v2);
        vresult = _mm256_max_epi32(vresult, vsum);
    }

    // Spill the eight per-lane maxima and reduce them to one value.
    alignas(32) int32_t result[8] = {0};
    _mm256_store_si256(reinterpret_cast<__m256i *>(result), vresult);
    int mymax = result[0];
    for (int lane = 1; lane < 8; lane++) {
        if (result[lane] > mymax) {
            mymax = result[lane];
        }
    }

    // Scalar tail; the sum is formed in unsigned arithmetic so it wraps the
    // same way the vector add does instead of overflowing a signed int.
    for (size_t k = vectorEnd; k < n; k++) {
        const int32_t sum = static_cast<int32_t>(
            static_cast<uint32_t>(lhs[k]) + static_cast<uint32_t>(ghs[k]));
        if (sum > mymax) {
            mymax = sum;
        }
    }
    return mymax;
}
/// Scalar reference implementation: returns the largest 32-bit sum
/// lhs[k] + ghs[k] for k in [0, n), or 0 when n is 0 or no sum is positive
/// (the running maximum starts at 0).
///
/// Each sum is formed in unsigned arithmetic and converted back, so it wraps
/// around on overflow instead of overflowing a signed int.
int testNormal(const int32_t * ghs, const int32_t * lhs, size_t n) {
    int max = 0;
    for (size_t k = 0; k < n; k++) {
        const int32_t tempMax = static_cast<int32_t>(
            static_cast<uint32_t>(lhs[k]) + static_cast<uint32_t>(ghs[k]));
        if (max < tempMax) {
            max = tempMax;
        }
    }
    return max;
}
- void alignTestSSE() {
- int n = 4096;
- int normalResult,sseResult,avxResult;
- int nofTestCases = 1000;
- double time,normalTime,sseTime,avxTime;
- int lhs[n] __attribute__ ((aligned(32)));
- int ghs[n] __attribute__ ((aligned(32)));
- for (int k = 0; k < n; k++) {
- lhs[k] = arc4random();
- ghs[k] = arc4random();
- }
- /* Warming UP */
- for (int k = 0; k < nofTestCases; k++) {
- normalResult = testNormal(lhs,ghs,n);
- }
- for (int k = 0; k < nofTestCases; k++) {
- sseResult = testSSE(lhs,n);
- }
- for (int k = 0; k < nofTestCases; k++) {
- avxResult = testAVX(lhs,n);
- }
- time = timestamp();
- for (int k = 0; k < nofTestCases; k++) {
- normalResult = testNormal(lhs,n);
- }
- normalTime = timestamp() - time;
- time = timestamp();
- for (int k = 0; k < nofTestCases; k++) {
- sseResult = testSSE(lhs,n);
- }
- sseTime = timestamp() - time;
- time = timestamp();
- for (int k = 0; k < nofTestCases; k++) {
- avxResult = testAVX(lhs,n);
- }
- avxTime = timestamp() - time;
- cout << "===========================" << endl;
- cout << "Normal took " << normalTime << " s" << endl;
- cout << "Normal Result: " << normalResult << endl;
- cout << "SSE took " << sseTime << " s" << endl;
- cout << "SSE Result: " << sseResult << endl;
- cout << "AVX took " << avxTime << " s" << endl;
- cout << "AVX Result: " << avxResult << endl;
- cout << "SpeedUP SSE= " << normalTime / sseTime << endl;
- cout << "SpeedUP AVX= " << normalTime / avxTime << endl;
- cout << "===========================" << endl;
- }
- int main()
- {
- alignTestSSE();
- return 0;
- }
测试:
- $clang++ -Wall -mavx2 -O3 -fno-vectorize SO_avx.cpp && ./a.out
- ===========================
- Normal took 0.00324106 s
- Normal Result: 2143749391
- SSE took 0.000527859 s
- SSE Result: 2143749391
- AVX took 0.000221968 s
- AVX Result: 2143749391
- SpeedUP SSE= 6.14002
- SpeedUP AVX= 14.6015
- ===========================
我建议你用 -fno-vectorize(如果使用 g++,则用 -fno-tree-vectorize)编译并运行上面的代码,看看能否得到类似的结果。如果可以,你就可以从这份代码出发,逐步向你的原始代码回推,找出差异出在哪里。