c – 在Haswell上AVX2比SSE慢

前端之家收集整理的这篇文章主要介绍了“c – 在Haswell上AVX2比SSE慢”,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
我有以下代码(普通版、SSE版和AVX版):
  1. int testSSE(const aligned_vector & ghs,const aligned_vector & lhs) {
  2. int result[4] __attribute__((aligned(16))) = {0};
  3. __m128i vresult = _mm_set1_epi32(0);
  4. __m128i v1,v2,vmax;
  5.  
  6. for (int k = 0; k < ghs.size(); k += 4) {
  7. v1 = _mm_load_si128((__m128i *) & lhs[k]);
  8. v2 = _mm_load_si128((__m128i *) & ghs[k]);
  9. vmax = _mm_add_epi32(v1,v2);
  10. vresult = _mm_max_epi32(vresult,vmax);
  11. }
  12. _mm_store_si128((__m128i *) result,vresult);
  13. int mymax = result[0];
  14. for (int k = 1; k < 4; k++) {
  15. if (result[k] > mymax) {
  16. mymax = result[k];
  17. }
  18. }
  19. return mymax;
  20. }
  21.  
  22. int testAVX(const aligned_vector & ghs,const aligned_vector & lhs) {
  23. int result[8] __attribute__((aligned(32))) = {0};
  24. __m256i vresult = _mm256_set1_epi32(0);
  25. __m256i v1,vmax;
  26.  
  27. for (int k = 0; k < ghs.size(); k += 8) {
  28. v1 = _mm256_load_si256((__m256i *) & ghs[ k]);
  29. v2 = _mm256_load_si256((__m256i *) & lhs[k]);
  30. vmax = _mm256_add_epi32(v1,v2);
  31. vresult = _mm256_max_epi32(vresult,vmax);
  32. }
  33. _mm256_store_si256((__m256i *) result,vresult);
  34. int mymax = result[0];
  35. for (int k = 1; k < 8; k++) {
  36. if (result[k] > mymax) {
  37. mymax = result[k];
  38. }
  39. }
  40. return mymax;
  41. }
  42.  
  43. int testNormal(const aligned_vector & ghs,const aligned_vector & lhs) {
  44. int max = 0;
  45. int tempMax;
  46. for (int k = 0; k < ghs.size(); k++) {
  47. tempMax = lhs[k] + ghs[k];
  48. if (max < tempMax) {
  49. max = tempMax;
  50. }
  51. }
  52. return max;
  53. }

所有这些函数都使用以下代码进行测试:

  1. void alignTestSSE() {
  2. aligned_vector lhs;
  3. aligned_vector ghs;
  4.  
  5. int mySize = 4096;
  6. int FinalResult;
  7. int nofTestCases = 1000;
  8. double time,time1,time2,time3;
  9. vector<int> lhs2;
  10. vector<int> ghs2;
  11.  
  12. lhs.resize(mySize);
  13. ghs.resize(mySize);
  14. lhs2.resize(mySize);
  15. ghs2.resize(mySize);
  16.  
  17. srand(1);
  18. for (int k = 0; k < mySize; k++) {
  19. lhs[k] = randomNodeID(1000000);
  20. lhs2[k] = lhs[k];
  21. ghs[k] = randomNodeID(1000000);
  22. ghs2[k] = ghs[k];
  23. }
  24. /* Warming UP */
  25. for (int k = 0; k < nofTestCases; k++) {
  26. FinalResult = testNormal(lhs,ghs);
  27. }
  28.  
  29. for (int k = 0; k < nofTestCases; k++) {
  30. FinalResult = testSSE(lhs,ghs);
  31. }
  32.  
  33. for (int k = 0; k < nofTestCases; k++) {
  34. FinalResult = testAVX(lhs,ghs);
  35. }
  36.  
  37. cout << "===========================" << endl;
  38. time = timestamp();
  39. for (int k = 0; k < nofTestCases; k++) {
  40. FinalResult = testSSE(lhs,ghs);
  41. }
  42. time = timestamp() - time;
  43. time1 = time;
  44. cout << "SSE took " << time << " s" << endl;
  45. cout << "SSE Result: " << FinalResult << endl;
  46.  
  47. time = timestamp();
  48. for (int k = 0; k < nofTestCases; k++) {
  49. FinalResult = testAVX(lhs,ghs);
  50. }
  51. time = timestamp() - time;
  52. time3 = time;
  53. cout << "AVX took " << time << " s" << endl;
  54. cout << "AVX Result: " << FinalResult << endl;
  55.  
  56.  
  57.  
  58. time = timestamp();
  59. for (int k = 0; k < nofTestCases; k++) {
  60. FinalResult = testNormal(lhs,ghs);
  61. }
  62. time = timestamp() - time;
  63. cout << "Normal took " << time << " s" << endl;
  64. cout << "Normal Result: " << FinalResult << endl;
  65. cout << "SpeedUP SSE= " << time / time1 << " s" << endl;
  66. cout << "SpeedUP AVX= " << time / time3 << " s" << endl;
  67. cout << "===========================" << endl;
  68. ghs.clear();
  69. lhs.clear();
  70. }

其中:

  /*
   * Current time as reported by gettimeofday(), in seconds: the whole seconds
   * plus the microsecond field scaled to a fraction of a second.
   */
  inline double timestamp() {
      struct timeval now;
      gettimeofday(&now, NULL);
      const double whole_seconds = double(now.tv_sec);
      const double fraction = now.tv_usec / 1000000.;
      return whole_seconds + fraction;
  }

  1. typedef vector<int,aligned_allocator<int,sizeof (int)> > aligned_vector;

是使用 https://gist.github.com/donny-dont/1471329 中的 AlignedAllocator 实现的对齐向量(aligned vector)。

我的机器是 Intel i7-4771(Haswell),运行最新的 64 位 Ubuntu 14.04 和 gcc 4.8.2,所有软件都已更新到最新。编译选项为 -march=native -mtune=native -O3 -m64。

结果是:

  1. SSE took 0.000375986 s
  2. SSE Result: 1982689
  3. AVX took 0.000459909 s
  4. AVX Result: 1982689
  5. Normal took 0.00315714 s
  6. Normal Result: 1982689
  7. SpeedUP SSE= 8.39696 s
  8. SpeedUP AVX= 6.8647 s

这表明同样的逻辑用AVX2实现比用SSE实现慢了约22%。是我哪里做错了,还是这属于正常现象?

解决方法

我把你的代码改写成了更朴素的C风格(使用普通数组,不用vector等),做了一些清理,并在禁用自动向量化的情况下进行测试,得到了合理的结果:
  1. #include <iostream>
  2. using namespace std;
  3.  
  4. #include <sys/time.h>
  5. #include <cstdlib>
  6. #include <cstdint>
  7.  
  8. #include <immintrin.h>
  9.  
  /*
   * Current time as reported by gettimeofday(), in seconds: the whole seconds
   * plus the microsecond field scaled to a fraction of a second.
   */
  inline double timestamp() {
      struct timeval now;
      gettimeofday(&now, NULL);
      const double whole_seconds = double(now.tv_sec);
      const double fraction = now.tv_usec / 1000000.;
      return whole_seconds + fraction;
  }
  15.  
  /*
   * SSE kernel: returns the largest lhs[k] + ghs[k] for k in [0, n), or 0 when
   * no sum is positive (the running maximum starts at zero).
   *
   * ghs, lhs  input arrays of n 32-bit ints; both must be 16-byte aligned
   *           because full vectors are read with _mm_load_si128.
   * n         element count; blocks of four are processed with SSE and the
   *           remaining n % 4 elements by a scalar tail loop, so no element
   *           past n is read.
   *
   * Sums wrap around on overflow, as _mm_add_epi32 does.
   * The target attribute lets the function build even when SSE4.1 (needed by
   * _mm_max_epi32) is not enabled for the whole translation unit.
   */
  __attribute__((target("sse4.1")))
  int testSSE(const int32_t * ghs,const int32_t * lhs,size_t n) {
      int32_t result[4] __attribute__((aligned(16))) = {0};
      __m128i vresult = _mm_setzero_si128();
      size_t k = 0;

      // n - k cannot underflow: k only grows while at least 4 elements remain.
      for (; n - k >= 4; k += 4) {
          const __m128i v1 = _mm_load_si128((const __m128i *) &lhs[k]);
          const __m128i v2 = _mm_load_si128((const __m128i *) &ghs[k]);
          const __m128i vsum = _mm_add_epi32(v1, v2);
          vresult = _mm_max_epi32(vresult, vsum);
      }
      _mm_store_si128((__m128i *) result, vresult);

      // Horizontal reduction of the four lanes.
      int mymax = result[0];
      for (int lane = 1; lane < 4; lane++) {
          if (result[lane] > mymax) {
              mymax = result[lane];
          }
      }

      // Scalar tail; the add goes through uint32_t so it wraps like the SIMD add.
      for (; k < n; k++) {
          const int32_t sum = (int32_t) ((uint32_t) lhs[k] + (uint32_t) ghs[k]);
          if (sum > mymax) {
              mymax = sum;
          }
      }
      return mymax;
  }
  33.  
  /*
   * AVX2 kernel: returns the largest lhs[k] + ghs[k] for k in [0, n), or 0
   * when no sum is positive (the running maximum starts at zero).
   *
   * ghs, lhs  input arrays of n 32-bit ints; both must be 32-byte aligned
   *           because full vectors are read with _mm256_load_si256.
   * n         element count; blocks of eight are processed with AVX2 and the
   *           remaining n % 8 elements by a scalar tail loop, so no element
   *           past n is read.
   *
   * Sums wrap around on overflow, as _mm256_add_epi32 does.
   * The target attribute lets the function build even when AVX2 is not
   * enabled for the whole translation unit.
   */
  __attribute__((target("avx2")))
  int testAVX(const int32_t * ghs,const int32_t * lhs,size_t n) {
      int32_t result[8] __attribute__((aligned(32))) = {0};
      __m256i vresult = _mm256_setzero_si256();
      size_t k = 0;

      // n - k cannot underflow: k only grows while at least 8 elements remain.
      for (; n - k >= 8; k += 8) {
          const __m256i v1 = _mm256_load_si256((const __m256i *) &ghs[k]);
          const __m256i v2 = _mm256_load_si256((const __m256i *) &lhs[k]);
          const __m256i vsum = _mm256_add_epi32(v1, v2);
          vresult = _mm256_max_epi32(vresult, vsum);
      }
      _mm256_store_si256((__m256i *) result, vresult);

      // Horizontal reduction of the eight lanes.
      int mymax = result[0];
      for (int lane = 1; lane < 8; lane++) {
          if (result[lane] > mymax) {
              mymax = result[lane];
          }
      }

      // Scalar tail; the add goes through uint32_t so it wraps like the SIMD add.
      for (; k < n; k++) {
          const int32_t sum = (int32_t) ((uint32_t) lhs[k] + (uint32_t) ghs[k]);
          if (sum > mymax) {
              mymax = sum;
          }
      }
      return mymax;
  }
  51.  
  /*
   * Scalar reference kernel: returns the largest lhs[k] + ghs[k] for k in
   * [0, n), or 0 when no sum is positive (the running maximum starts at zero).
   *
   * ghs, lhs  input arrays of n 32-bit ints.
   * n         element count; 0 yields 0.
   *
   * The addition is done in uint32_t and converted back, so an overflowing sum
   * wraps around (as the SIMD kernels' packed add does) instead of being
   * undefined behaviour.
   */
  int testNormal(const int32_t * ghs,const int32_t * lhs,size_t n) {
      int best = 0;
      for (size_t k = 0; k < n; k++) {
          const int32_t sum = (int32_t) ((uint32_t) lhs[k] + (uint32_t) ghs[k]);
          if (best < sum) {
              best = sum;
          }
      }
      return best;
  }
  63.  
  64. void alignTestSSE() {
  65.  
  66. int n = 4096;
  67. int normalResult,sseResult,avxResult;
  68. int nofTestCases = 1000;
  69. double time,normalTime,sseTime,avxTime;
  70.  
  71. int lhs[n] __attribute__ ((aligned(32)));
  72. int ghs[n] __attribute__ ((aligned(32)));
  73.  
  74. for (int k = 0; k < n; k++) {
  75. lhs[k] = arc4random();
  76. ghs[k] = arc4random();
  77. }
  78.  
  79. /* Warming UP */
  80. for (int k = 0; k < nofTestCases; k++) {
  81. normalResult = testNormal(lhs,ghs,n);
  82. }
  83.  
  84. for (int k = 0; k < nofTestCases; k++) {
  85. sseResult = testSSE(lhs,n);
  86. }
  87.  
  88. for (int k = 0; k < nofTestCases; k++) {
  89. avxResult = testAVX(lhs,n);
  90. }
  91.  
  92. time = timestamp();
  93. for (int k = 0; k < nofTestCases; k++) {
  94. normalResult = testNormal(lhs,n);
  95. }
  96. normalTime = timestamp() - time;
  97.  
  98. time = timestamp();
  99. for (int k = 0; k < nofTestCases; k++) {
  100. sseResult = testSSE(lhs,n);
  101. }
  102. sseTime = timestamp() - time;
  103.  
  104. time = timestamp();
  105. for (int k = 0; k < nofTestCases; k++) {
  106. avxResult = testAVX(lhs,n);
  107. }
  108. avxTime = timestamp() - time;
  109.  
  110. cout << "===========================" << endl;
  111. cout << "Normal took " << normalTime << " s" << endl;
  112. cout << "Normal Result: " << normalResult << endl;
  113. cout << "SSE took " << sseTime << " s" << endl;
  114. cout << "SSE Result: " << sseResult << endl;
  115. cout << "AVX took " << avxTime << " s" << endl;
  116. cout << "AVX Result: " << avxResult << endl;
  117. cout << "SpeedUP SSE= " << normalTime / sseTime << endl;
  118. cout << "SpeedUP AVX= " << normalTime / avxTime << endl;
  119. cout << "===========================" << endl;
  120.  
  121. }
  122.  
  123. int main()
  124. {
  125. alignTestSSE();
  126. return 0;
  127. }

测试:

  1. $clang++ -Wall -mavx2 -O3 -fno-vectorize SO_avx.cpp && ./a.out
  2. ===========================
  3. Normal took 0.00324106 s
  4. Normal Result: 2143749391
  5. SSE took 0.000527859 s
  6. SSE Result: 2143749391
  7. AVX took 0.000221968 s
  8. AVX Result: 2143749391
  9. SpeedUP SSE= 6.14002
  10. SpeedUP AVX= 14.6015
  11. ===========================

我建议你试一下上面的代码,编译时加上 -fno-vectorize(如果使用 g++,则是 -fno-tree-vectorize),看看是否能得到类似的结果。如果可以,你就可以从这份代码出发,逐步向你的原始代码靠拢,找出差异出在哪里。

猜你在找的C&C++相关文章