我有一个巨大的 vector<vector<int>>(18M×128)。我经常需要取出其中的两行,并用下面这个函数进行比较:
@H_403_2@int getDiff(int indx1,int indx2) {
int result = 0;
int pplus,pminus,tmp;
for (int k = 0; k < 128; k += 2) {
pplus = nodeL[indx2][k] - nodeL[indx1][k];
pminus = nodeL[indx1][k + 1] - nodeL[indx2][k + 1];
tmp = max(pplus,pminus);
if (tmp > result) {
result = tmp;
}
}
return result;
}
如你所见,该函数遍历两个行向量,做一些减法,最后返回最大值。这个函数会被调用一百万次,所以我想知道能否用 SSE 指令来加速。我使用的是 Ubuntu 12.04 和 gcc。
当然,这属于微优化,但我对 SSE 一无所知,如果你能提供一些帮助就太好了。提前致谢。
基准测试:
@H_403_2@int nofTestCases = 10000000; vector<int> nodeIds(nofTestCases); vector<int> goalNodeIds(nofTestCases); vector<int> results(nofTestCases); for (int l = 0; l < nofTestCases; l++) { nodeIds[l] = randomNodeID(18000000); goalNodeIds[l] = randomNodeID(18000000); } double time,result; time = timestamp(); for (int l = 0; l < nofTestCases; l++) { results[l] = getDiff2(nodeIds[l],goalNodeIds[l]); } result = timestamp() - time; cout << result / nofTestCases << "s" << endl; time = timestamp(); for (int l = 0; l < nofTestCases; l++) { results[l] = getDiff(nodeIds[l],goalNodeIds[l]); } result = timestamp() - time; cout << result / nofTestCases << "s" << endl;哪里
/** Returns a pseudo-random integer in [0, n) for positive n, obtained by scaling rand(). */
int randomNodeID(int n) {
    // Dividing by RAND_MAX + 1.0 keeps the scale factor strictly below 1,
    // so the truncated product never reaches n.
    return static_cast<int>(rand() / (RAND_MAX + 1.0) * n);
}

/** Returns a timestamp ('now') in seconds (incl. a fractional part). */
inline double timestamp() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    // tv_usec counts microseconds; dividing by 1e6 converts it to the fractional second.
    return double(tp.tv_sec) + tp.tv_usec / 1000000.;
}
解决方法
FWIW,我写了一个纯 SSE 版本(需要 SSE4.1),在 Core i7 上似乎比原始的标量代码快约 20%:
@H_403_2@#include <smmintrin.h>
int getDiff_SSE(int indx1,int indx2)
{
int result[4] __attribute__ ((aligned(16))) = { 0 };
const int * const p1 = &nodeL[indx1][0];
const int * const p2 = &nodeL[indx2][0];
const __m128i vke = _mm_set_epi32(0,-1,-1);
const __m128i vko = _mm_set_epi32(-1,0);
__m128i vresult = _mm_set1_epi32(0);
for (int k = 0; k < 128; k += 4)
{
__m128i v1,v2,vmax;
v1 = _mm_loadu_si128((__m128i *)&p1[k]);
v2 = _mm_loadu_si128((__m128i *)&p2[k]);
v1 = _mm_xor_si128(v1,vke);
v2 = _mm_xor_si128(v2,vko);
v1 = _mm_sub_epi32(v1,vke);
v2 = _mm_sub_epi32(v2,vko);
vmax = _mm_add_epi32(v1,v2);
vresult = _mm_max_epi32(vresult,vmax);
}
_mm_store_si128((__m128i *)result,vresult);
return max(max(max(result[0],result[1]),result[2]),result[3]);
}