前几天我正在玩,试图看看我可以在多大程度上优化某些东西.我决定从一个简单的地图开始,只是进行线性搜索,找出是否存在元素,然后尝试优化其中的大部分.另外,为了比较,我使用std :: find对std :: map和std :: vector做同样的事情.
地图的结果是预期的,比我的地图更慢的创建和破坏,但更快的速度(实际上,我无法测量它,它总是返回0).
问题出在std :: vector上.我希望它比我的实现慢,但不是,我真的不明白它是如何相同或更快,因为我的实现正在跳过最坏的情况(值不在向量中)并且是使用结果缓存.
谁能在这里解决一些问题?我知道stl背后的人是半神,但仍然没有意义.
基准测试结果(i3,Windows 8.1 Pro 64,Visual Studio 2013):
std::vector : Build : 85.0042 ms Loop : 37.0011 ms Find : 1.82259 ms -> First : Found,Second : Found,Third : Not Found Release : 0 ms -------------------- std::map : Build : 6929.41 ms Loop : 570.032 ms Find : 0 ms -> First : Found,Third : Not Found Release : 1425.08 -------------------- Linear Map V0: Build : 194.012 ms Loop : 49.0052 ms Find : 1.88915 ms -> First : Found,Third : Not Found Release : 109.004
这是地图的代码:
template<typename T> class LinearMap0 { public: LinearMap0() { _end = _root = new Node; _prebuffer = nullptr; prebufferCapacity = 0; _alive = true; prebufferMarker = 0; _cache = _mm_set1_epi32(-1); for (auto& ptr : _cacheBuffer) ptr = nullptr; MinID = INT32_MAX - 1; MaxID = -1; } void PreAllocate(int Count) { prebufferCapacity = Count; _prebuffer = new Node[Count]; } ~LinearMap0() { if (_alive) { Release(); } } void Release() { Node* marker = _end; while (marker->Prev) { marker = marker->Prev; if (!marker->Next->IsPreAllocated) delete marker->Next; } if (!_root->IsPreAllocated) delete _root; delete[] _prebuffer; _alive = false; } void AddElement(int ID,T element) { Node* tmp = nullptr; if (prebufferMarker < prebufferCapacity) { // Use a pre-allocated object tmp = &_prebuffer[prebufferMarker]; prebufferMarker++; tmp->IsPreAllocated = true; } else { tmp = new Node; } tmp->ID = ID; tmp->Data = element; // Update list _end->Next = tmp; Node* prevEnd = _end; _end = tmp; _end->Prev = prevEnd; bool isMin = ID < MinID; MinID = ID * isMin + (1 - isMin) * MinID; bool isMax = ID > MaxID; MaxID = ID * isMax + (1 - isMax) * MaxID; } void DeleteLast() { Node* tmp = _end; _end = _end->Prev; _end->Next = nullptr; delete tmp; } template<class Function> void Loop(Function&& f,bool Forward = true) { if (Forward) { Node* marker = _root; while (marker->Next) { marker = marker->Next; f(marker->Data); } } else { Node* marker = _end; while (marker->Prev) { marker = marker->Prev; f(marker->Data); } } } T* Find(int ID) { // Bounds check if (ID < MinID || ID > MaxID) return nullptr; // Check it it's in the cache // Compare the value to every value in the cache __m128i idxSSE = _mm_set1_epi32(ID); __m128i C = _mm_cmpeq_epi32(_cache,idxSSE); // To change form -1 to 1 C = _mm_mul_epi32(C,_mm_set1_epi32(-1)); // Now C holds 1 if true,or 0 if false (in each of its 4 members). It should only be ONE set at 1 __m128i tmp = _mm_set1_epi32(1); __m128i S = _mm_sub_epi32(tmp,C); // Now find the index int i = S.m128i_i32[0] * (C.m128i_i32[1] + S.m128i_i32[1] * (2 * C.m128i_i32[2] + S.m128i_i32[2] * (3 * C.m128i_i32[3] + S.m128i_i32[3] * -1))); if (i != -1) return _cacheBuffer[i]; // Traverse the list Node* marker0 = _root; T* obj = nullptr; while (true) { if (marker0->ID == ID) { obj = &marker0->Data; } if (marker0->Next) marker0 = marker0->Next; else break; } // Cache value and return _cache.m128i_i32[cacheMarker] = ID; _cacheBuffer[cacheMarker] = obj; cacheMarker = (cacheMarker + 1) & 3; // x & 3 = x % 4 return obj; } private: struct Node { Node() { Prev = nullptr; Next = nullptr; IsPreAllocated = false; ID = -1; } T Data; Node* Prev; Node* Next; bool IsPreAllocated; int ID; }; Node* _root; Node* _end; Node* _prebuffer; int prebufferCapacity; int prebufferMarker; bool _alive; __m128i _cache; T* _cacheBuffer[4]; int cacheMarker; int MinID,MaxID; };
这是基准:
// Initialize seeds const __int64 ecount = 5 * 1000*1000; vector<__int64> seed(ecount); for (__int64 i = 0; i < ecount; i++) { seed[i] = i; } random_shuffle(seed.begin(),seed.end()); ///////////// std::vector vector<__int64> v; cout << "--------------------" << endl; cout << "std::vector :" << endl; cout << " Build : " << time_call([&]() { v.resize(ecount/2); for (__int64 i = 0; i < ecount; i++) { if (i < (ecount / 2)) v[i] = seed[i]; else v.push_back(seed[i]); } }) << " ms" << endl; cout << " Loop : " << time_call([&]() { for (auto& n : v) n /= 2; }) << " ms" << endl; bool found1,found2,found3; cout << " Find : " << (((float)time_call([&]() { for (int i = 0; i < 15; i++) { // Should exist found1 = find(v.begin(),v.end(),seed[5] / 2) != v.end();//find(seed[5]) != m.end(); found2 = find(v.begin(),seed[1000] / 2) != v.end(); // Shouldn't exist found3 = find(v.begin(),-1234) != v.end(); } })) / 15.0) / 3.0; cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ",Second : " << ((found2) ? "Found" : "Not Found") << ",Third : " << ((found3) ? "Found" : "Not Found") << endl; cout << " Release : " << time_call([&]() { v.clear(); }) << " ms" << endl; ///////////// std::map map<__int64,__int64> m; cout << "--------------------" << endl; cout << "std::map :" << endl; cout << " Build : " << time_call([&]() { for (__int64 i = 0; i < ecount; i++) { m[seed[i]] = seed[i]; } }) << " ms" << endl; cout << " Loop : " << time_call([&]() { for (auto& n : m) n.second /= 2; }) << " ms" << endl; cout << " Find : " << (((float)time_call([&]() { for (int i = 0; i < 15; i++) { // Should exist found1 = m.find(seed[5]) != m.end(); found2 = m.find(seed[1000]) != m.end(); // Shouldn't exist found3 = m.find(-1234) != m.end(); } })) / 15.0) / 3.0; cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ",Third : " << ((found3) ? "Found" : "Not Found") << endl; cout << " Release : " << time_call([&]() { m.clear(); }) << endl; ///////////// Linear Map V0 LinearMap0<__int64> c; cout << "--------------------" << endl; cout << "Linear Map V0:" << endl; cout << " Build : " << time_call([&]() { c.PreAllocate(ecount / 2); for (__int64 i = 0; i < ecount; i++) { c.AddElement(seed[i],seed[i]); } }) << " ms" << endl; cout << " Loop : " << time_call([&]() { c.Loop([](__int64& Data) { Data /= 2; }); }) << " ms" << endl; cout << " Find : " << (((float)time_call([&]() { for (int i = 0; i < 15; i++) { // Should exist found1 = c.Find(seed[5]); found2 = c.Find(seed[1000]); // Shouldn't exist found3 = c.Find(-1234); } })) / 15.0) / 3.0; cout << " ms -> First : " << ((found1) ? "Found" : "Not Found") << ",Third : " << ((found3) ? "Found" : "Not Found") << endl; cout << " Release : " << time_call([&]() { c.Release(); }) << endl;
编辑:time_call是:
template <class Function> double time_call(Function&& f) { chrono::time_point<chrono::high_resolution_clock> start,end; start = chrono::high_resolution_clock::now(); f(); end = chrono::high_resolution_clock::now(); return ((double)(chrono::duration_cast<chrono::nanoseconds>(end - start).count())) / 1000000.0; }
解决方法
您的容器是链表,而std :: vector是动态大小的数组.
链表方法具有诸多优点,例如能够在不重新分配的情况下插入元素.
然而阵列方法有一些显着的优点:
>线性搜索只扫描内存,这正是构建缓存和预取程序的内容.链接列表的扫描效率较低,因为每次跳转到未缓存的内存意味着昂贵的缓存未命中.>线性阵列扫描易于矢量化.如果使用-O3进行编译,则编译器可能会使用std :: find的矢量化版本.由于内存依赖性,无法对链表扫描进行矢量化.>使用的内存量.您的链接列表必须维护下一个指针,这有效地使您的元素更大.而且,每个非预分配的节点必须支付分配器的开销(即新的和删除的计费数据).这意味着您可以更快地达到内存带宽限制,并且可以在缓存中容纳更少的元素.