在C++11中,
// Returns a reference to a constant vector holding 1, 2, 3.
// The vector is a function-local static, so it is constructed the first time
// control reaches its declaration and every call yields the same object.
const std::vector<int>& f() {
    static const auto values = std::vector<int>{1, 2, 3};
    return values;
}
“The best intuition to be ever had is ‘I should measure this.’”所以
let’s find out:
// Micro-benchmark comparing two ways of exposing a constant vector:
//   f() - returns a reference to a function-local static
//   g() - returns a reference to a namespace-scope static
// Each accessor is called n_iterations times and the elapsed wall-clock time
// in milliseconds is printed, followed by a checksum of the accumulated sums.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <type_traits>
#include <vector>

namespace {

// Stopwatch that records a start time on construction and reports the time
// elapsed since then.
class timer {
    using hrc = std::chrono::high_resolution_clock;
    hrc::time_point start;

    // Reads the clock between two sequentially-consistent fences.
    static hrc::time_point now() {
        // Prevent memory operations from reordering across the
        // time measurement. This is likely overkill, needs more
        // research to determine the correct fencing.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        auto t = hrc::now();
        std::atomic_thread_fence(std::memory_order_seq_cst);
        return t;
    }

public:
    timer() : start(now()) {}

    // Elapsed time in the clock's native duration type.
    hrc::duration elapsed() const { return now() - start; }

    // Elapsed time converted to Duration, returned as a plain tick count.
    template <typename Duration>
    typename Duration::rep elapsed() const {
        return std::chrono::duration_cast<Duration>(elapsed()).count();
    }

    // Elapsed time as a tick count of std::chrono::duration<Rep, Period>.
    template <typename Rep, typename Period>
    Rep elapsed() const {
        return elapsed<std::chrono::duration<Rep, Period>>();
    }
};

// Variant under test: function-local static.
const std::vector<int>& f() {
    static const auto x = std::vector<int>{ 1,3 };
    return x;
}

// Variant under test: namespace-scope static.
static const auto y = std::vector<int>{ 1,3 };
const std::vector<int>& g() { return y; }

const unsigned long long n_iterations = 500000000;

// Times n_iterations calls of `f`, summing the elements of the returned
// vector on every iteration, then prints "<name>: <ms> ms (<sum>)".
// The sum is printed so the loop has an observable result.
template <typename F>
void test_one(const char* name, F f) {
    f();  // First call outside the timer.

    // Element type of the returned container; std::decay needs <type_traits>.
    using value_type = typename std::decay<decltype(f()[0])>::type;

    std::cout << name << ": " << std::flush;

    auto t = timer{};
    auto sum = std::uint64_t{};
    for (auto i = n_iterations; i > 0; --i) {
        const auto& vec = f();
        sum += std::accumulate(begin(vec), end(vec), value_type{});
    }
    const auto elapsed_ms = t.elapsed<std::chrono::milliseconds>();

    std::cout << elapsed_ms << " ms (" << sum << ")\n";
}

}  // anonymous namespace

int main() {
    test_one("local static", f);
    test_one("global static", g);
}
在Coliru上运行,本地版本完成5e8次迭代用时4618 ms,全局版本用时4392 ms.所以是的,本地版本每次迭代大约慢0.452纳秒.尽管存在可测量的差异,但在大多数情况下,这个差异太小,不会对实际观察到的性能产生影响.
编辑:一个有趣的反例,从clang++换成g++会改变结果的先后顺序.g++编译的二进制文件运行时间为4418毫秒(全局)与4181毫秒(本地),所以本地版本每次迭代反而快了474皮秒.不过,这再次印证了这样的结论:两种方法之间的差异很小.
// Micro-benchmark comparing two ways of exposing a constant vector, with the
// accessors written as function objects instead of free functions:
//   f - operator() returns a reference to a function-local static
//   g - operator() returns a reference to a static data member
// Each accessor is called n_iterations times and the elapsed wall-clock time
// in milliseconds is printed, followed by a checksum of the accumulated sums.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <type_traits>
#include <vector>

namespace {

// Stopwatch that records a start time on construction and reports the time
// elapsed since then.
class timer {
    using hrc = std::chrono::high_resolution_clock;
    hrc::time_point start;

    // Reads the clock between two sequentially-consistent fences.
    static hrc::time_point now() {
        // Prevent memory operations from reordering across the
        // time measurement. This is likely overkill.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        auto t = hrc::now();
        std::atomic_thread_fence(std::memory_order_seq_cst);
        return t;
    }

public:
    timer() : start(now()) {}

    // Elapsed time in the clock's native duration type.
    hrc::duration elapsed() const { return now() - start; }

    // Elapsed time converted to Duration, returned as a plain tick count.
    template <typename Duration>
    typename Duration::rep elapsed() const {
        return std::chrono::duration_cast<Duration>(elapsed()).count();
    }

    // Elapsed time as a tick count of std::chrono::duration<Rep, Period>.
    template <typename Rep, typename Period>
    Rep elapsed() const {
        return elapsed<std::chrono::duration<Rep, Period>>();
    }
};

// Variant under test: function-local static inside operator().
class f {
public:
    const std::vector<int>& operator()() {
        static const auto x = std::vector<int>{ 1,3 };
        return x;
    }
};

// Variant under test: static data member defined at namespace scope.
class g {
    static const std::vector<int> x;

public:
    const std::vector<int>& operator()() { return x; }
};

const std::vector<int> g::x{ 1,3 };

const unsigned long long n_iterations = 500000000;

// Times n_iterations calls of `f`, summing the elements of the returned
// vector on every iteration, then prints "<name>: <ms> ms (<sum>)".
// The sum is printed so the loop has an observable result.
template <typename F>
void test_one(const char* name, F f) {
    f();  // First call outside the timer.

    // Element type of the returned container; std::decay needs <type_traits>.
    using value_type = typename std::decay<decltype(f()[0])>::type;

    std::cout << name << ": " << std::flush;

    auto t = timer{};
    auto sum = std::uint64_t{};
    for (auto i = n_iterations; i > 0; --i) {
        const auto& vec = f();
        sum += std::accumulate(begin(vec), end(vec), value_type{});
    }
    const auto elapsed_ms = t.elapsed<std::chrono::milliseconds>();

    std::cout << elapsed_ms << " ms (" << sum << ")\n";
}

}  // anonymous namespace

int main() {
    // Temporaries of the function-object types are passed by value.
    test_one("local static", f());
    test_one("global static", g());
}
毫不奇怪,g++ (3803ms local,2323ms global)和clang (4183ms local,3253ms global)的运行时间都更短了.结果印证了我们的直觉:全局版本应该比本地版本更快,每次迭代的差值为2.96纳秒(g++)和1.86纳秒(clang).