> 2 x Xeon E5-2690 v4 @ 2.60GHz(Turbo Boost ON,HT OFF,共28个逻辑cpu)
> 32GB DDR4 2400内存,四通道
并在每台机器上分别安装了Windows 7 SP1(x64)和Windows 10 Creators Update(x64).
然后我们运行了一个小型内存基准测试(代码见下,使用VS2015 Update 3以64位架构构建),它从多个线程同时执行内存的分配、填充和释放.
#include <Windows.h>
#include <ppl.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

// Reads the current value of the high-resolution performance counter (ticks).
unsigned __int64 ZQueryPerformanceCounter()
{
    // Use a real LARGE_INTEGER instead of casting an integer's address to
    // LARGE_INTEGER*, which relied on type punning.
    LARGE_INTEGER ticks;
    ::QueryPerformanceCounter(&ticks);
    return static_cast<unsigned __int64>(ticks.QuadPart);
}

// Reads the performance-counter frequency (ticks per second).
unsigned __int64 ZQueryPerformanceFrequency()
{
    LARGE_INTEGER ticksPerSecond;
    ::QueryPerformanceFrequency(&ticksPerSecond);
    return static_cast<unsigned __int64>(ticksPerSecond.QuadPart);
}

// Stopwatch built on the performance counter. The start tick is captured at
// construction and again on reset(); the elapsed*() accessors report the time
// since the most recent of the two.
class CZPerfCounter
{
public:
    CZPerfCounter() : m_st(ZQueryPerformanceCounter()) {}

    // Restarts the measurement from "now".
    void reset() { m_st = ZQueryPerformanceCounter(); }

    // Elapsed time in raw counter ticks.
    unsigned __int64 elapsedCount() const { return ZQueryPerformanceCounter() - m_st; }

    // Elapsed time in whole milliseconds (narrowed to unsigned long).
    unsigned long elapsedMS() const
    {
        return static_cast<unsigned long>(elapsedCount() * 1000 / m_freq);
    }

    // Elapsed time in whole microseconds (narrowed to unsigned long).
    unsigned long elapsedMicroSec() const
    {
        return static_cast<unsigned long>(elapsedCount() * 1000 * 1000 / m_freq);
    }

    // Counter frequency in ticks per second, queried once at static-init time.
    static unsigned __int64 frequency() { return m_freq; }

private:
    unsigned __int64 m_st;           // tick value at construction / last reset()
    static unsigned __int64 m_freq;  // ticks per second
};

unsigned __int64 CZPerfCounter::m_freq = ZQueryPerformanceFrequency();

namespace
{
const int kAllocsPerTask = 100;            // blocks allocated by each parallel task
const int kNumFills = 1;                   // memset passes over every block (also the fill byte)
const int kTaskCount = 50;                 // parallel_for iterations per block size
const int kMinBlockBytes = 1 * 1024 * 1024;
const int kMaxBlockBytes = 8 * 1024 * 1024;

// One benchmark task: allocate kAllocsPerTask blocks of blockBytes bytes,
// fill each of them kNumFills times, then free them all.
void AllocFillFree(int blockBytes)
{
    const size_t bytes = static_cast<size_t>(blockBytes);

    std::vector<void *> blocks;
    blocks.reserve(kAllocsPerTask);
    for (int n = 0; n < kAllocsPerTask; ++n)
    {
        void *block = std::malloc(bytes);
        if (block == nullptr)
        {
            // Never hand a null pointer to memset. Warn, and continue with the
            // blocks obtained so far so that they are still filled and freed.
            std::fprintf(stderr,
                         "malloc(%d) failed after %d blocks; timing for this size is unreliable.\n",
                         blockBytes, n);
            break;
        }
        blocks.push_back(block);
    }

    for (int pass = 0; pass < kNumFills; ++pass)
    {
        for (void *block : blocks)
        {
            std::memset(block, kNumFills, bytes);
        }
    }

    for (void *block : blocks)
    {
        std::free(block);
    }
}
} // namespace

// Usage: <program> [threads]
// Runs the allocate/fill/free benchmark for block sizes from 1 MB to 8 MB
// (doubling each step) and prints the elapsed time per size and in total.
// `threads` caps the Concurrency Runtime scheduler's MaxConcurrency; by default
// it is the processor count reported by GetSystemInfo.
int main(int argc, char **argv)
{
    SYSTEM_INFO sysinfo;
    GetSystemInfo(&sysinfo);
    int ncpu = static_cast<int>(sysinfo.dwNumberOfProcessors);
    if (argc == 2)
    {
        ncpu = std::atoi(argv[1]);
        if (ncpu < 1)
        {
            // atoi yields 0 for non-numeric text. Per the ConcRT documentation
            // SchedulerPolicy throws on unsupported values, and only
            // default_scheduler_exists is caught below, so reject bad input here.
            std::fprintf(stderr, "Invalid thread count '%s' (expected an integer >= 1).\n", argv[1]);
            return 1;
        }
    }

    std::printf("No of threads %d\n", ncpu);

    try
    {
        concurrency::Scheduler::ResetDefaultSchedulerPolicy();
        const int min_threads = 1;
        const int max_threads = ncpu;
        concurrency::SchedulerPolicy policy(
            2, // two entries of policy settings
            concurrency::MinConcurrency, min_threads,
            concurrency::MaxConcurrency, max_threads);
        concurrency::Scheduler::SetDefaultSchedulerPolicy(policy);
    }
    catch (const concurrency::default_scheduler_exists &)
    {
        std::printf("Cannot set concurrency runtime scheduler policy (Default scheduler already exists).\n");
    }

    CZPerfCounter pcTotal;

    // malloc/free
    std::printf("malloc/free\n");
    {
        CZPerfCounter pc;
        for (int blockBytes = kMinBlockBytes; blockBytes <= kMaxBlockBytes; blockBytes *= 2)
        {
            // The iteration index is unused: every task does identical work.
            concurrency::parallel_for(0, kTaskCount, [blockBytes](int) { AllocFillFree(blockBytes); });

            std::printf("size %4d MB,elapsed %8.2f s,\n", blockBytes / (1024 * 1024), pc.elapsedMS() / 1000.0);
            pc.reset();
        }
    }

    std::printf("\n");
    std::printf("Total %6.2f s\n", pcTotal.elapsedMS() / 1000.0);

    return 0;
}
令人惊讶的是,与Windows 7相比,Windows 10 CU的结果非常糟糕.我将1MB和8MB两种块大小的结果绘制成下面的图表,线程数依次取2,4,…,28.随着线程数增加,Windows 7的性能只是略有下降,而Windows 10的可扩展性要差得多.
我们已尝试确保应用所有Windows更新,更新驱动程序,调整BIOS设置,但未成功.我们还在其他几个硬件平台上运行了相同的基准测试,所有这些都为Windows 10提供了类似的曲线.所以它似乎是Windows 10的一个问题.
有没有人有类似的经历,或者对此有所了解(也许我们遗漏了什么?).这种行为给我们的多线程应用程序带来了显著的性能损失.
***已编辑
使用https://github.com/google/UIforETW(感谢Bruce Dawson)分析基准测试,我们发现大部分时间都花在内核KiPageFault上.进一步向下挖掘调用树,所有这些都导致ExpWaitForSpinLockExclusiveAndAcquire.似乎锁争用导致此问题.
***已编辑
在同一硬件上收集Server 2012 R2数据. Server 2012 R2也比Win7差,但仍然比Win10 CU好很多.
***已编辑
它也发生在Server 2016中.我添加了标签windows-server-2016.
***已编辑
根据@Ext3h提供的信息,我修改了基准测试以使用VirtualAlloc和VirtualLock.与不使用VirtualLock时相比,我可以确认有显著改进.但即使同时使用VirtualAlloc和VirtualLock,Win10总体上仍然比Win7慢30%到40%.
这是更新的图表.
Win 10 FCU和WKS的开销低于Win 7.作为交换,VirtualLock似乎有更高的开销.