c – 多线程random_r比单线程版本慢

以下程序与 here 所述的程序基本相同.当我用两个线程(NTHREADS == 2)编译并运行该程序时,得到如下运行时间：

real        0m14.120s
user        0m25.570s
sys         0m0.050s

当它只运行一个线程(NTHREADS == 1)时,我的运行时间明显更好,即使它只使用一个核心.

real        0m4.705s
user        0m4.660s
sys         0m0.010s

我的系统是双核的.我知道random_r是线程安全的,也相当确定它是非阻塞的.如果把同一程序中的random_r换成余弦和正弦计算,双线程版本的运行时间大约是单线程版本的1/2,符合预期.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of worker threads created by main; the iteration count is split evenly among them. */
#define NTHREADS 2
/* Size in bytes of each generator's state buffer, as passed to initstate_r. */
#define PRNG_BUFSZ 8
/* Total number of random_r calls across all threads. */
#define ITERATIONS 1000000000

/*
 * Thread entry point: draws ITERATIONS / NTHREADS pseudo-random numbers from
 * the generator state it is given and prints the last value drawn.
 *
 * arg: pointer to this thread's struct random_data, which the creator must
 *      already have initialised with initstate_r.
 * Returns NULL; the thread produces no result for pthread_join.
 */
void* thread_run(void* arg) {
    struct random_data* state = (struct random_data*)arg;
    /* r1 starts at 0 so the printf below is defined even for zero iterations. */
    int r1 = 0,i,totalIterations = ITERATIONS / NTHREADS;
    for (i = 0; i < totalIterations; i++){
        random_r(state,&r1);
    }
    printf("%i\n",r1);
    /* A start routine must return a value; falling off the end is undefined. */
    return NULL;
}

/*
 * Starts NTHREADS worker threads, each with its own random_r generator state,
 * waits for all of them and releases the state.
 *
 * The per-thread generator states and state buffers are deliberately left
 * packed next to each other (buffers are PRNG_BUFSZ bytes apart); this is the
 * layout whose multi-threaded timing is being measured.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if allocation, generator
 * initialisation or thread creation fails.
 */
int main(int argc,char** argv) {
    /* calloc zero-fills the states, so each one is zeroed before initstate_r sees it. */
    struct random_data* rand_states = calloc(NTHREADS,sizeof *rand_states);
    char* rand_statebufs = calloc(NTHREADS,PRNG_BUFSZ);
    pthread_t* thread_ids = calloc(NTHREADS,sizeof *thread_ids);
    int started = 0;
    int status = EXIT_SUCCESS;
    int t;

    (void)argc;
    (void)argv;

    if (rand_states == NULL || rand_statebufs == NULL || thread_ids == NULL) {
        fprintf(stderr,"out of memory\n");
        status = EXIT_FAILURE;
        goto cleanup;
    }

    /* create threads */
    for (t = 0; t < NTHREADS; t++) {
        /* Each thread owns its own PRNG_BUFSZ-byte slice; indexing by t alone
           would make neighbouring buffers overlap. */
        char* statebuf = &rand_statebufs[t * PRNG_BUFSZ];

        if (initstate_r(random(),statebuf,PRNG_BUFSZ,&rand_states[t]) != 0) {
            fprintf(stderr,"initstate_r failed for thread %d\n",t);
            status = EXIT_FAILURE;
            break;
        }
        if (pthread_create(&thread_ids[t],NULL,&thread_run,&rand_states[t]) != 0) {
            fprintf(stderr,"pthread_create failed for thread %d\n",t);
            status = EXIT_FAILURE;
            break;
        }
        started++;
    }

    /* Join only the threads that were actually started. */
    for (t = 0; t < started; t++) {
        pthread_join(thread_ids[t],NULL);
    }

cleanup:
    free(thread_ids);
    free(rand_states);
    free(rand_statebufs);
    return status;
}

我很困惑：为什么在生成随机数时,双线程版本的性能比单线程版本差这么多？random_r本来就是为多线程应用程序设计的.

解决方法

一个非常简单的改动,就是把各线程的数据在内存中隔开存放：

/*
 * Same setup as in the question, but with 64 times as many elements allocated
 * so that thread t uses rand_states[t*64] and the state buffer starting at
 * rand_statebufs[t*64], which spaces the per-thread data out in memory.
 */
struct random_data* rand_states = (struct random_data*)calloc(NTHREADS * 64,sizeof(struct random_data));
char* rand_statebufs = (char*)calloc(NTHREADS*64,PRNG_BUFSZ);
pthread_t* thread_ids;
int t = 0;
thread_ids = (pthread_t*)calloc(NTHREADS,sizeof(pthread_t));
/* create threads */
for (t = 0; t < NTHREADS; t++) {
    /* initstate_r takes the state buffer, its length, and the generator state. */
    initstate_r(random(),&rand_statebufs[t*64],PRNG_BUFSZ,&rand_states[t*64]);
    /* Default attributes; the thread runs thread_run on its own generator state. */
    pthread_create(&thread_ids[t],NULL,&thread_run,&rand_states[t*64]);
}

导致我的双核机器运行时间快得多.

这就证实了该改动想要验证的猜测 – 两个独立的线程在修改位于同一高速缓存行上的值,因此产生了高速缓存争用. 如果你还不了解这一点,Herb Sutter的‘machine architecture – what your programming language never told you’ talk值得一看,他在大约1:20处开始演示伪共享(false sharing).

先确定你的缓存行大小,然后让每个线程的数据都按缓存行大小对齐.

将所有线程的数据放入一个结构体中,这样做比较简单：

/* Assumed cache line size in bytes; used below to size the padding of struct thread_data. */
#define CACHE_LINE_SIZE 64

/*
 * All data belonging to one thread's generator: the generator state, its
 * state buffer, and padding. The padding length is chosen so that the three
 * members add up to CACHE_LINE_SIZE bytes.
 * NOTE(review): the padding expression needs
 * sizeof(struct random_data) + PRNG_BUFSZ < CACHE_LINE_SIZE - confirm this
 * holds on the target platform.
 */
struct thread_data {
    /* Generator state passed to initstate_r and to the worker thread. */
    struct random_data random_data;
    /* State buffer handed to initstate_r for this generator. */
    char statebuf[PRNG_BUFSZ];
    /* Filler only; never read or written by the visible code. */
    char padding[CACHE_LINE_SIZE - sizeof ( struct random_data )-PRNG_BUFSZ];
};

int main ( int argc,char** argv )
{
    printf ( "%zd\n",sizeof ( struct thread_data ) );

    void* apointer;

    if ( posix_memalign ( &apointer,sizeof ( struct thread_data ),NTHREADS * sizeof ( struct thread_data ) ) )
        exit ( 1 );

    struct thread_data* thread_states = apointer;

    memset ( apointer,NTHREADS * sizeof ( struct thread_data ) );

    pthread_t* thread_ids;

    int t = 0;

    thread_ids = ( pthread_t* ) calloc ( NTHREADS,sizeof ( pthread_t ) );

    /* create threads */
    for ( t = 0; t < NTHREADS; t++ ) {
        initstate_r ( random(),thread_states[t].statebuf,&thread_states[t].random_data );
        pthread_create ( &thread_ids[t],&thread_states[t].random_data );
    }

    for ( t = 0; t < NTHREADS; t++ ) {
        pthread_join ( thread_ids[t],NULL );
    }

    free ( thread_ids );
    free ( thread_states );
}

CACHE_LINE_SIZE 为 64 时：

refugio:$gcc -O3 -o bin/nixuz_random_r src/nixuz_random_r.c -lpthread
refugio:$time bin/nixuz_random_r 
64
63499495
944240966

real    0m1.278s
user    0m2.540s
sys 0m0.000s

或者您可以把填充加大到缓存行大小的两倍,并改用malloc – 额外的填充能保证被修改的内存落在不同的缓存行上,因为malloc返回的内存是按16字节(IIRC)而不是64字节对齐的.

(上面的时间之所以这么短,是因为我把ITERATIONS减小到了原来的十分之一,并不是因为我的机器快得离谱)

c – 多线程random_r比单线程版本慢

解决方法

猜你在找的C&C++相关文章