同样代码用OpenMP并行 在PC(Ubuntu)和Mac上不同效率的问题
我写了个用CG法解方程式组的c程序
运算量不大 矩阵是2000@2000的 用openmp 之后 用ompgetwtime测时间
在pc上ubuntu里面明显2线程比不用openmp快将近1.6倍
但是为什么同样的程序在Mac上用了openmp之后2线程花的时间比不用openmp还多呢 4线程也只能和openmp的时候大概持平
想知道为什么 完全相同的代码
编译器都是GCC
代码如下(在日本读书 所以有些日语 但不影响理解)
#include <stdio.h> #include <stdlib.h> #include <math.h> #include <time.h> #include <omp.h> #define N 2000 #define MAX 100000 #define EPS (1.0e-10) #define TIMER 10 int cgm(double **amat, double *bvec, double *xvec, double *resh, int ndata, double eps, int *n_his, int i_max){ int i, j, n; double *pvec, *apvec, *axvec, *rvec; double alpha, beta, r_norm_old, r_norm_new, pap_norm, b_norm; double start_time, stop_time; pvec = (double *)malloc(sizeof(double)*ndata); apvec = (double *)malloc(sizeof(double)*ndata); axvec = (double *)malloc(sizeof(double)*ndata); rvec = (double *)malloc(sizeof(double)*ndata); start_time = omp_get_wtime(); // //Setting initial values // #pragma omp parallel for for (i=0; i<ndata; i++){ xvec[i] = 0.0; } r_norm_old = 0.0; b_norm = 0.0; #pragma omp parallel for private(j) reduction(+:r_norm_old,b_norm) for (i=0; i<ndata; i++){ axvec[i] = 0.0; pvec[i] = 0.0; for (j=0; j<ndata; j++){ axvec[i] += amat[i][j]*xvec[j]; } rvec[i] = bvec[i] - axvec[i]; b_norm += bvec[i] * bvec[i]; r_norm_old += rvec[i]*rvec[i]; } beta = 0.0; resh[0] = 1.0; // // Main loop // *n_his = 1; for (n=1; n<i_max; n++){ #pragma omp parallel for for (i=0; i<ndata; i++){ pvec[i] = rvec[i] + beta*pvec[i]; } pap_norm = 0.0; #pragma omp parallel for private(i,j) reduction(+:pap_norm) for (i=0; i<ndata; i++){ apvec[i] = 0.0; for (j=0; j<ndata; j++){ apvec[i] += amat[i][j]*pvec[j]; } pap_norm += pvec[i]*apvec[i]; } alpha = r_norm_old/pap_norm; r_norm_new = 0.0; #pragma omp parallel for reduction(+:r_norm_new) for (i=0; i<ndata; i++){ xvec[i] += alpha*pvec[i]; rvec[i] -= alpha*apvec[i]; r_norm_new += rvec[i]*rvec[i]; } beta = r_norm_new/r_norm_old; if (sqrt(r_norm_new) < eps*sqrt(b_norm)){ // for (i=0; i<ndata; i++){ // printf("xvec[%d] = %12.6e \n", i, xvec[i]); // } resh[*n_his] = sqrt(r_norm_new)/sqrt(b_norm); for(i=0;i<N;i++){ free(amat[i]); } free(amat); stop_time = omp_get_wtime(); printf("elapsed time=%lf s\n", stop_time - start_time); return 0; }else{ resh[*n_his] = sqrt(r_norm_new)/sqrt(b_norm); (*n_his)++; r_norm_old = r_norm_new; } } for(i=0;i<N;i++){ free(amat[i]); } free(amat); return -1; } void setup(double** a,double *x_setup,double *x,double *b){ int i,j; double vxm; for(i=0;i<N;i++){ for(j=0;j<N;j++){ if(j>i) a[i][j]=2*(i+1)-1; if(j<i||j==i) a[i][j]=2*(j+1)-1; } } for(i=0;i<N;i++){ x_setup[i]=1.0; x[i]=0.0; } for(i=0;i<N;i++){ vxm=0; for(j=0;j<N;j++){ vxm+=a[i][j]*x_setup[j]; } b[i]=vxm; } } int main(int argc,char *argv[]){ int i; double **a; int back; char *s; int threads=1; if(argc>2||argc<=1){ printf("引数の数が違う、Thread数だけ引数として入力してください!\n"); exit(1); }else{ s=argv[1]; threads=atoi(s); printf("設定したThreads数は%dである.\n",threads); } /* set core number */ omp_set_num_threads(threads); a=(double **)malloc(sizeof(double *)*N); for(i=0;i<N;i++){ a[i]=(double *)malloc(sizeof(double)*N); } double x[N]; double x_setup[N]; double b[N]; double* resh = (double*)malloc(sizeof(double) * MAX); int n_his; setup(a,x_setup,x,b); back=cgm(a,b,x,resh,N,EPS,&n_his,MAX); return 0; }
我是美少年
11 years, 7 months ago