同样代码用OpenMP并行 在PC(Ubuntu)和Mac上不同效率的问题


我写了个用CG法解方程式组的c程序
运算量不大 矩阵是2000@2000的 用openmp 之后 用ompgetwtime测时间
在pc上ubuntu里面明显2线程比不用openmp快将近1.6倍
但是为什么同样的程序在Mac上用了openmp之后2线程花的时间比不用openmp还多呢 4线程也只能和openmp的时候大概持平
想知道为什么 完全相同的代码
编译器都是GCC

分割线

代码如下(在日本读书 所以有些日语 但不影响理解)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>

#define N 2000
#define MAX 100000
#define EPS (1.0e-10)
#define TIMER 10

int cgm(double **amat, double *bvec, double *xvec, double *resh, int ndata, double
    eps, int *n_his, int i_max){
  int i, j, n;
  double *pvec, *apvec, *axvec, *rvec;
  double alpha, beta, r_norm_old, r_norm_new, pap_norm, b_norm;
  double start_time, stop_time;
  pvec  = (double *)malloc(sizeof(double)*ndata);
  apvec = (double *)malloc(sizeof(double)*ndata);
  axvec = (double *)malloc(sizeof(double)*ndata);
  rvec  = (double *)malloc(sizeof(double)*ndata);

  start_time = omp_get_wtime();
  //
  //Setting initial values
  //

#pragma omp parallel for
    for (i=0; i<ndata; i++){
      xvec[i] = 0.0;
    }
      r_norm_old  = 0.0;
      b_norm = 0.0;
#pragma omp parallel for private(j) reduction(+:r_norm_old,b_norm)
    for (i=0; i<ndata; i++){
      axvec[i] = 0.0;
      pvec[i] = 0.0;
      for (j=0; j<ndata; j++){
        axvec[i] += amat[i][j]*xvec[j];
      }
      rvec[i] = bvec[i] - axvec[i];
      b_norm += bvec[i] * bvec[i];
      r_norm_old  += rvec[i]*rvec[i];
    }
      beta = 0.0;
      resh[0] = 1.0;

    //
    // Main loop
    //
    *n_his = 1;
    for (n=1; n<i_max; n++){
#pragma omp parallel for
      for (i=0; i<ndata; i++){
        pvec[i] = rvec[i] + beta*pvec[i];
      }

      pap_norm = 0.0;
#pragma omp parallel for private(i,j) reduction(+:pap_norm)
      for (i=0; i<ndata; i++){
        apvec[i] = 0.0;
        for (j=0; j<ndata; j++){
          apvec[i] += amat[i][j]*pvec[j];
        }
        pap_norm += pvec[i]*apvec[i];
      }
      alpha = r_norm_old/pap_norm;
      r_norm_new = 0.0;
#pragma omp parallel for reduction(+:r_norm_new)
      for (i=0; i<ndata; i++){
        xvec[i] += alpha*pvec[i];
        rvec[i] -= alpha*apvec[i];
        r_norm_new += rvec[i]*rvec[i];
      }

      beta = r_norm_new/r_norm_old;

      if (sqrt(r_norm_new) < eps*sqrt(b_norm)){
        //      for (i=0; i<ndata; i++){
        // printf("xvec[%d] = %12.6e \n", i, xvec[i]);
        //      }
        resh[*n_his] = sqrt(r_norm_new)/sqrt(b_norm);
        for(i=0;i<N;i++){
          free(amat[i]);
        }
        free(amat);
      stop_time = omp_get_wtime();
      printf("elapsed time=%lf s\n", stop_time - start_time);
      return 0;

    }else{
      resh[*n_his] = sqrt(r_norm_new)/sqrt(b_norm);
      (*n_his)++;
      r_norm_old = r_norm_new;
    }
  }

  for(i=0;i<N;i++){
    free(amat[i]);
  }
  free(amat);
  return -1;
}
void setup(double** a,double *x_setup,double *x,double *b){
  int i,j;
  double vxm;
  for(i=0;i<N;i++){
    for(j=0;j<N;j++){
      if(j>i)
        a[i][j]=2*(i+1)-1;
      if(j<i||j==i)
        a[i][j]=2*(j+1)-1;
    }
  }
  for(i=0;i<N;i++){
    x_setup[i]=1.0;
    x[i]=0.0;
  }
  for(i=0;i<N;i++){
    vxm=0;
    for(j=0;j<N;j++){
      vxm+=a[i][j]*x_setup[j];
    }
    b[i]=vxm;
  }

}

int main(int argc,char *argv[]){
  int i;
  double **a;
  int back;
  char *s;
  int threads=1;
  if(argc>2||argc<=1){
    printf("引数の数が違う、Thread数だけ引数として入力してください!\n");
    exit(1);
  }else{
    s=argv[1];
    threads=atoi(s);
    printf("設定したThreads数は%dである.\n",threads);
  }
  /* set core number */
  omp_set_num_threads(threads);

  a=(double **)malloc(sizeof(double *)*N);
  for(i=0;i<N;i++){
    a[i]=(double *)malloc(sizeof(double)*N);
  }
  double x[N];
  double x_setup[N];
  double b[N];
  double* resh = (double*)malloc(sizeof(double) * MAX);
  int n_his;
  setup(a,x_setup,x,b);
  back=cgm(a,b,x,resh,N,EPS,&n_his,MAX);
  return 0;
}

openmp macosx 并行

我是美少年 11 years, 7 months ago

環境變量 OMP_NUM_THREADS 是不是沒有定?

#include <omp.h>
#include <stdio.h>
int main() {
#pragma omp parallel
printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
}
wwy1028 answered 11 years, 7 months ago

Your Answer