
OpenMP Learning Notes


These notes follow the course taught by Tim Mattson, a senior researcher at Intel.

Course video download link (entirely in English, no subtitles):

Link: https://pan.baidu.com/s/1nw6pcRv   Password: aolo

Although quantum computing and Nvidia's CUDA have been getting more and more attention lately, industrial embedded systems are mostly built on ARM devices that cannot bear the cost of Nvidia hardware, so mastering OpenMP, SIMD, and similar techniques is still well worth the effort.

1. What is OpenMP for

OpenMP is a cross-platform, widely used parallel programming model based on shared memory (a shared address space). It provides a set of compiler directives and library routines to parallel application developers, and it supports only Fortran, C, and C++. GCC on Linux supports OpenMP out of the box (mind the version).

You can check whether your toolchain defines the OpenMP macros:

echo | cpp -fopenmp -dM | grep -i open

If OpenMP is available, this prints the _OPENMP macro together with its version date.
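A program can also test the macro itself. Below is a minimal sketch (the file name and messages are my own, not from the course) that reports the OpenMP version and thread count when built with gcc -fopenmp, and still compiles and runs serially without the flag:

#include <stdio.h>
#ifdef _OPENMP
#include <omp.h>
#endif

int main(void)
{
#ifdef _OPENMP
    /* _OPENMP expands to the yyyymm date of the supported OpenMP version. */
    printf("OpenMP version macro: %d\n", _OPENMP);
    #pragma omp parallel
    {
        #pragma omp single
        printf("running with %d threads\n", omp_get_num_threads());
    }
#else
    printf("compiled without -fopenmp: running serially\n");
#endif
    return 0;
}

Build it with gcc -fopenmp check_omp.c (the file name is assumed); without -fopenmp the pragmas are ignored and the serial branch runs. The default thread count can be changed through the OMP_NUM_THREADS environment variable.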
OpenMP has a layered architecture: application code and compiler directives sit on top of the OpenMP runtime library and environment variables, which in turn rest on the operating system's shared-memory and threading support.

(figure: the OpenMP solution stack)

How do threads interact in OpenMP?

OpenMP is a multithreaded, shared-address-space model in which threads exchange information through shared variables. Careless data sharing, however, leads to race conditions: the program's result can change from run to run because the OS schedules the threads in a different order each time. To control races, OpenMP provides synchronization constructs that protect against data conflicts. Synchronization is expensive and eats into performance, so a competent OpenMP programmer arranges the data-access rules so that the least possible amount of synchronization is needed.
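To make this concrete, here is a minimal sketch (the variable names are mine): both counters are incremented once per iteration across the thread team, but only the update protected by atomic is guaranteed to reach the correct total, and that protection has a cost.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    long racy = 0, safe = 0;

    #pragma omp parallel for
    for (long i = 0; i < 1000000; i++) {
        racy++;             /* data race: threads overwrite each other's updates */

        #pragma omp atomic
        safe++;             /* synchronized update: always correct, but slower */
    }

    printf("racy = %ld (varies from run to run), safe = %ld\n", racy, safe);
    return 0;
}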

2. An example: computing PI

Rather than starting with the usual hello-world walkthrough, we go straight to three ways of parallelizing a PI-computation program to get a feel for what OpenMP can do.

The starting point is the identity

   π = ∫₀¹ 4 / (1 + x²) dx

A computer cannot evaluate the continuous integral directly, so it is discretized with a numerical (midpoint-rule) approximation:

   π ≈ Σᵢ 4 / (1 + xᵢ²) · Δx,   where Δx = 1/N and xᵢ = (i − 0.5)·Δx for i = 1 … N

The original serial code:
#include <stdio.h>
#include <omp.h>
static long num_steps = 100000000;
double step;
int main ()
{
	  int i;
	  double x, pi, sum = 0.0;
	  double start_time, run_time;

	  step = 1.0/(double) num_steps;

        	 
	  start_time = omp_get_wtime();

	  for (i=1;i<= num_steps; i++){
		  x = (i-0.5)*step;
		  sum = sum + 4.0/(1.0+x*x);
	  }

	  pi = step * sum;
	  run_time = omp_get_wtime() - start_time;
	  printf("\n pi with %ld steps is %lf in %lf seconds\n ",num_steps,pi,run_time);
}	  

SPMD (Single Program Multiple Data) optimization: every thread runs the same code over its own share of the iterations and keeps a private partial sum, and the partial sums are combined at the end inside an omp critical section.

#include <stdio.h>
#include <omp.h>

#define MAX_THREADS 4

static long num_steps = 100000000;
double step;
int main ()
{
	  int i,j;
	  double pi, full_sum = 0.0;
	  double start_time, run_time;
	  double sum[MAX_THREADS];

	  step = 1.0/(double) num_steps;


for(j=1;j<=MAX_THREADS ;j++){
   omp_set_num_threads(j);
   full_sum = 0.0;
	  start_time = omp_get_wtime();
#pragma omp parallel private(i)
{
	  int id = omp_get_thread_num();
	  int numthreads = omp_get_num_threads();
	  double x;

	  double partial_sum = 0;

#pragma omp single
	  printf(" num_threads = %d",numthreads);

	  for (i=id;i< num_steps; i+=numthreads){
		  x = (i+0.5)*step;
		  partial_sum += 4.0/(1.0+x*x);
	  }
#pragma omp critical
		  full_sum += partial_sum;
}
      
	  pi = step * full_sum;
	  run_time = omp_get_wtime() - start_time;
	  printf("\n pi is %f in %f seconds %d threds \n ",pi,run_time,j);
}
}	  

OpenMP loop-parallelism optimization: note the syntax of the omp for worksharing construct, and the reduction(+:sum) clause, which gives each thread a private copy of sum and combines the copies at the end.

#include <stdio.h>
#include <omp.h>
static long num_steps = 100000000;
double step;
int main ()
{
	  int i;
	  double x, pi, sum = 0.0;
	  double start_time, run_time;

	  step = 1.0/(double) num_steps;
	 for (i=1;i<=4;i++){
          sum = 0.0;
          omp_set_num_threads(i);
	  start_time = omp_get_wtime();
#pragma omp parallel  
{
#pragma omp single
	  printf(" num_threads = %d",omp_get_num_threads());

#pragma omp for private(x) reduction(+:sum)  /* x must be private, otherwise the threads race on it */
	  for (i=1;i<= num_steps; i++){
		  x = (i-0.5)*step;
		  sum = sum + 4.0/(1.0+x*x);
	  }
}
	  pi = step * sum;
	  run_time = omp_get_wtime() - start_time;
	  printf("\n pi is %f in %f seconds and %d threads\n",pi,run_time,i);
}
}	  

Divide-and-Conquer Pattern optimization: OpenMP tasks are a more advanced feature, available since OpenMP 3.0. A sketch of the task-based version is given below.
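A minimal sketch of the recursive, task-based PI computation; the serial cutoff MIN_BLK and the helper function pi_comp are illustrative choices, loosely following the course's recursive solution rather than code from this post:

#include <stdio.h>
#include <omp.h>

static long num_steps = 100000000;
#define MIN_BLK 10000000       /* below this many steps, sum serially (tuning choice) */

double step;

/* Split [ilow, ihigh) in half, sum each half in its own task, then combine. */
double pi_comp(long ilow, long ihigh)
{
    double sum = 0.0, sum1, sum2, x;
    long i;

    if (ihigh - ilow < MIN_BLK) {              /* base case: plain midpoint sum */
        for (i = ilow; i < ihigh; i++) {
            x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }
    } else {                                   /* recursive case: two child tasks */
        long imid = (ilow + ihigh) / 2;
        #pragma omp task shared(sum1)
        sum1 = pi_comp(ilow, imid);
        #pragma omp task shared(sum2)
        sum2 = pi_comp(imid, ihigh);
        #pragma omp taskwait                   /* wait for both halves before adding them */
        sum = sum1 + sum2;
    }
    return sum;
}

int main(void)
{
    double sum, pi, start;

    step = 1.0 / (double) num_steps;
    start = omp_get_wtime();

    #pragma omp parallel
    {
        #pragma omp single                     /* one thread starts the recursion; tasks fan out */
        sum = pi_comp(0, num_steps);
    }

    pi = step * sum;
    printf("pi = %lf in %lf seconds\n", pi, omp_get_wtime() - start);
    return 0;
}

The cutoff keeps task-creation overhead from swamping the useful work, the same idea as the n < 20 cutoff in the Fibonacci example further down.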


3. Other examples

Optimizing the producer-consumer problem

The original serial code:

#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        10000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
   int i; 
   for (i=0;i<length;i++) {
     randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
     *(a+i) = ((double) randy)/((double) RAND_MOD);
   }   
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
   int i;  double sum = 0.0;
   for (i=0;i<length;i++)  sum += *(a+i);  
   return sum; 
}
  
int main()
{
  double *A, sum, runtime;
  int flag = 0;

  A = (double *)malloc(N*sizeof(double));

  runtime = omp_get_wtime();

  fill_rand(N, A);        // Producer: fill an array of data

  sum = Sum_array(N, A);  // Consumer: sum the array
   
  runtime = omp_get_wtime() - runtime;

  printf(" In %f seconds, The sum is %f \n",runtime,sum);
}

The parallelized code: a flag variable is used to signal the consumer that the producer has finished filling the array.

#include "omp.h"
#ifndef APPLE
#include <malloc.h>
#endif
#include <stdio.h>
#include <stdlib.h>

#define N        10000
#define Nthreads 2

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
   int i; 
   for (i=0;i<length;i++) {
     randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
     *(a+i) = ((double) randy)/((double) RAND_MOD);
   }   
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
   int i;  double sum = 0.0;
   for (i=0;i<length;i++)  sum += *(a+i);  
   return sum; 
}
  
int main()
{
  double *A, sum, runtime;
  int numthreads, flag = 0,flg_tmp=0;

  omp_set_num_threads(Nthreads);

  A = (double *)malloc(N*sizeof(double));

  #pragma omp parallel
  {
     #pragma omp master
     {
        numthreads = omp_get_num_threads();
        if(numthreads != 2)
        {
           printf("error: incorect number of threads, %d \n",numthreads);
           exit(-1);
        }
        runtime = omp_get_wtime();
     }
     #pragma omp barrier

     #pragma omp sections
     {
        #pragma omp section
        {
           fill_rand(N, A);
           #pragma omp flush
           #pragma omp atomic write
           flag = 1;   /* write the flag atomically, pairing with the atomic read in the consumer */
           #pragma omp flush (flag)
        }
        #pragma omp section
        {
           while (1){
              #pragma omp flush (flag)
			  #pragma omp atomic read
				flg_tmp = flag;
			  if(flg_tmp == 1) break;
           }

           #pragma omp flush 
           sum = Sum_array(N, A);
        }
      }
      #pragma omp master
         runtime = omp_get_wtime() - runtime;
   }  

   printf(" with %d threads and %lf seconds, The sum is %lf \n",numthreads,runtime,sum);
}


Optimizing linked-list traversal with a recursive Fibonacci computation at each node:

The original serial code:

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#ifndef N
#define N 5
#endif
#ifndef FS
#define FS 38
#endif

struct node {
   int data;
   int fibdata;
   struct node* next;
};

int fib(int n) {
   int x, y;
   if (n < 2) {
      return (n);
   } else {
      x = fib(n - 1);
      y = fib(n - 2);
	  return (x + y);
   }
}

void processwork(struct node* p) 
{
   int n;
   n = p->data;
   p->fibdata = fib(n);
}

struct node* init_list(struct node* p) {
    int i;
    struct node* head = NULL;
    struct node* temp = NULL;
    
    head = malloc(sizeof(struct node));
    p = head;
    p->data = FS;
    p->fibdata = 0;
    for (i=0; i< N; i++) {
       temp  =  malloc(sizeof(struct node));
       p->next = temp;
       p = temp;
       p->data = FS + i + 1;
       p->fibdata = i+1;
    }
    p->next = NULL;
    return head;
}

int main(int argc, char *argv[]) {
     double start, end;
     struct node *p=NULL;
     struct node *temp=NULL;
     struct node *head=NULL;
     
	 printf("Process linked list\n");
     printf("  Each linked list node will be processed by function 'processwork()'\n");
     printf("  Each ll node will compute %d fibonacci numbers beginning with %d\n",N,FS);      
 
     p = init_list(p);
     head = p;

     start = omp_get_wtime();
     {
        while (p != NULL) {
		   processwork(p);
		   p = p->next;
        }
     }

     end = omp_get_wtime();
     p = head;
	 while (p != NULL) {
        printf("%d : %d\n",p->data, p->fibdata);
        temp = p->next;
        free (p);
        p = temp;
     }  
	 free (p);

     printf("Compute Time: %f seconds\n", end - start);

     return 0;
}


The parallelized code:

Following advice from the OpenMP forum, the recursion falls back to the plain serial version below a small cutoff (here n < 20); otherwise the overhead of creating tasks actually makes performance worse.

#include <omp.h>
#include <stdlib.h>
#include <stdio.h>


#ifndef N
#define N 5
#endif
#ifndef FS
#define FS 38
#endif

typedef struct node {
   int data;
   int fibdata;
   struct node* next;
}node;

node* init_list(node* p);
void processwork(node* p); 
int fib(int n); 

int fib(int n) 
{
   int x, y;
   if (n < 2) {
      return (n);
   } else {
	if(n<20)
		return fib(n-1)+fib(n-2);
#pragma omp task shared(x)
      x = fib(n - 1);
#pragma omp task shared(y)
      y = fib(n - 2);
#pragma omp taskwait
	  return (x + y);
   }
}

void processwork(node* p) 
{
   int n, temp;
   n = p->data;
   temp = fib(n);

   p->fibdata = temp;

}

node* init_list(node* p) 
{
    int i;
    node* head = NULL;
    node* temp = NULL;
    
    head = (node*)malloc(sizeof(node));
    p = head;
    p->data = FS;
    p->fibdata = 0;
    for (i=0; i< N; i++) {
       temp  = (node*)malloc(sizeof(node));
       p->next = temp;
       p = temp;
       p->data = FS + i + 1;
       p->fibdata = i+1;
    }
    p->next = NULL;
    return head;
}

int main() 
{
     double start, end;
     struct node *p=NULL;
     struct node *temp=NULL;
     struct node *head=NULL;

     printf("Process linked list\n");
     printf("  Each linked list node will be processed by function 'processwork()'\n");
     printf("  Each ll node will compute %d fibonacci numbers beginning with %d\n",N,FS);      

     p = init_list(p);
     head = p;

     start = omp_get_wtime();

	#pragma omp parallel 
	{
            #pragma omp master
                  printf("Threads:      %d\n", omp_get_num_threads());

		#pragma omp single
		{
			p=head;
			while (p) {
				#pragma omp task firstprivate(p) // firstprivate is required: each task must capture its own copy of p
				{
					processwork(p);
				}
			  p = p->next;
		   }
		}
	}

     end = omp_get_wtime();
     p = head;
	 while (p != NULL) {
        printf("%d : %d\n",p->data, p->fibdata);
        temp = p->next;
        free (p);
        p = temp;
     }  
	 free (p);

     printf("Compute Time: %f seconds\n", end - start);

     return 0;
}