Optimization Strategies for CUDA Reduction Operations




Problem Description

Complete the parallel optimization of a column-wise sum-reduction program. Main requirements:
• Only a single Tesla K40m GPU may be used for the computation.
• Measure the speedup on the several representative problem sizes listed in the attached slurm script.
Analysis of the original code: (figure in the original post, not reproduced here)

Running the code: after entering the directory, first load the CUDA module. Note in particular that version 9.0 must be loaded with:
module add cuda/9.0
Then build with make. If you run make first, or build with the wrong CUDA version loaded, the failed build leaves intermediate files behind; reloading CUDA 9.0 and rebuilding can then still fail, most likely because those stale intermediate files interfere with compilation. In that case run make clean to delete the generated .o files and rebuild. Once run, the original version prints the reduction times for matrices of several shapes: the CPU time, the GPU time, and the kernel time.
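A typical clean build-and-run sequence therefore looks like this (a sketch assembled from the Makefile and run.slurm shown below; run it on a GPU node or submit via sbatch):

module add cuda/9.0    # must be loaded before the first make
make clean             # remove stale .o files from any failed build
make                   # produces the col_reduce binary
./col_reduce 160000 8  # one of the sizes exercised by run.slurm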

Code Description

The only file that differs between the versions below is col_reduce.cu; the rest of the code is unchanged.
Makefile

CC = gcc
CFLAGS = -std=c99

NVCC = nvcc
NVCC_FLAGS = --gpu-architecture=sm_35 -std=c++11 -O3 -Wno-deprecated-gpu-targets

LIBRARIES = -L/mnt/lustrefs/softwares/cuda/9.0/lib64 -lcudart -lm

col_reduce: main.o cpu_col_reduce.o gpu_col_reduce.o 
	$(CC) $^ -o $@ $(LIBRARIES)

main.o: main.c
	$(CC) $(CFLAGS) -c $^ -o $@

cpu_col_reduce.o: col_reduce.c
	$(CC) $(CFLAGS) -c $^ -o $@

gpu_col_reduce.o: col_reduce.cu
	$(NVCC) $(NVCC_FLAGS) -c $^ -o $@

clean:
	rm -f *.o col_reduce
	

col_reduce.c

// mat: [m, n], row-major
// vec: [n]
void cpu_mat_column_reduce(double* mat, double* vec, int m, int n) {
  for (int i = 0; i < n; i++) {    // for each of the n columns
    double sum = 0.0;
    for (int j = 0; j < m; j++) {  // sum down the m rows
      sum += mat[j * n + i];
    }
    vec[i] = sum;
  }
}
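
Since mat is stored row-major, the inner loop above strides n doubles between consecutive accesses. A cache-friendlier CPU variant is sketched below (my addition, not part of the original code; the function name is hypothetical): swapping the loops walks each row contiguously.

void cpu_mat_column_reduce_rowwise(double* mat, double* vec, int m, int n) {
  for (int i = 0; i < n; i++) vec[i] = 0.0;
  for (int j = 0; j < m; j++) {    // walk row j contiguously
    for (int i = 0; i < n; i++) {
      vec[i] += mat[j * n + i];    // accumulate into the running column sums
    }
  }
}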


col_reduce.h

#ifndef COL_REDUCE_H
#define COL_REDUCE_H

#include <stdio.h>
#include <stdlib.h>   // malloc, exit, srand, rand
#include <time.h>
#include <sys/time.h>

#define SEED 0

void cpu_mat_column_reduce(double* mat, double* vec, int m, int n);
void gpu_mat_column_reduce(double* mat, double* vec, int m, int n);
// Returns a randomized matrix containing mxn elements
static inline double *rand_mat(int m, int n) {
  srand(SEED);
  double *mat = (double *) malloc(m * n * sizeof(double));
  if (mat == NULL) {
    printf("Error allocating CPU memory");
    exit(1);
  }
  for(int i = 0; i < m; i++){
    for(int j = 0; j < n; j++){
      mat[i * n + j] = (double)(rand() % 10) / 100000.0;
    }
  }
  return mat;
}

// Returns an uninitialized m x n matrix
static inline double *raw_mat(int m, int n) {
  double *mat = (double *) malloc(m * n * sizeof(double));
  if (mat == NULL) {
    printf("Error allocating CPU memory");
    exit(1);
  }
  return mat;
}

// Returns a zero-initialized vector of m elements
static inline double *zero_vec(int m) {
  double *vec = (double *) malloc(m * sizeof(double));
  if (vec == NULL) {
    printf("Error allocating CPU memory");
    exit(1);
  }
  for (int i = 0; i < m; i++) {
    vec[i] = 0.0;
  }
  return vec;
}

// Returns the current time in microseconds
static inline long long start_timer() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec*1000000 + tv.tv_usec;
}

// Prints the time elapsed since the specified time
static inline long long stop_timer(long long start_time, char *name) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  long long end_time = tv.tv_sec*1000000 + tv.tv_usec;
  printf("%s: %f milisec\n", name, ((double) (end_time - start_time)) / 1000);
  return end_time - start_time;
}

#endif

main.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "col_reduce.h"

int main(int argc, char **argv){

  int m = 3;  // default value
  int n = 3;  // default value
  if (argc == 3) {
    m = atoi(argv[1]); // user-specified value
    n = atoi(argv[2]); // user-specified value
  }
  printf("\nDo column reduce on mat m: %d n: %d ...\n", m, n);
  double* mat = rand_mat(m, n);
  double* cpu_vec = zero_vec(n);
  double* host_vec = zero_vec(n);
  
// CPU
  long long cpu_start_time = start_timer();
  cpu_mat_column_reduce(mat, cpu_vec, m, n);
  long long cpu_time = stop_timer(cpu_start_time, "CPU");

// GPU  
  long long gpu_start_time = start_timer();
  gpu_mat_column_reduce(mat, host_vec, m, n);
  long long gpu_time = stop_timer(gpu_start_time, "GPU");

  // Check the correctness of the GPU results
  int num_wrong = 0;
  for (int i = 0; i < n; i++) {
    if (fabs(cpu_vec[i] - host_vec[i]) > 0.001) {
      num_wrong++;
      printf("cpu vec: %f, host vec: %f \n", cpu_vec[i], host_vec[i]);
    }
  }
  // Report the correctness results
  if (num_wrong) printf("GPU %d / %d values incorrect\n", num_wrong, n);
  else           printf("Passed, GPU all values correct!\n");
  free(mat);
  free(cpu_vec);
  free(host_vec);
}

run.slurm

#!/bin/bash

#SBATCH -o job_%j_%N.out
#SBATCH --partition=gpu
#SBATCH -J col_reduce
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH -t 20:00

module add cuda/9.0
./col_reduce 160000 8
./col_reduce 1600000 8
./col_reduce 6400000 8
./col_reduce 160000 32
./col_reduce 1600000 32
./col_reduce 6400000 32
./col_reduce 160000 64
./col_reduce 1600000 64


col_reduce.cu

#include <cuda.h>
#include <stdio.h>
#include <math.h>

#define ThreadNum 128

// Hardware atomicAdd on double only exists from compute capability 6.0 on;
// the Tesla K40m is sm_35, so a CAS-based software fallback is compiled in.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
static __inline__ __device__ double atomicAdd(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val == 0.0)
    return __longlong_as_double(old);
  do {
    assumed = old;
    // install old + val only if no other thread updated the address meanwhile
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

extern "C" void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n);

__global__
void gpu_mat_col_reduce_kernel(double* mat, double* vec, int m, int n){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < n){
    double sum = 0.0;
    for (int j = 0; j < m; j++) {  // sum down the m rows of column i
      sum +=  mat[j * n + i];
    }
    vec[i] = sum;
  }
}

void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n) {
  double *d_M, *d_vec;
  size_t size_of_double = sizeof(double);
  size_t size_M = m * n * size_of_double;
  size_t size_vec = n * size_of_double;

// malloc and memcpy
  cudaMalloc((void**)&d_M, size_M);
  cudaMalloc((void**)&d_vec, size_vec);
  cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
  cudaMemcpy(d_vec, h_vec, size_vec, cudaMemcpyHostToDevice);

// timing   
  cudaEvent_t start, stop;
  float elapsed_time = 0.0;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

// running
  dim3 grid_dim(int(ceil((double)n/ThreadNum)), 1, 1);
  dim3 block_dim(ThreadNum, 1, 1);
  gpu_mat_col_reduce_kernel<<<grid_dim, block_dim>>>(d_M, d_vec, m, n);

// timing  
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsed_time, start, stop);
// memcpy   
  cudaMemcpy(h_vec, d_vec, size_vec, cudaMemcpyDeviceToHost);

// print res  
  printf("  grid  dim:  %d, %d, %d.\n", grid_dim.x, grid_dim.y, grid_dim.z);
  printf("  block dim: %d, %d, %d.\n", block_dim.x, block_dim.y, block_dim.z);
  printf("  kernel time: %f milisec\n", elapsed_time);
  
 // free 
  cudaFree(d_M);
  cudaFree(d_vec);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
}

Optimized Versions

high1
In the original code above, n is at most 64, and ceil(64 / 128) = 1, so the grid's first dimension is just 1; although blockDim.x is 128, at most 64 of those threads do any work, leaving many threads idle. The original code assigns one thread to sum one whole column. Following that line of thought: can several threads cooperate on one column? The concrete idea is as follows.
Set blockDim.x = 64, so along the x dimension one row of threads covers the columns, and use blockDim.y to split the work within a column. Since this machine limits a block to 1024 threads, set blockDim.y = 16: each column is cut into 16 segments, one thread per segment (for example, with m = 1600000 each of the 16 y-threads sums wid = 100000 rows), and the 16 partial results are then added together. Note: with this scheme each thread sums only a segment of its column, so the partial sums from different threads must all be accumulated into vec, which creates a read-write conflict; the final accumulation therefore uses an atomic operation (atomicAdd) to avoid the race.
col_reduce.cu

#include <cuda.h>
#include <stdio.h>
#include <math.h>

#define ThreadNum 64
#define colnum 16

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
static __inline__ __device__ double atomicAdd(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val==0.0)
    return __longlong_as_double(old);
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val +__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

extern "C" void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n);

__global__
void gpu_mat_col_reduce_kernel(double* mat, double* vec, int m, int n){
  int i = blockIdx.x * blockDim.x + threadIdx.x;    // column index
  // ceil(m / colnum): segment length per y-thread; the original m / colnum
  // silently drops the last rows whenever colnum does not divide m
  int wid = (m + colnum - 1) / colnum;
  int ind = blockIdx.y * blockDim.y + threadIdx.y;  // which segment of the column
  int st = ind * wid, end = min(ind * wid + wid, m);
  if(i < n){
    double sum = 0.0;
    for (int j = st; j < end; j++) {  // sum this thread's row segment
      sum += mat[j * n + i];
    }
    atomicAdd(&vec[i], sum);  // 16 partial sums race on vec[i], so accumulate atomically
  }
}
}

void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n) {
  double *d_M, *d_vec;
  size_t size_of_double = sizeof(double);
  size_t size_M = m * n * size_of_double;
  size_t size_vec = n * size_of_double;

// malloc and memcpy
  cudaMalloc((void**)&d_M, size_M);
  cudaMalloc((void**)&d_vec, size_vec);
  cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
  cudaMemcpy(d_vec, h_vec, size_vec, cudaMemcpyHostToDevice);

// timing   
  cudaEvent_t start, stop;
  float elapsed_time = 0.0;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

// running
  dim3 grid_dim(int(ceil((double)n/ThreadNum)), 1, 1);
  dim3 block_dim(ThreadNum, colnum, 1);
  gpu_mat_col_reduce_kernel<<<grid_dim, block_dim>>>(d_M, d_vec, m, n);

// timing  
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsed_time, start, stop);
// memcpy   
  cudaMemcpy(h_vec, d_vec, size_vec, cudaMemcpyDeviceToHost);

// print res  
  printf("  grid  dim:  %d, %d, %d.\n", grid_dim.x, grid_dim.y, grid_dim.z);
  printf("  block dim: %d, %d, %d.\n", block_dim.x, block_dim.y, block_dim.z);
  printf("  kernel time: %f milisec\n", elapsed_time);
  
 // free 
  cudaFree(d_M);
  cudaFree(d_vec);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
}

Above I have described two optimization approaches (the code for the second, high2, is listed below). I had originally planned to write a shared-memory version as well, but for this kind of reduction I could not see how to apply shared memory. For the high2 optimization, one could also place some of the constants mentioned above in registers for extra speed, but in my tests the effect was not noticeable. Another idea is to allocate more blocks so that each block handles one column; such code is very easy to write, but it raises a question: n is at most 64, so at least 64 blocks are needed, and how many threads should each block get? I tried 1024 threads per block and, for reasons I could not determine at the time, got incorrect results; my guess was that there is an upper limit on the number of threads the GPU will accept. Lowering the per-block thread count made the results correct again, but the speedup was poor, so those numbers are not listed. The table below shows the speedups of the two optimized versions, where ratio1 and ratio2 are the speedups of high1 and high2 over the original version.
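
For what it is worth, a shared-memory version is possible for this column reduction if each block owns one whole column. Below is a minimal sketch (my addition, not part of the original submission; the kernel name, BLOCK_SIZE, and the <<<n, BLOCK_SIZE>>> launch are assumptions):

#define BLOCK_SIZE 256   // hypothetical block size; any power of two <= 1024 works

__global__
void gpu_mat_col_reduce_smem_kernel(double* mat, double* vec, int m, int n) {
  __shared__ double sdata[BLOCK_SIZE];
  int col = blockIdx.x;                        // one block per column
  int tid = threadIdx.x;
  double sum = 0.0;
  for (int j = tid; j < m; j += BLOCK_SIZE) {  // block-stride loop over the rows
    sum += mat[j * n + col];
  }
  sdata[tid] = sum;
  __syncthreads();
  for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {  // tree reduction in shared memory
    if (tid < s) sdata[tid] += sdata[tid + s];
    __syncthreads();
  }
  if (tid == 0) vec[col] = sdata[0];           // no atomics: one writer per column
}

// launch: gpu_mat_col_reduce_smem_kernel<<<n, BLOCK_SIZE>>>(d_M, d_vec, m, n);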
[Table: speedups ratio1 (high1) and ratio2 (high2) relative to the original version; image in the original post]
high2
col_reduce.cu

#include <cuda.h>
#include <stdio.h>
#include <math.h>

#define gridnum 8
#define blocknum 8
#define colnum 128

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
static __inline__ __device__ double atomicAdd(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val==0.0)
    return __longlong_as_double(old);
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val +__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

extern "C" void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n);

__global__
void gpu_mat_col_reduce_kernel(double* mat, double* vec, int m, int n){
  int fen = n / blockDim.x;          // columns handled per block
  int H = blockDim.y;                // y-threads per column
  int V = blockDim.x / fen;          // x-threads per column
  int i;                             // column index for this thread
  if (fen > 1 && fen < blockDim.x){
    // pair adjacent x-threads onto the same column (matches fen = blockDim.x / 2,
    // i.e. the n = 32 case from run.slurm; other ratios would mis-assign columns)
    if (threadIdx.x % 2 == 0) i = blockIdx.x * fen + threadIdx.x / 2;
    else                      i = blockIdx.x * fen + (threadIdx.x - 1) / 2;
  }
  else {
    i = blockIdx.x * fen + threadIdx.x % fen;
  }

  int wid = (int)(ceil((double)m / (H * V)));      // rows per thread
  int thread = threadIdx.x % V + threadIdx.y * V;  // index among the H*V threads of column i

  int jst = thread * wid, jend = min((thread + 1) * wid, m);

  if(i < n){
    double sum = 0.0;
    for (int j = jst; j < jend; j++) {  // sum this thread's row segment
      sum += mat[j * n + i];
    }
    atomicAdd(&vec[i], sum);  // combine the H*V partial sums per column
  }
}

void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n) {
  double *d_M, *d_vec;
  size_t size_of_double = sizeof(double);
  size_t size_M = m * n * size_of_double;
  size_t size_vec = n * size_of_double;

// malloc and memcpy
  cudaMalloc((void**)&d_M, size_M);
  cudaMalloc((void**)&d_vec, size_vec);
  cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
  cudaMemcpy(d_vec, h_vec, size_vec, cudaMemcpyHostToDevice);

// timing   
  cudaEvent_t start, stop;
  float elapsed_time = 0.0;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

// running
  dim3 grid_dim(gridnum , 1, 1);
  dim3 block_dim(blocknum, colnum, 1);
  gpu_mat_col_reduce_kernel<<<grid_dim, block_dim>>>(d_M, d_vec, m, n);

// timing  
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsed_time, start, stop);
// memcpy   
  cudaMemcpy(h_vec, d_vec, size_vec, cudaMemcpyDeviceToHost);

// print res  
  printf("  grid  dim:  %d, %d, %d.\n", grid_dim.x, grid_dim.y, grid_dim.z);
  printf("  block dim: %d, %d, %d.\n", block_dim.x, block_dim.y, block_dim.z);
  printf("  kernel time: %f milisec\n", elapsed_time);
  
 // free 
  cudaFree(d_M);
  cudaFree(d_vec);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
}
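
To make the index scheme concrete, here is my own walkthrough for one of the tested sizes (not text from the original post):

// Worked example for ./col_reduce 1600000 32 (n = 32, blocknum = 8, colnum = 128):
//   fen = 32 / 8 = 4 columns per block, V = 8 / 4 = 2 x-threads per column, H = 128,
//   so each column is covered by H * V = 256 threads;
//   wid = ceil(1600000 / 256) = 6250 rows summed per thread,
//   and the 256 partial sums per column are merged into vec[i] with atomicAdd.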

Our tests show that different thread organizations, sensibly combined, can speed the algorithm up substantially. They also show that the number of threads one can request is limited; in particular, there is an upper bound on the threads per block. Not knowing this at first, I requested a huge number of threads per block, and the kernel never actually launched on the GPU.
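
Such a failed launch is silent in the code above. A minimal check right after the kernel call (my addition, not in the original code) would have surfaced it:

  // right after the kernel launch in gpu_mat_column_reduce:
  cudaError_t err = cudaGetLastError();  // reports e.g. cudaErrorInvalidConfiguration
  if (err != cudaSuccess) {
    printf("kernel launch failed: %s\n", cudaGetErrorString(err));
  }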
Our experiments also make it clear that changing the thread scheduling in this way only shrinks the kernel time; its effect on the overall GPU time is much less pronounced, since that time (as measured in main) also includes the cudaMalloc calls and the host-device transfers, which these changes do not touch. Reducing it further would require reworking how the data is laid out and accessed in memory so that threads can reach it faster, which is considerably harder.

Source: https://blog.csdn.net/forrestguang/article/details/118418681
