当前位置:编程学习 > 网站相关 >>

cuda初学(1):稀疏矩阵向量乘法(单精度)

初步学习CUDA编程,实现简单稀疏矩阵向量乘法运算,由于硬件限制,目前只测试了单精度程序

GPU计算子程序gpu_fmmv.cu:

#include <stdio.h>
#include <stdlib.h>
// CUDA-C includes
#include <cuda_runtime.h>

#ifdef __cplusplus
// Give the entry point C linkage when this file is compiled as C++, so the
// exported symbol name is not C++-mangled.
extern "C" {
#endif
//   For Fortran interface  //
// GPU_fmmv expands to gpu_fmmv_. The Fortran driver invokes the routine as
// `call gpu_fmmv(...)`; presumably the Fortran compiler appends the trailing
// underscore to external names -- confirm for the compiler in use.
#define GPU_fmmv gpu_fmmv_
// Arguments, in order: neq, numcol, ia, a, v, w. Every argument is a pointer
// because the caller hands over its variables and arrays by address.
extern void GPU_fmmv(int *, int *, int *, float *, float *, float *);

#ifdef __cplusplus
  }
#endif

// Number of threads per block used when the fmmv kernel is launched.
#define THREAD_NUM 512

/*
 * Sparse matrix-vector product w = A * v, single precision, one thread per row.
 *
 *   neq     device pointer to the number of rows
 *   numcol  row offsets, neq+1 entries, 0-based: row r owns entries
 *           numcol[r] .. numcol[r+1]-1 of ia / a
 *   ia      column index of each stored entry, 1-based
 *   a       value of each stored entry
 *   v       input vector, indexed by (column index - 1)
 *   w       output vector, neq entries; every entry is overwritten
 *
 * Launch layout: 1-D grid of 1-D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads, so the
 * rows are covered whatever the launch configuration is. No shared memory.
 */
__global__ static void fmmv(const int *neq, const int *numcol, const int *ia,
                            const float *a, const float *v, float *w)
{
   // Read the row count once instead of on every loop test.
   const int nrows  = *neq;
   const int stride = blockDim.x * gridDim.x;

   for (int row = blockIdx.x * blockDim.x + threadIdx.x; row < nrows; row += stride) {
      // Accumulate locally: one global store per row instead of a
      // read-modify-write of w[row] for every stored entry.
      float sum = 0.0f;
      const int last = numcol[row + 1];
      for (int num = numcol[row]; num < last; ++num) {
         // ia holds 1-based column numbers; shift to 0-based for v.
         sum += a[num] * v[ia[num] - 1];
      }
      w[row] = sum;
   }
}

/* Reports a failed CUDA runtime call on stderr; returns 1 on failure, 0 on success. */
static int fmmv_cuda_failed(cudaError_t status, const char *what)
{
   if (status == cudaSuccess) return 0;
   fprintf(stderr, "GPU_fmmv: %s failed: %s\n", what, cudaGetErrorString(status));
   return 1;
}

/*
 * Host entry point (called from Fortran): computes w = A * v on the GPU.
 *
 *   neqi    pointer to the number of rows (neq)
 *   numcol  neq+1 row offsets, 0-based; numcol[neq] is the number of stored
 *           entries (nnz)
 *   ia      nnz column indices, 1-based
 *   a       nnz matrix values
 *   v       input vector, neq entries are copied to the device
 *   w       output vector, neq entries are written on success
 *
 * Device buffers are allocated, filled, used and released inside this call.
 * If any CUDA call fails, a message is printed to stderr, w is left
 * untouched, and every buffer that was allocated is still released.
 */
void GPU_fmmv(int *neqi, int *numcol, int *ia, float *a, float *v, float *w)
{
   /* Start from NULL so the cleanup at the end is safe after a partial failure. */
   int *gpu_neq = NULL;
   int *gpu_ia = NULL, *gpu_numcol = NULL;
   float *gpu_a = NULL, *gpu_v = NULL, *gpu_w = NULL;

   const int neq = *neqi;
   if (neq <= 0) return;               /* nothing to compute */

   const int nnz = numcol[neq];
   if (nnz < 0) {
      fprintf(stderr, "GPU_fmmv: invalid row offsets (numcol[neq] = %d)\n", nnz);
      return;
   }

   const size_t offsets_bytes = sizeof(int)   * ((size_t)neq + 1);
   const size_t index_bytes   = sizeof(int)   * (size_t)nnz;
   const size_t value_bytes   = sizeof(float) * (size_t)nnz;
   const size_t vector_bytes  = sizeof(float) * (size_t)neq;

   /* Allocate device buffers, then upload the inputs. The || chain stops at
      the first failing call. */
   int failed =
         fmmv_cuda_failed(cudaMalloc((void **) &gpu_neq, sizeof(int)), "cudaMalloc(neq)")
      || fmmv_cuda_failed(cudaMalloc((void **) &gpu_numcol, offsets_bytes), "cudaMalloc(numcol)")
      || fmmv_cuda_failed(cudaMalloc((void **) &gpu_ia, index_bytes), "cudaMalloc(ia)")
      || fmmv_cuda_failed(cudaMalloc((void **) &gpu_a, value_bytes), "cudaMalloc(a)")
      || fmmv_cuda_failed(cudaMalloc((void **) &gpu_v, vector_bytes), "cudaMalloc(v)")
      || fmmv_cuda_failed(cudaMalloc((void **) &gpu_w, vector_bytes), "cudaMalloc(w)")
      || fmmv_cuda_failed(cudaMemcpy(gpu_neq, &neq, sizeof(int), cudaMemcpyHostToDevice), "copy of neq")
      || fmmv_cuda_failed(cudaMemcpy(gpu_numcol, numcol, offsets_bytes, cudaMemcpyHostToDevice), "copy of numcol")
      || fmmv_cuda_failed(cudaMemcpy(gpu_ia, ia, index_bytes, cudaMemcpyHostToDevice), "copy of ia")
      || fmmv_cuda_failed(cudaMemcpy(gpu_a, a, value_bytes, cudaMemcpyHostToDevice), "copy of a")
      || fmmv_cuda_failed(cudaMemcpy(gpu_v, v, vector_bytes, cudaMemcpyHostToDevice), "copy of v");

   if (!failed) {
      /* One block of THREAD_NUM threads; the kernel loops so that these
         threads cover all neq rows. */
      fmmv<<<1,THREAD_NUM,0>>>(gpu_neq, gpu_numcol, gpu_ia, gpu_a, gpu_v, gpu_w);

      /* A bad launch is only visible through cudaGetLastError(); faults
         during execution surface at the blocking copy that follows. */
      failed = fmmv_cuda_failed(cudaGetLastError(), "fmmv launch")
            || fmmv_cuda_failed(cudaMemcpy(w, gpu_w, vector_bytes, cudaMemcpyDeviceToHost), "copy of w");
   }

   /* Release everything; freeing a NULL pointer is a no-op. */
   cudaFree(gpu_neq);
   cudaFree(gpu_numcol);
   cudaFree(gpu_ia);
   cudaFree(gpu_a);
   cudaFree(gpu_v);
   cudaFree(gpu_w);
}

主调用程序testgpu.f:

      implicit real*4(a-h, o-z)

      dimension numcol(1000), ia(10000)
      dimension a(10000), v(1000), w(1000)
      character*12 fname

      numarg = 0
      numarg = numarg + 1
      call getarg(numarg, fname)
      print *,'fname : ', fname
      open(21,file=fname,form='formatted',status='old')
      read(21,*) neq
      read(21,*) (numcol(i),i=1,neq+1)
      do i=1, neq
         read(21,*) (ia(j),j=numcol(i)+1, numcol(i+1))
      enddo
      do i=1,neq
         read(21,*) (a(j),j=numcol(i)+1, numcol(i+1))
      enddo
      read(21,*) (v(i),i=1,neq)
      close(21)

      call gpu_fmmv(neq, numcol, ia, a, v, w)

      print *,'w ======'
      print *,(w(i),i=1,neq)

      stop 0000
      end

编译命令:

# Compile the CUDA source to an object file.
nvcc -c gpu_fmmv.cu

# Link with the Fortran driver. Libraries come after the object that needs
# them; -lstdc++ is added because gpu_fmmv.o was built by a C++ compiler.
ifort -o fmvv gpu_fmmv.o testgpu.f -L/usr/local/cuda/lib64 -lcudart -lstdc++


摘自  xlsp的专栏
 
补充:综合编程 , 其他综合 ,
CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部分技术文章来自网络,