cuda初学(1):稀疏矩阵向量乘法(单精度)
初步学习CUDA编程,实现简单稀疏矩阵向量乘法运算,由于硬件限制,目前只测试了单精度程序
GPU计算子程序gpu_fmmv.cu:
#include <stdio.h>
#include <stdlib.h>
// CUDA-C includes
#include <cuda_runtime.h>
#ifdef __cplusplus
extern "C" {
#endif
// Fortran interface: the host entry point is exported with C linkage under
// the symbol name gpu_fmmv_. The trailing underscore presumably matches the
// name decoration applied by the Fortran compiler to `call gpu_fmmv(...)`
// -- confirm for the compiler in use. All arguments are pointers.
#define GPU_fmmv gpu_fmmv_
extern void GPU_fmmv(int *, int *, int *, float *, float *, float *);
#ifdef __cplusplus
}
#endif
#define THREAD_NUM 512
/*
 * Sparse matrix-vector product w = A * v for a matrix stored row by row.
 *
 * neq    : device pointer to the number of rows.
 * numcol : row offsets (0-based), neq + 1 entries; the entries of row r are
 *          a[numcol[r]] .. a[numcol[r+1] - 1].
 * ia     : column index of each stored entry, 1-based (hence the "- 1").
 * a      : stored matrix values, same layout as ia.
 * v      : input vector, indexed by column.
 * w      : output vector, one element per row.
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. Rows are distributed
 * with a grid-stride loop, so a <<<1, N>>> launch gives the same row
 * assignment as a fixed stride of N threads. No shared memory is used.
 */
__global__ static void fmmv(int *neq, int *numcol, int *ia, float *a, float *v, float *w)
{
    const int nrows = *neq;                       /* read the row count once */
    const int stride = blockDim.x * gridDim.x;    /* total threads in the launch */

    for (int row = blockIdx.x * blockDim.x + threadIdx.x; row < nrows; row += stride) {
        /* Accumulate in a register and store once, instead of doing a
           read-modify-write on global memory for every matrix entry. */
        float sum = 0.0f;
        const int last = numcol[row + 1];
        for (int num = numcol[row]; num < last; num++) {
            sum += a[num] * v[ia[num] - 1];
        }
        w[row] = sum;
    }
}
/* Report a failed CUDA runtime call on stderr. Returns 1 on failure, 0 on success. */
static int fmmv_cuda_failed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "gpu_fmmv: %s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Host entry point: computes w = A * v on the GPU.
 *
 * neqi   : pointer to the number of rows (neq).
 * numcol : host array of neq + 1 row offsets (0-based); numcol[neq] is the
 *          number of stored entries (nnz).
 * ia     : host array of nnz column indices (1-based).
 * a      : host array of nnz matrix values.
 * v      : host input vector; neq elements are copied to the device, i.e.
 *          the matrix is treated as square.
 * w      : host output vector, neq elements.
 *
 * Every CUDA call is checked. On the first failure a message is written to
 * stderr, the remaining steps are skipped, device memory is released and w
 * is left untouched. A non-positive neq returns immediately.
 */
void GPU_fmmv(int *neqi, int *numcol, int *ia, float *a, float *v, float *w)
{
    int *gpu_neq = NULL;
    int *gpu_ia = NULL, *gpu_numcol = NULL;
    float *gpu_a = NULL, *gpu_v = NULL, *gpu_w = NULL;
    int neq = *neqi;
    int nnz;
    int failed = 0;
    size_t rowptr_bytes, index_bytes, value_bytes, vector_bytes;

    if (neq <= 0) {
        return;                       /* nothing to compute */
    }
    nnz = numcol[neq];
    if (nnz < 0) {
        fprintf(stderr, "gpu_fmmv: invalid number of stored entries %d\n", nnz);
        return;
    }
    if (nnz == 0) {
        /* Empty matrix: the product is the zero vector, no device work needed. */
        for (int i = 0; i < neq; i++) {
            w[i] = 0.0f;
        }
        return;
    }

    rowptr_bytes = sizeof(int) * ((size_t)neq + 1);   /* numcol holds ints, not floats */
    index_bytes  = sizeof(int) * (size_t)nnz;
    value_bytes  = sizeof(float) * (size_t)nnz;
    vector_bytes = sizeof(float) * (size_t)neq;

    /* Malloc space on GPU device (short-circuit skips later calls after a failure) */
    failed = failed || fmmv_cuda_failed(cudaMalloc((void **) &gpu_neq, sizeof(int)), "cudaMalloc(neq)");
    failed = failed || fmmv_cuda_failed(cudaMalloc((void **) &gpu_numcol, rowptr_bytes), "cudaMalloc(numcol)");
    failed = failed || fmmv_cuda_failed(cudaMalloc((void **) &gpu_ia, index_bytes), "cudaMalloc(ia)");
    failed = failed || fmmv_cuda_failed(cudaMalloc((void **) &gpu_a, value_bytes), "cudaMalloc(a)");
    failed = failed || fmmv_cuda_failed(cudaMalloc((void **) &gpu_v, vector_bytes), "cudaMalloc(v)");
    failed = failed || fmmv_cuda_failed(cudaMalloc((void **) &gpu_w, vector_bytes), "cudaMalloc(w)");

    /* Copy data to GPU device */
    failed = failed || fmmv_cuda_failed(cudaMemcpy(gpu_neq, &neq, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(neq)");
    failed = failed || fmmv_cuda_failed(cudaMemcpy(gpu_numcol, numcol, rowptr_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(numcol)");
    failed = failed || fmmv_cuda_failed(cudaMemcpy(gpu_ia, ia, index_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(ia)");
    failed = failed || fmmv_cuda_failed(cudaMemcpy(gpu_a, a, value_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a)");
    failed = failed || fmmv_cuda_failed(cudaMemcpy(gpu_v, v, vector_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(v)");

    if (!failed) {
        fmmv<<<1,THREAD_NUM,0>>>(gpu_neq, gpu_numcol, gpu_ia, gpu_a, gpu_v, gpu_w);
        /* A kernel launch returns no status; a bad launch shows up here. */
        failed = fmmv_cuda_failed(cudaGetLastError(), "fmmv kernel launch");
    }

    /* Blocking copy: waits for the kernel and reports any execution error. */
    failed = failed || fmmv_cuda_failed(cudaMemcpy(w, gpu_w, vector_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(w)");

    /* Release device memory on every path; cudaFree(NULL) is a no-op. */
    cudaFree(gpu_neq);
    cudaFree(gpu_numcol);
    cudaFree(gpu_ia);
    cudaFree(gpu_a);
    cudaFree(gpu_v);
    cudaFree(gpu_w);
}
主调用程序testgpu.f:
! Test driver for gpu_fmmv.
! Reads from the file named by the first command-line argument:
!   neq                          number of rows
!   numcol(1..neq+1)             0-based row offsets into ia / a
!   ia, one record per row       column indices of the stored entries
!   a,  one record per row       values of the stored entries
!   v(1..neq)                    input vector
! then calls gpu_fmmv and prints the resulting vector w.
! Statements start in column 7, use no continuation lines and only "!"
! comments, so the text is acceptable as fixed-form or free-form source.
      program testgpu
      implicit none
      integer mxrow, mxnz
      parameter (mxrow = 1000, mxnz = 10000)
      integer numcol(mxrow), ia(mxnz)
      real*4 a(mxnz), v(mxrow), w(mxrow)
      character*256 fname
      integer numarg, neq, nnz, i, j, ios
      numarg = 1
      call getarg(numarg, fname)
      if (len_trim(fname) .eq. 0) then
         print *,'usage : fmvv <matrix file>'
         stop 1
      endif
      print *,'fname : ', fname(1:len_trim(fname))
      open(21,file=fname,form='formatted',status='old',iostat=ios)
      if (ios .ne. 0) then
         print *,'cannot open file : ', fname(1:len_trim(fname))
         stop 1
      endif
      read(21,*) neq
! numcol needs neq+1 entries, so neq may be at most mxrow-1.
      if (neq .lt. 1 .or. neq+1 .gt. mxrow) then
         print *,'neq out of range : ', neq
         stop 1
      endif
      read(21,*) (numcol(i),i=1,neq+1)
      nnz = numcol(neq+1)
      if (numcol(1) .lt. 0 .or. nnz .gt. mxnz) then
         print *,'row offsets out of range : ', numcol(1), nnz
         stop 1
      endif
! Offsets must not decrease; with the two bounds above this keeps every
! index used below inside ia and a.
      do i=1, neq
         if (numcol(i) .gt. numcol(i+1)) then
            print *,'row offsets decrease at row : ', i
            stop 1
         endif
      enddo
      do i=1, neq
         read(21,*) (ia(j),j=numcol(i)+1, numcol(i+1))
      enddo
      do i=1,neq
         read(21,*) (a(j),j=numcol(i)+1, numcol(i+1))
      enddo
      read(21,*) (v(i),i=1,neq)
      close(21)
      call gpu_fmmv(neq, numcol, ia, a, v, w)
      print *,'w ======'
      print *,(w(i),i=1,neq)
      stop 0000
      end
编译命令:
nvcc -c gpu_fmmv.cu
ifort -o fmvv -L /usr/local/cuda/lib64/ -lcudart gpu_fmmv.o testgpu.f
摘自 xlsp的专栏
补充:综合编程 , 其他综合 ,