__global__ void sum_kernel ( float * a, float * b, float * c ){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
c [idx] = a [idx] + b [idx];
}
#include <stdio.h>
int sum_host( float * a, float * b, float * c, int n ){
int nb = n * sizeof ( float );
float *aDev = NULL, *bDev = NULL, *cDev = NULL;
cudaError_t cuerr = cudaMalloc ( (void**)&aDev, nb );
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot allocate GPU memory for aDev: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
cuerr = cudaMalloc ( (void**)&bDev, nb );
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot allocate GPU memory for bDev: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
cuerr = cudaMalloc ( (void**)&cDev, nb );
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot allocate GPU memory for cDev: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
dim3 threads = dim3(BLOCK_SIZE, 1);
dim3 blocks = dim3(n / BLOCK_SIZE, 1);
cuerr = cudaMemcpy ( aDev, a, nb, cudaMemcpyHostToDevice );
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot copy data from a to aDev: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
cuerr = cudaMemcpy ( bDev, b, nb, cudaMemcpyHostToDevice );
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot copy data from b to bDev: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
sum_kernel<<<blocks, threads>>> (aDev, bDev, cDev);
cuerr = cudaGetLastError();
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot launch CUDA kernel: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
cuerr = cudaDeviceSynchronize();
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot synchronize CUDA kernel: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
cuerr = cudaMemcpy ( c, cDev, nb, cudaMemcpyDeviceToHost );
if (cuerr != cudaSuccess)
{
fprintf(stderr, "Cannot copy data from cdev to c: %s\n",
cudaGetErrorString(cuerr));
return 1;
}
cudaFree ( aDev );
cudaFree ( bDev );
cudaFree ( cDev );
return 0;
}