Hello everyone,
i’m newbie in CUDA programming (and on the Nvidia forum, so sorry if the topic is in the wrong forum), i started to read some articles just a couple of days ago.
i’m trying to add 2 matrices (it’s basic, i know).
Here’s my code :
#include <stdlib.h>
#include <stdio.h>
#define N 512
static void HandleError( cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void addMat(int *dev_a, int *dev_b, int *dev_c)
{
int x = threadIdx.x + blockIdx.x*threadIdx.y;
int y = blockIdx.x + gridDim.x*blockIdx.y;
int idx = x+y*N;
while(idx < N*N)
{
dev_c[idx] = dev_a[idx]+dev_b[idx];
idx += blockDim.x*blockDim.y*gridDim.x*gridDim.y;
}
}
int main()
{
int a[N*N], b[N*N], c[N*N];
int *dev_a, *dev_b, *dev_c;
int i,j;
long tmp=0;
dim3 dimBlock(32,32);
dim3 dimGrid(N/dimBlock.x, N/dimBlock.y);
cudaEvent_t start, stop;
float elapsedTime;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
HANDLE_ERROR(cudaMalloc((void**)&dev_a, N*N*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b, N*N*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c, N*N*sizeof(int)));
for(i=0 ; i<N ; i++)
{
for(j=0 ; j<N ; j++)
{
a[j+i*N] = i+j;
b[j+i*N] = i;
}
}
cudaMemcpy(dev_a, a, N*N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*N*sizeof(int), cudaMemcpyHostToDevice);
addMat<<<1,16>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N*N*sizeof(int), cudaMemcpyDeviceToHost);
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("Temps ecoule : %f ms\n", elapsedTime);
for(i=0 ; i<N ; i++)
{
for(j=0 ; j<N ; j++)
tmp+=c[j+i*N];
}
//printf("tmp : %ld\n", tmp);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
i can build & run it if N is <=256.
But, for a greater N, i can’t run it. I tried to find a solution, but it was not really successful…
may it be a memory problem on the graphical card ? I’m working on a GeForce GTX 460.
Or it may be because of my thread distribution ?
Does anyone have an idea/advice ??
Thank you very much !
Iota