Hi All,
I need some help here. I am brand new to CUDA and still trying to figure it out. I wrote this very basic program to help me understant how CUDA works. I am using visual Studio 2005 and Cuda version 2.0. My graphics card is a geForce 9600GT
My program has three parts, each part calulating the same thing. The first part increments the values of an array by 1 within the host. The second part does the same thing but using CUDA but with multiple blocks and threads. The third part also uses CUDA but I use only 1 block.<<<1,1>>>. My problem is that when the number of elements in my array go past 1023 floats the second cuda call seem to spit out just grabage. Below 1024 it seems to work fine. The other two function (host and the first cuda call) work perfectly fine no matter how big the array size.
Any help will be appreciated.
[codebox]// incrementArray.cu
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cutil.h>
void incrementArrayOnHost(float *a, int N)
{
int i;
for (i=0; i < N; i++) a[i] = a[i]+1.f;
}
global void incrementArrayOnDevice(float *a, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx]+1.f;
}
global void incrementArrayOnDevice2(float *a, int N)
{
int i;
for (i=0; i < (sizeof(float))*N; i++)
a[i] = a[i]+1.f;
}
int main(void)
{
float *a_h, *b_h, *c_h; // pointers to host memory
float *a_out, *b_out;
float *a_d, *b_d; // pointer to device memory
int i, N =1024;
char ans;
unsigned int timer = 0;
unsigned int timer2 = 0;
unsigned int timer3 = 0;
size_t size = N*sizeof(float);
// allocate arrays on host
a_h = (float *)malloc(size);
b_h = (float *)malloc(size);
c_h = (float *)malloc(size);
a_out = (float *)malloc(size);
b_out = (float *)malloc(size);
// initialization of host data
for (i=0; i<N; i++)
{
a_h[i] = (float) i;
b_h[i] = (float) i;
c_h[i] = (float) i;
}
// do calculation on host
cutCreateTimer(&timer);
cutStartTimer( timer);
incrementArrayOnHost(c_h, N);
cutStopTimer( timer);
//printf("\nCPU TIME: %f\n", cutGetTimerValue( timer));
//Threaded Device Calculation
// allocate array on device
cudaMalloc((void **) &a_d, size);
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(float)*N, cudaMemcpyHostToDevice);
// do calculation on device:
int blockSize = 4;
int nBlocks = N/blockSize + (N%blockSize == 0?0:1);
cutCreateTimer(&timer2);
cutStartTimer( timer2);
incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
cutStopTimer( timer2);
// Retrieve result from device and store in a_out
cudaMemcpy(a_out, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
//cudaFree(a_d);
//Non Threaded Calculation
cudaMalloc((void **) &b_d, size);
cudaMemcpy(b_d, b_h, sizeof(float)*N, cudaMemcpyHostToDevice);
cutCreateTimer(&timer3);
cutStartTimer( timer3);
incrementArrayOnDevice2 <<< 1,1 >>> (b_d, N);
cutStopTimer( timer3);
// Retrieve result from device and store in b_out
cudaMemcpy(b_out, b_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// check results
for (i=0; i<N; i++) {
//assert(a_h[i] == b_h[i]);
if(i%100 == 0)
{
printf(" compare results %f %f %f\n", c_h[i], a_out[i], b_out[i]);
}
}
printf("\nCPU TIME: %f\n", cutGetTimerValue( timer));
printf("\nGPU TIME Multi_thread: %f\n", cutGetTimerValue( timer2));
printf("\nGPU TIME: %f\n", cutGetTimerValue( timer3));
// cleanup
free(a_h);
free(b_h);
free(c_h);
free(a_out);
free(b_out);
cudaFree(a_d);
cudaFree(b_d);
printf("\nHit enter to quit\n");
scanf("%c", &ans);
//return 0;
}[/codebox]