Hi,
I currently work at a company that doesn’t allow me to download the CUDA SDK. Therefore I can’t run the device query test or the memory bandwidth test.
So I wrote a (very) simple kernel, which builds without any problem but doesn’t give me the expected results. Instead, I get a zero-filled vector.
This kernel computes the sum of two vectors of size N (single-precision floating point) and stores the result in a third vector.
The code is very simple, so I guess I haven’t made a mistake in it. I don’t understand what could be wrong… perhaps a hardware problem, or a careless mistake…
I use the toolkit v3.2 and NVIDIA driver 6.14.12.5957, dated 01/09/2010, under Windows XP.
Thank you very much for the attention you’ll give to my request.
R.Portalez
Here is my code :
main.cpp
[codebox]#include <stdio.h>
#include <stdlib.h>
/*
 * Host-side entry point defined in kernel.cu: copies a and b (N floats each)
 * to the GPU, computes c[i] = a[i] + b[i] there and copies the result back
 * into c. Declared with C linkage so the name matches the definition in the
 * .cu translation unit.
 */
extern "C" void addGPU(int N, float* a, float* b, float* c) ;
/*
 * Host driver: builds two small input vectors, adds them on the GPU through
 * addGPU(), prints every element of the result and checks it against the
 * sum computed on the CPU.
 *
 * Returns 0 when every element matches, 1 on allocation failure or mismatch.
 */
int main(int argc, char** argv)
{
    (void) argc ;
    (void) argv ;

    const int N = 5 ;

    /* The buffers hold floats, so size them with sizeof(float), not sizeof(int). */
    float* a  = (float*) malloc(N*sizeof(float)) ;
    float* b  = (float*) malloc(N*sizeof(float)) ;
    float* cc = (float*) malloc(N*sizeof(float)) ;
    if (a == 0 || b == 0 || cc == 0)
    {
        fprintf(stderr, "host allocation failed\n") ;
        free(a) ;   /* free(NULL) is a no-op */
        free(b) ;
        free(cc) ;
        return 1 ;
    }

    for (int i = 0 ; i < N ; ++i)
    {
        a[i] = float(i) ;
        b[i] = float(i) ;
        cc[i] = 0.f ;   /* stays 0 if the GPU never writes the result */
    }

    addGPU(N, a, b, cc) ;

    /* Small integers are exactly representable in float, so the CPU and GPU
       sums can be compared with ==. */
    int mismatches = 0 ;
    for (int i = 0 ; i < N ; ++i)
    {
        printf("%g + %g = %g\n", a[i], b[i], cc[i]) ;
        if (cc[i] != a[i] + b[i])
        {
            ++mismatches ;
        }
    }
    if (mismatches != 0)
    {
        fprintf(stderr, "%d of %d elements are wrong\n", mismatches, N) ;
    }

    free(a) ;
    free(b) ;
    free(cc) ;
    return mismatches == 0 ? 0 : 1 ;
}
[/codebox]
kernel.cu
[codebox]/*
 * Element-wise vector addition: c[k] = a[k] + b[k] for every k in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads
 * (grid-stride loop), so the result is correct whether the launch has fewer
 * or more threads than N. No shared memory is used.
 *
 * Preconditions: a, b and c are device pointers to at least N floats each.
 */
__global__ void kernel_addGPU(int N, float* a, float* b, float* c)
{
    /* Total number of threads in the launch = distance between two
       consecutive elements handled by the same thread. */
    const int step = blockDim.x*gridDim.x ;
    for (int k = threadIdx.x + blockIdx.x*blockDim.x ; k < N ; k += step)
    {
        c[k] = a[k] + b[k] ;
    }
}
#include <stdio.h>   /* fprintf, used to report CUDA errors below */

/* Returns true when a CUDA runtime call succeeded; otherwise prints the
   runtime's error string together with the name of the failing step. */
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true ;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status)) ;
    return false ;
}

/*
 * Adds two host vectors on the GPU: c[i] = a[i] + b[i] for i in [0, N).
 *
 * N        number of elements; nothing is done when N <= 0
 * a, b     host input buffers of at least N floats
 * c        host output buffer of at least N floats
 *
 * Every CUDA runtime call is checked. On the first failure a message is
 * written to stderr, the remaining steps are skipped and c is left untouched.
 * Without these checks a failing runtime (for example a driver that is too
 * old for the installed toolkit) silently leaves c at its initial contents.
 */
extern "C" void addGPU(int N, float* a, float* b, float* c)
{
    if (N <= 0)
    {
        return ;
    }

    /* size_t avoids overflowing int for large N. */
    const size_t size = (size_t) N * sizeof(float) ;
    float *d_a = 0, *d_b = 0, *d_c = 0 ;

    /* && short-circuits, so nothing runs after the first failing step. */
    bool ok = cudaOk(cudaMalloc((void**) &d_a, size), "cudaMalloc(d_a)")
           && cudaOk(cudaMalloc((void**) &d_b, size), "cudaMalloc(d_b)")
           && cudaOk(cudaMalloc((void**) &d_c, size), "cudaMalloc(d_c)")
           && cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)")
           && cudaOk(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)") ;

    if (ok)
    {
        kernel_addGPU <<< 1, 8 >>> (N, d_a, d_b, d_c) ;

        /* A launch returns no status: cudaGetLastError() reports launch
           failures, the synchronisation reports errors raised while the
           kernel was running. */
        ok = cudaOk(cudaGetLastError(), "kernel_addGPU launch")
          && cudaOk(cudaThreadSynchronize(), "kernel_addGPU execution")
          && cudaOk(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)") ;
    }

    /* Always release whatever was allocated; cudaFree(0) is a no-op. */
    cudaFree(d_a) ;
    cudaFree(d_b) ;
    cudaFree(d_c) ;
}[/codebox]