Hi everyone,
I am having a problem running my cuda program on device. :(
Its working fine in EmuDebug mode but giving invalid result on device.
Can anyone tell what could be the problem in the following code.
// simple cuda program to find circular convolution
#define MEM_SIZE 4
global
void flip(int * x, int * fx)
{
// Thread index
int tx = threadIdx.x;
fx[(MEM_SIZE-tx)%MEM_SIZE] = x[tx];
}
global
void conv(int * fx, int * h, int * y)
{
// Block index
int bx = blockIdx.x;
// Thread index
int tx = threadIdx.x;
int ind = (MEM_SIZE + (tx - bx)) % MEM_SIZE;
__shared__ __device__ int sum;
sum += h[tx] * fx[ind];
__syncthreads();
y[bx] = sum;
__syncthreads();
sum =0;
}
int main()
{
CUT_DEVICE_INIT(); // must call to initialize CUDA
int x[16]={1,2,3,0};
int h[16]={10,20,0,0};
int* d_x;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_x, MEM_SIZE * sizeof(int)));
int* d_h;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_h, MEM_SIZE * sizeof(int)));
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_x, x, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_h, h, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice) );
int fx[MEM_SIZE] = {0};
int y[MEM_SIZE] = {0};
int* d_fx;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_fx, MEM_SIZE * sizeof(int)));
// setup execution parameters
dim3 threads(MEM_SIZE, 1);
dim3 grid(1, 1);
// execute the kernel
flip <<< grid, threads >>> ( d_x, d_fx );
// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(fx, d_fx, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL(cudaFree(d_x));
printf(" \n FlippedX = " );
for(int i=0; i<MEM_SIZE; i++) {
printf(" %d,", fx[i]);
}
int* d_y;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_y, MEM_SIZE * sizeof(int)));
// setup execution parameters
dim3 convThread(MEM_SIZE, 1);
dim3 convGrid(MEM_SIZE, 1);
// execute the kernel
conv <<< convGrid, convThread >>> ( d_fx, d_h, d_y );
// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(y, d_y, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost) );
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
printf(" \n Result Y = " );
for(int i=0; i<MEM_SIZE; i++) {
printf(" %d,", y[i]);
}
// clean up memory
CUDA_SAFE_CALL(cudaFree(d_fx));
CUDA_SAFE_CALL(cudaFree(d_h));
CUDA_SAFE_CALL(cudaFree(d_y));
}