I am having trouble with cudamemcpy. I have an array that I am sending in to the cuda kernel and it is suppose to return a modified copy, same length and type. However, when I try to copy my data from the device to the host, I am not getting the correct numbers! I am getting a bit frustrated, if anyone has an insight, please let me know.
Thanks, much appreciated!
unsigned int * cpu_in_image = (unsigned int *) malloc(mem_size);
unsigned int * cpu_out_image = (unsigned int *) malloc(mem_size);
......
unsigned int* gpu_in_image;
CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_in_image, mem_size));
//copy CPU memory to GPU memory
CUDA_SAFE_CALL(cudaMemcpy(gpu_in_image, cpu_in_image, mem_size, cudaMemcpyHostToDevice));
//allocate GPU memory for results
unsigned int *gpu_out_image;
CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_out_image, mem_size));
//setup execution parameters
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1); //Cannot have more than 2^16 Threads. That is, 256*256
//execute kernel
negativeKernel<<<grid, threads, 0>>>(gpu_in_image, gpu_out_image);
CUT_CHECK_ERROR("Kernel execution failed");
//Copy GPU Results Memory to CPU Results Memory
CUDA_SAFE_CALL(cudaMemcpy(cpu_out_image, gpu_out_image, mem_size, cudaMemcpyDeviceToHost)); //GPU OUT IMAGE IS MESSED UP
I then use a for loop to display my contents before and after. They are not the correct results.
The kernel is as followed:
__global__ void
negativeKernel( unsigned int* gpu_in_image, unsigned int* gpu_out_image)
{
__syncthreads();
int tid = blockIdx.x * blockDim.x+threadIdx.x;
gpu_out_image[tid] = gpu_in_image[tid];
__syncthreads();
}
I don’t know what to do with this anymore!