Hello there.
While programming in CUDA (comparing into the kernel two images sent as unsigned char*, computing the square RGB difference of each image’s pixel into an element of device uint64_t cud_err[200][200] matrix) I am stuck in the phase of copying the data back from the device → host.
Here is the host code:
CallKernelGenetics(const unsigned char *dline, const unsigned char *sline, int h, int w, uint16_t pitch, uint16_t bpp)
{
uint64_t err_tot = (uint64_t)malloc(sizeof(uint64_t)); // this is the host variable use to initialize the device one (cud_error_tot)
*err_tot = 3;
CUDA_SAFE_CALL(cudaMalloc((void**)&cud_err_tot, sizeof(uint64_t)));
CUDA_SAFE_CALL(cudaMemcpy(cud_err_tot, err_tot, sizeof(uint64_t), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc((void **)&cud_dline, w * h * sizeof(unsigned char))); // the first image
CUDA_SAFE_CALL(cudaMemcpyToSymbol(cud_dline, dline, w * h * sizeof(unsigned char), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc((void **)&cud_sline, w * h * sizeof(unsigned char))); // the second image
CUDA_SAFE_CALL(cudaMemcpyToSymbol(cud_sline, sline, w * h * sizeof(unsigned char), cudaMemcpyHostToDevice));
dim3 threads( 16, 16);
dim3 grid( w/threads.x, h/threads.y);
testGufKernel<<<grid, threads, 0>>>(bpp, pitch, h, w, cud_err_tot);
CUDA_SAFE_CALL(cudaMemcpy(err_tot, cud_err_tot, sizeof(uint64_t), cudaMemcpyDeviceToHost)); // here is the problem
CUDA_SAFE_CALL(cudaFree(cud_sline));
CUDA_SAFE_CALL(cudaFree(cud_dline));
CUDA_SAFE_CALL(cudaFree(cud_err_tot));
return err_tot[0];
}
And here is the device code:
device unsigned char *cud_dline;
device unsigned char *cud_sline;
device uint64_t *cud_err_tot;
device uint64_t cud_err[200][200];
global void
testGufKernel(uint64_t cud_bpp, uint64_t cud_pitch, int h, int w, uint64_t* cud_err_tot)
{
//-- Get the location of the texel corresponding the thread ID
const unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int tidy = blockIdx.y * blockDim.y + threadIdx.y;
cud_dline += tidx * cud_pitch + tidy * cud_bpp;
cud_sline += tidx * cud_pitch + tidy * cud_bpp;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
int er = (int)(cud_dline[0]) - (int)(cud_sline[0]);
int eg = (int)(cud_dline[1]) - (int)(cud_sline[1]);
int eb = (int)(cud_dline[2]) - (int)(cud_sline[2]);
#else
int er = (int)(cud_dline[1]) - (int)(cud_sline[1]);
int eg = (int)(cud_dline[2]) - (int)(cud_sline[2]);
int eb = (int)(cud_dline[3]) - (int)(cud_sline[3]);
#endif
if(tidx < w && tidy < h)
cud_err[tidx][tidy] = ((er * er) + (eb * eb) + (eg * eg));
__syncthreads();
if(tidx == 0 && tidy == 0)
*cud_err_tot = cud_err[0][0];
}
Well the problem shows up when executing the line:
CUDA_SAFE_CALL(cudaMemcpy(err_tot, cud_err_tot, sizeof(uint64_t), cudaMemcpyDeviceToHost));
The problem is in the kernel attribution → cud_err[tidx][tidy] = ((er * er) + (eb * eb) + (eg * eg));
If I try instead cud_err[tidx][tidy] = 12 for example it works fine and the host variable err_tot has the right value.
I hope you can help me cause I am driving nuts trying different approaches to solve this.
I have an NVIDIA GeForce 8600 M GT and trying on Windows 7 x64, but I don’t think the windows should be a problem cause the same problem
showed up also on Windows XP x64.