hi,
i am pretty confused because my code works fine in emulation mode but not in GPU mode…
and i apologize if my problem could look like a very newbie one
here a sample of my code which should transform a 32bit coulored image (the bip pointer) in a 8bit grey scale image (bop) :
post function is like the sprintf
extern "C" void cu_hello(t_cu_jit_hello *x, long dimcount, long *dim, long planecount,
t_jit_matrix_info *in_minfo, char *bip, t_jit_matrix_info *out_minfo, char *bop);
// Kernel that executes on the CUDA device
__global__ void rgb2luma_kernel(unsigned char *a, unsigned char *b, long *indimstride, long *outdimstride, long *dim, long ascale, long rscale, long gscale, long bscale)
{
//int i = blockIdx.x * blockDim.x + threadIdx.x;
//int j = blockIdx.y * blockDim.y + threadIdx.y;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int i = idx%dim[0];
int j = idx/dim[0];
if ( j < dim[1] )
{
if ( i < dim[0] )
{
a += i*indimstride[0] + j*indimstride[1];
b += i + j*outdimstride[1]; //assumes that b is a one plane 2D matrix
*b = (unsigned char) ((*a * ascale + *(a+1) * rscale + *(a+2) * gscale + *(a+3) * bscale)/255);
}
}
}
// main routine that executes on the host
void cu_hello(t_cu_jit_hello *x, long dimcount, long *dim, long planecount,
t_jit_matrix_info *in_minfo, char *bip, t_jit_matrix_info *out_minfo, char *bop)
{
unsigned char *bip_d, *bop_d; // Pointer to host & device arrays
const int insize = in_minfo->dimstride[1]*dim[1]; // Number of elements in input arrays
const int outsize = out_minfo->dimstride[1]*dim[1]; // Number of elements in output arrays
long w=dim[0], h=dim[1];
double fascale = x->ascale;
double frscale = x->rscale;
double fgscale = x->gscale;
double fbscale = x->bscale;
long ascale = fascale * 65536.;
long rscale = frscale * 65536.;
long gscale = fgscale * 65536.;
long bscale = fbscale * 65536.;
dim3 dimBlock(x->dimBlock[0], x->dimBlock[1]); // number of threads in one block
dim3 dimGrid(w/dimBlock.x + (w%dimBlock.x == 0 ? 0:1),h/dimBlock.x + (h%dimBlock.x == 0 ? 0:1)); // number of blocks per grid
// Allocate array on device
if ( cudaSuccess != cudaMalloc((void **) &bip_d, insize*sizeof(unsigned char))
|| cudaSuccess != cudaMalloc((void **) &bop_d, outsize*sizeof(unsigned char)))
{
post("CUDA can't allocated device memory.");
goto out;
}
if ( cudaSuccess != cudaMemcpy(bip_d, (unsigned char *) bip, insize*sizeof(unsigned char), cudaMemcpyHostToDevice) )
{
post("CUDA can't copy data from host to device");
goto out;
}
// Do calculation on device:
rgb2luma_kernel <<< dimGrid, dimBlock >>> (bip_d, bop_d, in_minfo->dimstride, out_minfo->dimstride, dim, ascale, rscale, gscale, bscale);
// Retrieve result from device and store it in host array
if (cudaSuccess != cudaMemcpy(bop, (char *) bop_d, outsize*sizeof(unsigned char), cudaMemcpyDeviceToHost))
{
post("CUDA can't copy data from host to device");
goto out;
}
out:
// Cleanup
if ( cudaSuccess != cudaFree(bip_d) || cudaSuccess != cudaFree(bop_d)) post("CUDA can't free device pointer.");
}
in normal mode (no emulation), when i assign a value to the b pointer in the global function, the programm crashes…
whereas it works fine in emulation mode
should I pass the pointer adress in another way to the global function ?
many thanks for your help and fell free to contact me for any precision
best
ChianLi