Hallo,
the below code works on Tesla C1060 but does not work on my mobile workstation with a Quadro 3000M.
This is mainly what the code does:
__global__ void gpuKernel
(
BYTE *src,float *aux, /* source, destination */
size_t memPitch, /* memory pitch */
size_t memPitchAux,
int w,int h, /* size of picture */
BYTE *Rng,int sizeRng, /* ranges */
BYTE *Dir,int sizeDir /* directions */
)
{
int rx = ((VEC2*)Rng)->x; // range of calculations (sizeXofwindow = 2*rx+1)
int ry = ((VEC2*)Rng)->y; // range of calculations (sizeYofwindow = 2*ry+1)
int i = blockIdx.x * blockDim.x + threadIdx.x; // x-coordinate of pixel = column in device memory
int j = blockIdx.y * blockDim.y + threadIdx.y; // y-coordinate of pixel = row in device memory
int idx = j * memPitchAux/sizeof(float) + i;
int i0 = i-rx, i1 = i+rx; // the range of calculation for columns
int j0 = j-ry, j1 = j+ry; // the range of calculation for rows
float g=0.0f ,g0=0.0f;
if((i0>=0) && (i1< w) && (j0>=0) && (j1< h))
{
g = 0.0;
for(int n=0; n<5; n++)
{
// computations....
g = fmaxf(g,g0*s);
}
aux[idx] = g;
__syncthreads();
}
else
{
aux[idx] = 0;
}
}
int main()
{
[..]
checkCudaErrors(cudaMemcpy2D(data->BufGPU[0],data->memPitch, /* to GPU (device) */
data->BufCPU[0],data->imgPitch, /* from CPU (host) */
data->imgWidth, data->imgHeight, /* size of data (image) */
cudaMemcpyHostToDevice));
cudaThreadSynchronize();
[..]
dim3 dimBlock(data->dimBLX,data->dimBLY,1);
dim3 dimGrid(data->memPitch/dimBlock.x ,data->imgHeight/dimBlock.y,1);
size_t memPitch = data->memPitch;
float *auxD;
size_t auxDPitch;
size_t auxHPitch;
checkCudaErrors(cudaMallocPitch((void**)&auxD,&auxDPitch,w*sizeof(float),h));
auxHPitch = w*sizeof(float);
float *auxH = (float *) malloc(h*auxHPitch);
gpuKernel<<<dimGrid,dimBlock>>>
(data->BufGPU[0],auxD,
memPitch,auxDPitch,w, h,
Rng,sRng,
Dir,sDir);
cudaThreadSynchronize();
checkCudaErrors(cudaMemcpy2D(auxH,auxHPitch, // to CPU (host)
auxD,auxDPitch, // from GPU (device)
auxHPitch, h, // size of data (image)
cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
[..]
}
The execution on the Quadro 3000M simply skips the kernel and outputs a blanck image in few ms.
The execution on the Tesla C1060 outputs a processed image in (say) 100 s. The weird thing is that in the last days also the execution on the c1060 appeared rather unpredictable (sometimes skipping the kernel, sometimes outputting weird numbers as -10^12, …). I do not understand this behaviour.
Could it be a driver version problem?
Thank you in advance for helping.
ps. both machines on ubuntu 11.10
Quadro 3000M —> Cuda compilation tools, release 4.1, V0.2.1221
Tesla C1060 —> Cuda compilation tools, release 4.1, V0.2.1221