Hello,
I’m trying to convert a very slow serial program to run on the GPU. The problem is that I get the “unspecified launch failure” error message every time I run it. The line that the CUDA error points to is
CUDA_SAFE_CALL( cudaThreadSynchronize() );
which I inserted directly after the kernel call. I’m really perplexed and hoping someone can help. I’ve compiled and run a number of other CUDA programs without trouble, so I think my system is configured properly.
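For what it’s worth, here is a minimal sketch of the error checking I could put around the launch to narrow things down (cudaGetLastError() reports launch/configuration errors, while the error returned by cudaThreadSynchronize() comes from the kernel execution itself; the kernel name and arguments are just copied from my code below):

computedist<<<grid, threads>>>(dzeros, dpos, ddist, num_zero, x, len, pitch);
// Catch configuration/launch errors (bad grid dimensions, too many resources, ...)
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("launch error: %s\n", cudaGetErrorString(err));
// Catch errors raised while the kernel was actually running (e.g. out-of-bounds accesses)
err = cudaThreadSynchronize();
if (err != cudaSuccess)
    printf("kernel error: %s\n", cudaGetErrorString(err));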
The command line is (from Properties->Configuration Properties->CUDA Runtime API->Command Line):
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\bin\nvcc.exe" -gencode=arch=compute_20,code="sm_20,compute_20" -gencode=arch=compute_20,code="sm_20,compute_20" --machine 32 -ccbin "C:\Program Files\Microsoft Visual Studio 9.0\VC\bin" -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -I"C:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 3.2\C\common\inc" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\include" -maxrregcount=32 --compile -o "Debug/Image_Dist.vcproj.obj" Image_Dist.vcproj
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\bin\nvcc.exe" -gencode=arch=compute_20,code="sm_20,compute_20" -gencode=arch=compute_20,code="sm_20,compute_20" --machine 32 -ccbin "C:\Program Files\Microsoft Visual Studio 9.0\VC\bin" -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -I"C:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 3.2\C\common\inc" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\include" -maxrregcount=32 --compile -o "Debug/Image_Dist.vcproj.obj" "c:\Tim\Image_Dist\Image_Dist.vcproj"
I am using Visual C++ 2008 Express Edition with the CUDA 3.2 SDK and a GTX 570 card.
Here’s the relevant bit of host code; pos and dist are float**, and num_zero, x, y, and len are ints:
// Allocate matrices in device memory
size_t pitch;
int* dzeros;
cudaMalloc((void**)&dzeros, num_zero * sizeof(int));
float* dpos;
cudaMallocPitch((void**)&dpos, &pitch, 10 * sizeof(float), len);
float* ddist;
cudaMallocPitch((void**)&ddist, &pitch, (len + 2) * sizeof(float), num_zero);

// Copy matrices from host memory to device memory
cudaMemcpy(dzeros, thezeros, num_zero * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy2D(dpos, pitch, pos, pitch, 10 * sizeof(float), len, cudaMemcpyHostToDevice);
cudaMemcpy2D(ddist, pitch, dist, pitch, (len + 2) * sizeof(float), num_zero, cudaMemcpyHostToDevice);

// Send the task out
dim3 threads(1, 1);
dim3 grid(x / threads.x, y / threads.y);
cudaError_t cudastat;
computedist<<<grid, threads>>>(dzeros, dpos, ddist, num_zero, x, len, pitch);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
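For reference, this is roughly the pitched allocation/copy pattern from the programming guide that I was trying to follow. It assumes the host array is one contiguous block with its own row pitch; hostDist and hostPitch are placeholder names, not variables from my actual code:

// Sketch: pitched device allocation plus a 2D copy from a contiguous host buffer.
// hostDist is assumed to be a single block of num_zero rows, each (len + 2) floats wide,
// so the host pitch is simply the row width in bytes.
float* hostDist = (float*)malloc(num_zero * (len + 2) * sizeof(float));
size_t hostPitch = (len + 2) * sizeof(float);
float* devDist;
size_t devPitch;
cudaMallocPitch((void**)&devDist, &devPitch, (len + 2) * sizeof(float), num_zero);
// Destination pitch comes from cudaMallocPitch; source pitch is the host row width.
cudaMemcpy2D(devDist, devPitch, hostDist, hostPitch,
             (len + 2) * sizeof(float), num_zero, cudaMemcpyHostToDevice);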
Here’s the device code:
__global__ void computedist(int* dzeros, float* dpos, float* ddist, int num_zero, int x, int len, size_t pitch) {
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // Block dimension
    int dx = blockDim.x;
    int dy = blockDim.y;
    int i = bx * dx + tx;
    int j = by * dy + ty;
    int index = i * x + j;
    int theind;
    for (theind = 0; theind < num_zero; theind++)
        if (dzeros[theind] == index)
            break;
    // Pointers to matrix locations
    float* location;
    float* r;
    float* c;
    location = (float*)((char*)ddist + theind * pitch);
    *location = i;
    location = (float*)((char*)ddist + theind * pitch) + 1;
    *location = j;
    for (int ind = 0; ind < len; ind++) {
        location = (float*)((char*)ddist + theind * pitch) + ind + 2;
        r = (float*)((char*)dpos + ind * pitch) + 6;
        c = (float*)((char*)dpos + ind * pitch) + 5;
        *location = sqrt((i - *r) * (i - *r) + (j - *c) * (j - *c));
    }
}
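Since the kernel repeats the same pitched-row arithmetic in several places, here is a small helper that shows the addressing pattern I’m relying on (just an illustrative sketch; rowPtr is not in my actual code):

// Sketch: pitched-row addressing helper (rowPtr is illustrative only).
// Returns a pointer to the start of row 'row' in a pitched allocation 'base'
// whose rows are 'pitch' bytes apart.
__device__ float* rowPtr(float* base, int row, size_t pitch)
{
    return (float*)((char*)base + row * pitch);
}
// With it, the writes in computedist would read like:
//   float* distRow = rowPtr(ddist, theind, pitch);
//   distRow[0] = i;
//   distRow[1] = j;
//   distRow[ind + 2] = sqrt(...);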