Hi Everybody,
I have problems with my project code since I updated from CUDA 0.8 to 1.0.
After a while I was able to narrow down the problem, which causes the CUDA error
“unspecified launch failure”.
I’ve attached a simple CUDA SDK project which just allocates memory for
a specified volume size and initializes the values with 0.0f. I am aware that
there is a memset function, but I just wanted to use a very simple example.
The error occurs in “cudaThreadSynchronize();” after calling the kernel “myInitVolumeKernel<<<grid, block>>>(pVolumeDevice);”.
If the SDK macro “CUT_CHECK_ERROR” is used instead, this program will work
in release mode, since nothing is checked there. Both versions fail
in debug mode, but work fine in emulation.
The program also runs fine if the parameters that are transferred into
constant memory are hardcoded instead. See the macro option (#if) in the kernel.
This Example program was tested on a GeForce 8800 GTX using Linux.
I also tested it on Windows XP. There it sometimes caused an unclear
memory access error or froze the system.
An additional test on a second Linux system with another G80 card also failed.
Does anybody have an idea?
Thanks
////////////////////////////////////////////////////////////////////////////////
// //
// MyVolumeInitializeTest //
// //
////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cutil.h>
////////////////////////////////////////////////////////////////////////////////
// Common host and device functions
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Volume dimensions in constant memory; filled by the host via
// cudaMemcpyToSymbol. gVolSize[1] is the trip count of the kernel's y loop.
__device__ __constant__ unsigned int gVolSize[3];
// Volume strides (in elements) in constant memory; filled by the host via
// cudaMemcpyToSymbol. gVolStride[0] is the step between consecutive y values,
// gVolStride[1] is the step between consecutive z values.
__device__ __constant__ unsigned int gVolStride[2];
// Forward declaration of the kernel that writes 0.0f into the volume.
static __global__ void myInitVolumeKernel(float* pVolume);
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
/**
 * Allocates a float volume on the device, uploads its dimensions and strides
 * to constant memory, launches myInitVolumeKernel to zero it, and reports any
 * launch or execution error.
 *
 * @param argc  argument count, forwarded to CUT_EXIT
 * @param argv  argument vector, forwarded to CUT_EXIT
 * @return does not return normally; the process exits through CUT_EXIT or
 *         exit(EXIT_FAILURE) on error
 */
int main(int argc, char **argv){
    // Pointer to float volume (device memory, set by cudaMalloc below)
    float *pVolumeDevice = 0;
    // Initialize Device
    CUT_DEVICE_INIT();
    // Set volume size and volume strides
    // this should be changeable during runtime
    unsigned int volSize[3];
    volSize[0] = 512;
    volSize[1] = 512;
    volSize[2] = 512;
    unsigned int volStride[2];
    // Strides are derived from the dimensions so the two cannot drift apart.
    volStride[0] = volSize[0];
    // if volstride[1] is 512 instead of 512^2 it works?!
    volStride[1] = volSize[0] * volSize[1];
    // Widen to size_t before multiplying so the byte count cannot wrap in
    // 32-bit arithmetic for larger volumes.
    const size_t volumeBytes =
        (size_t) volSize[0] * volSize[1] * volSize[2] * sizeof(float);
    // Allocate device memory for volume part
    CUDA_SAFE_CALL(
        cudaMalloc((void **) &pVolumeDevice, volumeBytes));
    // Copy volume stride to constant memory
    CUDA_SAFE_CALL(
        cudaMemcpyToSymbol(gVolStride, volStride, 2 * sizeof(unsigned int)));
    // Copy volume dimension to constant memory
    CUDA_SAFE_CALL(
        cudaMemcpyToSymbol(gVolSize, volSize, 3 * sizeof(unsigned int)));
    // Set up kernel configuration: block.x/grid.x span the volume's x extent,
    // block.y/grid.y span its z extent; each thread walks y itself.
    dim3 block(64, 4);
    // The grid below uses exact division, so refuse sizes it would not cover.
    if (volSize[0] % block.x != 0 || volSize[2] % block.y != 0) {
        fprintf(stderr, "Volume size must be a multiple of the block size.\n");
        exit(EXIT_FAILURE);
    }
    dim3 grid(volSize[0] / block.x, volSize[2] / block.y);
    // Call kernel execution
    myInitVolumeKernel<<<grid, block>>>(pVolumeDevice);
    // use definition out of cutil.h instead of using cutil macros
    // both fail in debug mode, but not in emulation.
#if 1
    // Launch-configuration errors are reported by cudaGetLastError();
    // faults during execution only surface at the synchronizing call.
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess == err) {
        err = cudaThreadSynchronize();
    }
    if( cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s.\n",
                cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }
#else
    CUT_CHECK_ERROR("myInitVolumeKernel() execution failed\n");
#endif
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    // Release the device volume before exiting
    CUDA_SAFE_CALL( cudaFree(pVolumeDevice) );
    // Exit program
    CUT_EXIT(argc, argv);
}
/**
 * Writes 0.0f into every element of the volume.
 *
 * Launch layout: a 2D launch in which the x thread index
 * (blockIdx.x * blockDim.x + threadIdx.x) selects the volume x coordinate and
 * the y thread index (blockIdx.y * blockDim.y + threadIdx.y) selects the
 * volume z coordinate. Each thread then loops over all y values of its (x, z)
 * column. No shared memory is used.
 *
 * Reads gVolSize and gVolStride from constant memory; the host must have
 * uploaded both before the launch.
 *
 * @param pVolume  device pointer to the volume; element (x, y, z) lives at
 *                 z * gVolStride[1] + y * gVolStride[0] + x
 */
static __global__ void myInitVolumeKernel(float* pVolume)
{
    // compute volume coordinates
    // __umul24 is fine here: block and thread indices are far below 2^24.
    unsigned int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    unsigned int y;
    unsigned int z = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
#if 1
    // Threads outside the volume must not write.
    if (x >= gVolSize[0] || z >= gVolSize[2])
    {
        return;
    }
    // volume index
    // Full 32-bit multiply: __umul24 only uses the low 24 bits of each
    // operand, which would silently truncate a runtime stride >= 2^24.
    unsigned int idx = z * gVolStride[1] + x;
    for (y = 0; y < gVolSize[1]; y++, idx += gVolStride[0])
    {
        pVolume[idx] = 0.0f;
    }
#else
    // if variables are hardcoded it works also
    if (x >= 512 || z >= 512)
    {
        return;
    }
    unsigned int idx = __umul24(z, 512*512) + x;
    for (y = 0; y < 512; y++, idx += 512)
    {
        pVolume[idx] = 0.0f;
    }
#endif
} // myInitVolumeKernel
initialVolumeTest.zip (23.8 KB)