Hello, I’m receiving a “cudaSafeCall() Runtime API error 4: unspecified launch failure” from the shortened code below. At first I attempted to run the code with the full kernel (I know there are no addressing issues, as I have carefully checked it many times). Next, I commented out all the code inside the kernel and removed the float4 (pData) pointer from the kernel arguments; it still gives the error. When I additionally comment out the call to the function (update_points(); in the .cpp file), I’m able to copy the data from device to host with no issues (and obviously from host to device initially). The size of the memory block is approx. 843 MB (1 GB is available on the card). I put a comment next to the line where the error is being issued.
The program is built upon the fluidsGL example, with most of the previous functionality removed (I just wanted the keyboard and interop ability for future use). I know the host data is set up properly, as I have the data being output to a BMP file every N iterations. I’m running Windows XP Professional SP3, a Quadro 2000, and VC++ 2008 Express. I also ran the program through the memory-debugging tool that comes with the toolkit, but it was no help… Any ideas would be appreciated.
// fluidsGL.cpp
// Host-side copy of the point field; allocated in main() with _aligned_malloc.
static pData *pointfield = NULL;
// Device-side copy of the point field; allocated in main() with cudaMalloc.
static pData *device_pointfield = NULL;
// Host wrapper that launches the update kernel; defined with C linkage in fluidsGL_kernels.cu.
extern "C" void update_points();
// Program entry point: allocates the host and device point fields, uploads
// the host data to the device, and runs the simulation loop.
// Returns 0 on normal completion, 1 if the host allocation fails.
int main(int argc, char** argv)
{
    // Skipping opengl and initialization parts

    // Size in bytes of the point field (one float4 per element); computed once
    // so the host allocation, device allocation and copy cannot disagree.
    const size_t fieldBytes = MEMDIM * NOBLOCKS * sizeof(float4);

    pointfield = (pData*)_aligned_malloc(fieldBytes, 16);
    if (pointfield == NULL)
    {
        // Host allocation failed; the upload below would read from a null pointer.
        return 1;
    }

    glutHideWindow();
    coreInitialize(); // Calculates initial conditions

    cutilSafeCall( cudaMalloc((void**)&device_pointfield, fieldBytes) );
    // The copy direction is the fourth argument of cudaMemcpy and must sit
    // inside the call's parentheses.
    cutilSafeCall( cudaMemcpy(device_pointfield, pointfield, fieldBytes, cudaMemcpyHostToDevice) );

    // Register cleanup before entering the loop so it is already in place if
    // the program terminates through exit() while the loop is running.
    atexit(cleanup);
    coreLoop();
    return 0;
}
int coreLoop() {
while(endSimulation == false)
{
for(int temp_counter = 0; temp_counter < OUTPUT_RATE; temp_counter++)
{
update_points();
cutilCheckMsg("kernel launch failure");
cutilSafeCall( cutilDeviceSyncronize() ); // <- Get error here
cutilSafeCall ( cudaMemcpy( pointfield, device_pointfield, (MEMDIM * NOBLOCKS * sizeof(float4)), cudaMemcpyDeviceToHost
cutilSafeCall( cutilDeviceSyncronize() );
}
output_Dataset(); // Saves data (2d plane) in host memory to BMP file
}
}
// fluidsGL_kernels.cuh
// Include guard for the kernel declarations.
#ifndef __STABLEFLUIDS_KERNELS_CUH_
#define __STABLEFLUIDS_KERNELS_CUH_
// Element type of the point field: one float4 per entry.
typedef float4 pData;
// Kernel launched by update_points(); takes no arguments in this shortened version.
__global__ void update_points_k();
#endif
// fluidsGL_kernels.h
// Include guard for the kernel declarations.
// NOTE(review): this guard macro is the same one used in fluidsGL_kernels.cuh,
// so if both headers are included in one translation unit only the first one
// contributes its declarations — confirm whether this file is an intentional duplicate.
#ifndef __STABLEFLUIDS_KERNELS_CUH_
#define __STABLEFLUIDS_KERNELS_CUH_
// Element type of the point field: one float4 per entry.
typedef float4 pData;
// Kernel launched by update_points(); takes no arguments in this shortened version.
__global__ void update_points_k();
#endif
// fluidsGL_kernels.cu
// Left out includes
// Point-update kernel. In this shortened version the body is empty and the
// kernel takes no arguments, so a launch touches no memory.
__global__ void update_points_k()
{
// Commented out code
}
// Host-side entry point (C linkage): launches update_points_k over a 3D grid
// of 3D blocks, then checks for a launch error with cutilCheckMsg.
extern "C" void update_points()
{
    // 8 x 8 x 8 = 512 threads per block.
    const dim3 threadsPerBlock(8, 8, 8);
    // Block counts per axis come from XBLOCKS/YBLOCKS/ZBLOCKS
    // (60 x 30 x 30 according to the author; defined in a header file).
    const dim3 blocksPerGrid(XBLOCKS, YBLOCKS, ZBLOCKS);

    update_points_k<<<blocksPerGrid, threadsPerBlock>>>();
    cutilCheckMsg("update_points_k failed.");
}