Hi there,
I’m currently porting my CUDA 0.8 application to CUDA 1.0 and ran into some problems invoking multiple kernels using the driver API.
With CUDA 0.8 everything was working fine and we even used the program in a production system where it was running for hours without any problems.
Now with CUDA 1.0, however, my kernels start to crash semi-randomly. (They always fail with error 0x2bc — 700 decimal, i.e. CUDA_ERROR_LAUNCH_FAILED — on cuLaunchGrid.)
I have prepared a little example that roughly does what my program does:
CU_SAFE_CALL(initCUDA());
// copy data from host to device
CUarray cu_array;
CUDA_ARRAY_DESCRIPTOR desc;
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
desc.NumChannels = 1;
desc.Width = width;
desc.Height = height;
CU_SAFE_CALL( cuArrayCreate( &cu_array, &desc ));
CUDA_MEMCPY2D copyParam;
memset(©Param, 0, sizeof(copyParam));
copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
copyParam.dstArray = cu_array;
copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
copyParam.srcHost = image;
copyParam.srcPitch = width * sizeof(unsigned char);
copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height;
CU_SAFE_CALL(cuMemcpy2D(©Param));
int block_size = 8;
int offset = 0;
CUmodule module;
CUdeviceptr d_data = (CUdeviceptr)NULL;
CU_SAFE_CALL( cuMemAlloc( &d_data, width * height * sizeof(float)));
// load module and kernel
CU_SAFE_CALL(cuModuleLoad(&module, "somemodule.cubin"));
CUfunction func;
CU_SAFE_CALL(cuModuleGetFunction(&func, module, "some_kernel"));
// set texture parameters
CUtexref cu_texref;
CU_SAFE_CALL(cuModuleGetTexRef(&cu_texref, module, "tex"));
CU_SAFE_CALL(cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT));
CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP));
CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP));
CU_SAFE_CALL(cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_LINEAR));
CU_SAFE_CALL(cuTexRefSetFlags(cu_texref, 0));
CU_SAFE_CALL(cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_UNSIGNED_INT8, 1));
// setup first kernel
CU_SAFE_CALL(cuFuncSetBlockShape( func, block_size, block_size, 1 ));
CU_SAFE_CALL(cuParamSeti(func, offset, d_data)); offset += sizeof(d_data);
CU_SAFE_CALL(cuParamSeti(func, offset, width)); offset += sizeof(width);
CU_SAFE_CALL(cuParamSetSize(func, offset));
CU_SAFE_CALL(cuParamSetTexRef(func, CU_PARAM_TR_DEFAULT, cu_texref));
CU_SAFE_CALL( cuCtxSynchronize() );
// execute the kernel
CU_SAFE_CALL(cuLaunchGrid( func, width / block_size, height / block_size ));
CU_SAFE_CALL( cuCtxSynchronize() );
// load second kernel
CUfunction func2;
CU_SAFE_CALL(cuModuleGetFunction(&func2, module, "another_kernel"));
// create another texref
CUtexref tex;
CU_SAFE_CALL(cuModuleGetTexRef(&tex, module, "tex2"));
CU_SAFE_CALL(cuTexRefSetArray(tex, cu_array, CU_TRSA_OVERRIDE_FORMAT));
CU_SAFE_CALL(cuTexRefSetAddressMode(tex, 0, CU_TR_ADDRESS_MODE_WRAP));
CU_SAFE_CALL(cuTexRefSetAddressMode(tex, 1, CU_TR_ADDRESS_MODE_WRAP));
CU_SAFE_CALL(cuTexRefSetFilterMode(tex, CU_TR_FILTER_MODE_POINT));
CU_SAFE_CALL(cuTexRefSetFlags(tex, 0));
CU_SAFE_CALL(cuTexRefSetFormat(tex, CU_AD_FORMAT_UNSIGNED_INT8, 1));
// setup the kernel
CUdeviceptr dest_dev;
CU_SAFE_CALL(cuMemAlloc(&dest_dev, width * height * 4));
offset = 0;
CU_SAFE_CALL(cuFuncSetBlockShape(func2, block_size, block_size, 1));
CU_SAFE_CALL(cuParamSeti(func2, offset, dest_dev));
offset += sizeof(dest_dev);
CU_SAFE_CALL(cuParamSeti(func2, offset, width));
offset += sizeof(width);
CU_SAFE_CALL(cuParamSetSize(func2, offset));
CU_SAFE_CALL(cuParamSetTexRef(func2, CU_PARAM_TR_DEFAULT, tex));
CU_SAFE_CALL( cuCtxSynchronize() );
// launch the kernel
CU_SAFE_CALL(cuLaunchGrid(func2, width / block_size, height / block_size ));
CU_SAFE_CALL( cuCtxSynchronize() );
// clean up
CU_SAFE_CALL(cuMemFree(dest_dev));
CU_SAFE_CALL(cuMemFree(d_data));
CU_SAFE_CALL(cuModuleUnload(module));
cuCtxDetach(cuContext);
This program crashes only on every second invocation. I also noticed that sometimes some of my kernels finish, but only if a certain other kernel finished on the previous program invocation. To me it seems that CUDA is keeping some faulty state between program invocations.
Also note that the example only seems to crash if both kernels use different texture samplers.
I have the strange feeling that all this is somehow related to the textures. Sometimes some of my kernels would suddenly finish correctly after I merely changed some flags on the texture references, which is also very strange. It also seems to work when I don’t use textures at all.
Well all this is extremely weird and I absolutely don’t have any idea why this is happening, especially since it was running without problems in CUDA 0.8.