Hi,
I noticed some strange behavior with the cuArrays in my program.
What I described in this thread:
http://forums.nvidia.com/index.php?showtopic=42729
Doesn’t seem to happen due to problems with the textures like I thought before, but due to some strange things happening with the cuArrays.
I discovered that my kernels start failing (error 2bc (“launch failed”) on cuLaunchGrid) when there are several cuArrays definied (the exact number of cuArrays required to crash the kernel seem to be dependend on the kernel).
Here’s a little example program. There’s a definition for a second cuArray (cu_array2). If this is commented out, the kernel works. Otherwise it crashes.
static texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;
__global__ static void testKernel(float* out, int width)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
out[y*width +x] = tex2D(tex, x,y);
}
Host Code:
CU_SAFE_CALL(initCUDA());
// create the cuArray
CUDA_ARRAY_DESCRIPTOR desc;
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
desc.NumChannels = 1;
desc.Width = width;
desc.Height = height;
CU_SAFE_CALL(cuArrayCreate(&cu_array, &desc));
// create a second cuarray..this makes the kernel crash
CUarray cu_array2;
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
desc.NumChannels = 1;
desc.Width = width;
desc.Height = height;
CU_SAFE_CALL(cuArrayCreate(&cu_array2, &desc));
// copy data into cu_array
CUDA_MEMCPY2D copyParam;
memset(©Param, 0, sizeof(copyParam));
copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
copyParam.dstArray = cu_array;
copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
copyParam.srcHost = image;
copyParam.srcPitch = width * sizeof(unsigned char);
copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height;
CU_SAFE_CALL(cuMemcpy2D(©Param));
int block_size = 8;
CUmodule module;
// load module and kernel
std::string path("some_module.cubin");
CU_SAFE_CALL(cuModuleLoad(&module, path.c_str()));
CUdeviceptr d_data = (CUdeviceptr)NULL;
CU_SAFE_CALL( cuMemAlloc( &d_data, width * height * sizeof(float)));
CUfunction func;
CU_SAFE_CALL(cuModuleGetFunction(&func, module, "testKernel"));
// setup texture
CUtexref cu_texref;
CU_SAFE_CALL(cuModuleGetTexRef(&cu_texref, module, "tex"));
CU_SAFE_CALL(cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT));
CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP));
CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP));
CU_SAFE_CALL(cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_POINT));
CU_SAFE_CALL(cuTexRefSetFlags(cu_texref, 0));
CU_SAFE_CALL(cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_UNSIGNED_INT8, 1));
// setup kernel
int offset = 0;
CU_SAFE_CALL(cuFuncSetBlockShape( func, 8, 8, 1 ));
CU_SAFE_CALL(cuParamSeti(func, offset, d_data));
offset += sizeof(d_data);
CU_SAFE_CALL(cuParamSeti(func, offset, width));
offset += sizeof(width);
CU_SAFE_CALL(cuParamSetSize(func, offset));
CU_SAFE_CALL(cuParamSetTexRef(func, CU_PARAM_TR_DEFAULT, cu_texref));
// launch
CU_SAFE_CALL(cuLaunchGrid( func, width / 8, height / 8 ));
cuArrayDestroy(cu_array);
cuArrayDestroy(cu_array2);
CU_SAFE_CALL(cuMemFree(d_data));
CU_SAFE_CALL(cuModuleUnload(module));
cuCtxDetach(cuContext);
Note that this only happens with CUDA 1.0, CUDA 0.8 works without any problems.