Kernel crashes with driver API and several cuArray

Hi,

I noticed some strange behavior with the cuArrays in my program.

What I described in this thread:

http://forums.nvidia.com/index.php?showtopic=42729

Doesn’t seem to happen due to problems with the textures like I thought before, but due to some strange things happening with the cuArrays.

I discovered that my kernels start failing (error 2bc (“launch failed”) on cuLaunchGrid) when there are several cuArrays definied (the exact number of cuArrays required to crash the kernel seem to be dependend on the kernel).

Here’s a little example program. There’s a definition for a second cuArray (cu_array2). If this is commented out, the kernel works. Otherwise it crashes.

static texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;

__global__ static void testKernel(float* out, int width)

{

    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;

	unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

	out[y*width +x] = tex2D(tex, x,y);

}

Host Code:

       CU_SAFE_CALL(initCUDA());

	// create the cuArray

	CUDA_ARRAY_DESCRIPTOR desc;

	desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;

	desc.NumChannels = 1;

	desc.Width = width;

	desc.Height = height;

	CU_SAFE_CALL(cuArrayCreate(&cu_array, &desc));

       // create a second cuarray..this makes the kernel crash

	CUarray cu_array2;

	desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;

	desc.NumChannels = 1;

	desc.Width = width;

	desc.Height = height;

	CU_SAFE_CALL(cuArrayCreate(&cu_array2, &desc));

	// copy data into cu_array

	CUDA_MEMCPY2D copyParam;

	memset(&copyParam, 0, sizeof(copyParam));

	copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;

	copyParam.dstArray  = cu_array;

	copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;

	copyParam.srcHost  = image;

	copyParam.srcPitch  = width * sizeof(unsigned char);

	copyParam.WidthInBytes	= copyParam.srcPitch;

	copyParam.Height  = height;

    CU_SAFE_CALL(cuMemcpy2D(&copyParam));

	int block_size = 8;

	CUmodule module;

	// load module and kernel

	std::string path("some_module.cubin");

	CU_SAFE_CALL(cuModuleLoad(&module, path.c_str()));

	CUdeviceptr d_data = (CUdeviceptr)NULL;

    CU_SAFE_CALL( cuMemAlloc( &d_data, width * height * sizeof(float)));

   CUfunction func;

    CU_SAFE_CALL(cuModuleGetFunction(&func, module, "testKernel"));

	// setup texture

	CUtexref cu_texref;

    CU_SAFE_CALL(cuModuleGetTexRef(&cu_texref, module, "tex"));

	CU_SAFE_CALL(cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT));

    CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP));

    CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP));

    CU_SAFE_CALL(cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_POINT));

    CU_SAFE_CALL(cuTexRefSetFlags(cu_texref, 0));

    CU_SAFE_CALL(cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_UNSIGNED_INT8, 1));  

	// setup kernel

	int offset = 0;

    CU_SAFE_CALL(cuFuncSetBlockShape( func, 8, 8, 1 ));

    CU_SAFE_CALL(cuParamSeti(func, offset, d_data));

	offset += sizeof(d_data);

    CU_SAFE_CALL(cuParamSeti(func, offset, width));

	offset += sizeof(width);

	CU_SAFE_CALL(cuParamSetSize(func, offset));

    CU_SAFE_CALL(cuParamSetTexRef(func, CU_PARAM_TR_DEFAULT, cu_texref));

	// launch

    CU_SAFE_CALL(cuLaunchGrid( func, width / 8, height / 8 ));

	cuArrayDestroy(cu_array);

	cuArrayDestroy(cu_array2);

	CU_SAFE_CALL(cuMemFree(d_data));

    CU_SAFE_CALL(cuModuleUnload(module));

	cuCtxDetach(cuContext);

Note that this only happens with CUDA 1.0, CUDA 0.8 works without any problems.