"unspecified launch failure" with large size (but within 256 bytes) kernel arguments

Hi,

The following code exits with “unspecified launch failure” on Windows XP with both CUDA 2.2 and CUDA 2.3:

[codebox]

#include <iostream>

#include <cuda.h>

#include <cutil_inline.h>

#define NUM_ELEMENTS 8

// Observed behaviour as a function of the length of KernelParams::junk:

// 0-112 : no crash

// 113-199 : crash

// 200 : no crash

// 201-231: crash

// Parameter block that main() passes by value to kernel().
struct KernelParams

{

// Filler that is never read in the code shown; its length is the value being
// varied to trigger or avoid the launch failure.
bool junk[133];

// When true, kernel() overwrites each element it has just written with 2.
bool write_two;

};

// Data block that main() passes by value to kernel().
struct KernelData

{

// Output buffer; main() obtains it from cudaMalloc and kernel() indexes it
// with the thread's global index.
float* d_buf;

};

/**
 * Repro kernel: every thread whose global index is below NUM_ELEMENTS stores
 * 1 into kd.d_buf[index] and then, if kp.write_two is set, overwrites it
 * with 2.
 *
 * Launch layout: 1-D grid of 1-D blocks; threads past NUM_ELEMENTS exit
 * immediately, so any block size that covers NUM_ELEMENTS is valid.
 *
 * @param kp  Parameter struct passed by value; only write_two is read.
 * @param kd  Holds the device output buffer (at least NUM_ELEMENTS floats).
 */
__global__ void kernel(KernelParams kp, KernelData kd)
{
    const int loc = blockDim.x * blockIdx.x + threadIdx.x;

    // Tail guard: the launch may supply more threads than elements.
    if (loc >= NUM_ELEMENTS)
        return;

    // Both stores are deliberate: the reported failure only occurs when the
    // unconditional store and the conditional store are both present.
    kd.d_buf[loc] = 1;

    if (kp.write_two)
    {
        kd.d_buf[loc] = 2;
    }
}

/**
 * Host driver for the repro: allocates and zeroes a NUM_ELEMENTS-float device
 * buffer, launches kernel() with the two argument structs passed by value,
 * copies the buffer back and prints it.
 *
 * @return 0 on completion (the cutil checks are expected to abort on a CUDA
 *         error - TODO confirm against the cutil version in use).
 */
int main(int argc, char** argv)
{
    // Structs to be passed to the kernel
    KernelParams kp;
    KernelData   kd;

    std::cout << "Size of parameters = " << (sizeof(kp) + sizeof(kd)) << " bytes\n" << std::endl;

    // kp.junk is intentionally left as-is; the kernel never reads it.
    kp.write_two = true;

    // Allocate device output array
    int size = NUM_ELEMENTS * sizeof(float);
    cutilSafeCall(cudaMalloc((void**)&kd.d_buf, size));

    // Memset to 0
    cutilSafeCall(cudaMemset(kd.d_buf, 0, size));
    cutilSafeCall(cudaThreadSynchronize());

    // Launch kernel
    dim3 threadBlock(16, 1);
    dim3 blockGrid(1, 1);
    kernel<<<blockGrid, threadBlock>>>(kp, kd);

    // Wait for the kernel; the result is deliberately not wrapped so that the
    // failure is reported by the cutilCheckMsg call below with its message.
    cudaThreadSynchronize();
    cutilCheckMsg("kernel execution failed\n");

    // Print results.
    // The host buffer must hold all NUM_ELEMENTS floats: cudaMemcpy writes
    // `size` bytes into it and the loop below reads every element.
    float h_buf[NUM_ELEMENTS];
    cutilSafeCall(cudaMemcpy(h_buf, kd.d_buf, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < NUM_ELEMENTS; i++)
        std::cout << h_buf[i] << " ";
    std::cout << std::endl;

    // Clean up
    cutilSafeCall(cudaFree(kd.d_buf));

    return 0;
}

[/codebox]

I don’t see anything wrong with the code (has been isolated from a larger cuda program).

There are several ways to make it not crash:

  1. In the KernelParams struct, change array length of .junk to anywhere between 1-112 or 200. Anything else causes it to crash (didn’t try exhaustively, but took a good number of samples)

  2. In the KernelParams struct, change the ‘bool write_two’ to int

  3. In the KernelParams struct, change the ‘bool junk’ array to an int array (reduce size to fit within the 256 byte limit). (However adding another bool between this int array and the bool write_two will cause a crash)

  4. In the kernel function, comment out either write to kd.d_buf.

Workarounds 1, 2 and 3 suggest that the problem is related to alignment; however, commenting out the first write to kd.d_buf in the kernel (without changing the KernelParams struct) also works, and that has nothing to do with alignment.

Is this a bug, or am I missing something?