Unspecified launch failure when launching my kernel

Hi!

I’m new to CUDA programming and I’m writing a simple kernel for matrix-vector multiplication. I don’t know why it doesn’t work; the application code is the following:

// REAL is essentially a float

// I've omitted some error check code

// Matrix-vector multiplication kernel (as posted): each thread accumulates a
// partial dot product in `sum` from tiles staged in shared memory, then stores
// it to y[tx].
//   y     - output vector, written at index tx
//   a     - input matrix, read at index (m << 5) + tx
//   x     - input vector, read at index tx
//   Width - problem size; the tile loop runs (Width >> 5) times
// NOTE(review): tx and ty are not declared in this snippet - presumably
// threadIdx.x / threadIdx.y; confirm against the full source.
// NOTE(review): TILE_WIDTH is defined elsewhere; the shifts by 5 below
// hard-code a factor of 32 - verify that this matches TILE_WIDTH and the
// block dimensions used at launch.
__global__ void my_mv_kernel(REAL *y, const REAL *a, const REAL *x, const int Width)

{

	// Per-block staging buffers for one tile of the matrix and of the vector.
	__shared__ REAL as[TILE_WIDTH][TILE_WIDTH];

	__shared__ REAL xs[TILE_WIDTH];

	

	// Per-thread accumulator.
	REAL sum = 0.0f;

	// One iteration per tile; (m << 5) is m * 32.
	for(int m = 0; m < (Width>>5); ++m){

		// NOTE(review): the index into `a` does not depend on ty or on any
		// block index, so all threads with the same tx load the same element
		// - confirm that this is the intended matrix layout.
		as[tx][ty] = a[(m << 5)+tx];

		// NOTE(review): the index into `x` does not depend on m, so the same
		// elements are reloaded on every iteration - confirm intent.
		xs[tx] = x[tx];

		// NOTE(review): as[tx][k] and xs[k] are written by other threads of
		// the block, but there is no __syncthreads() between the stores above
		// and these reads, nor before the next iteration overwrites the tiles.
		for(int k = 0; k < TILE_WIDTH; ++k)

			sum += as[tx][k] * xs[k];

	}

	// NOTE(review): no bounds check against Width, and the output index uses
	// only tx - threads sharing a tx value all write the same element.
	y[tx] = sum;

}

// Host driver (as posted): allocates host and device buffers, fills the matrix
// and vector with random values, launches my_mv_kernel and waits for it.
// NOTE(review): N, n2, blkNum, i and j are not declared in this snippet, and
// part of the body is elided ("..."); they are presumably defined in the
// omitted code.
int main(int argc, char** argv)

{

	// Host-side buffers.
	REAL* h_A;

	REAL* h_x;

	REAL* h_y;

	// Device-side pointers, initialised to 0 (null) until allocated.
	REAL *d_A = 0;

	REAL* d_x = 0;

	REAL* d_y = 0;

	// Number of matrix elements.
	n2 = N * N;

	// NOTE(review): Malloc (capital M) is not defined here - presumably a
	// checked wrapper around malloc; confirm.
	h_A = (REAL*)Malloc(n2 * sizeof(h_A[0]));

	h_x = (REAL*)Malloc(N * sizeof(h_x[0]));

	// h_y is allocated as page-locked (pinned) host memory.
	cudaMallocHost((void **)&h_y, N*sizeof(h_y[0]));

	// Number of blocks: N / 16, rounded up.
	blkNum = (N >> 4) + ((N & 15) ? 1 : 0);

	// 16 x 16 threads per block, blkNum blocks along x.
	dim3 threads(16, 16);

	dim3 grid(blkNum, 1);

	// Fill the host matrix with random values.
	for (i = 0; i < n2; i++) {

	  h_A[i] = (REAL)drand48();

	}

	// NOTE(review): only d_x and d_y are allocated on the device; no
	// cudaMalloc or cudaMemcpy for d_A is visible in this snippet.
	cudaMalloc((void **) &d_x, N * sizeof(REAL));

	cudaMalloc((void **) &d_y, N * sizeof(REAL));

	// Zero the host result vector and copy it to the device.
	for (i = 0; i < N; i++) {

	  h_y[i] = (REAL)0;

	}

	cudaMemcpy(d_y, h_y, N * sizeof(REAL), cudaMemcpyHostToDevice);

	// Fill the host input vector with random values and copy it to the device.
	for (j = 0; j < N; j++) {

		h_x[j] = (REAL)drand48();

	}

	CUDA_SAFE_CALL(cudaMemcpy(d_x, h_x, N * sizeof(REAL), cudaMemcpyHostToDevice));

	// NOTE(review): d_A is still 0 at this point in the visible code, so the
	// kernel's reads through `a` dereference a null device pointer.
	my_mv_kernel<<< grid, threads >>>(d_y, d_A, d_x, N);

	// Waits for the kernel to finish; faults raised during kernel execution
	// are reported by this call rather than at the launch line.
	// NOTE(review): cudaThreadSynchronize() is deprecated in favour of
	// cudaDeviceSynchronize().
	CUDA_SAFE_CALL( cudaThreadSynchronize() );

	...

}

The error “Unspecified launch failure” is reported at the line containing CUDA_SAFE_CALL( cudaThreadSynchronize() ). What can I do?

Apparently you have not allocated memory on the GPU for “d_A”, so it is still a null pointer (0) when it is passed to the kernel. Allocate device memory for it with cudaMalloc, copy h_A into it with cudaMemcpy, and then pass it to the kernel.

Every pointer that you pass to a kernel should point to device memory (the exception is page-locked host memory that has been mapped into the device address space).

Exactly, you’re right!!! Now it works. Thanks a lot.