Hi!
I’m a newbie of CUDA programming and I’m writing my simple kernel for matrix-vector multiplication. I don’t know why it doesn’t work; the application code is the following:
// REAL is essentially a float
// I've omitted some error check code
__global__ void my_mv_kernel(REAL *y, const REAL *a, const REAL *x, const int Width)
{
__shared__ REAL as[TILE_WIDTH][TILE_WIDTH];
__shared__ REAL xs[TILE_WIDTH];
REAL sum = 0.0f;
for(int m = 0; m < (Width>>5); ++m){
as[tx][ty] = a[(m << 5)+tx];
xs[tx] = x[tx];
for(int k = 0; k < TILE_WIDTH; ++k)
sum += as[tx][k] * xs[k];
}
y[tx] = sum;
}
int main(int argc, char** argv)
{
REAL* h_A;
REAL* h_x;
REAL* h_y;
REAL *d_A = 0;
REAL* d_x = 0;
REAL* d_y = 0;
n2 = N * N;
h_A = (REAL*)Malloc(n2 * sizeof(h_A[0]));
h_x = (REAL*)Malloc(N * sizeof(h_x[0]));
cudaMallocHost((void **)&h_y, N*sizeof(h_y[0]));
blkNum = (N >> 4) + ((N & 15) ? 1 : 0);
dim3 threads(16, 16);
dim3 grid(blkNum, 1);
for (i = 0; i < n2; i++) {
h_A[i] = (REAL)drand48();
}
cudaMalloc((void **) &d_x, N * sizeof(REAL));
cudaMalloc((void **) &d_y, N * sizeof(REAL));
for (i = 0; i < N; i++) {
h_y[i] = (REAL)0;
}
cudaMemcpy(d_y, h_y, N * sizeof(REAL), cudaMemcpyHostToDevice);
for (j = 0; j < N; j++) {
h_x[j] = (REAL)drand48();
}
CUDA_SAFE_CALL(cudaMemcpy(d_x, h_x, N * sizeof(REAL), cudaMemcpyHostToDevice));
my_mv_kernel<<< grid, threads >>>(d_y, d_A, d_x, N);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
...
}
The error “Unspecified launch failure” show as guilty line in which there is CUDA_SAFE_CALL( cudaThreadSynchronize() ). What can I do?