Noob question: Kernel configuration failed, invalid configuration argument

Hi, I’m familiarising myself with CUDA (and re-familiarising myself with C) by trying to edit the matrixMul example from the CUDA SDK.

I’m receiving an error: "Kernel execution failed in <matrixMul.c>, line 119: invalid configuration argument."

The offending kernel configuration is:

[codebox]// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);[/codebox]

My whole code is:

[codebox]/* Matrix multiplication: C = A * B.

 * Host code.

*/

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// includes, kernels

#include <matrixMul_kernel.cu>

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void runTest(int argc, char** argv);

void printDiff(float*, float*, int, int);

extern "C"

void computeGold(float*, const float*, const float*, unsigned int, unsigned int, unsigned int);

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main(int argc, char** argv)

{

runTest(argc, argv);

cutilExit(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

void

runTest(int argc, char** argv)

{

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

    cutilDeviceInit(argc, argv);

else

    cudaSetDevice( cutGetMaxGflopsDeviceId() );

// allocate host memory for matrices A and B

unsigned int size_A = WA * HA;

unsigned int mem_size_A = sizeof(float) * size_A;

unsigned int size_B = WB * HB;

unsigned int mem_size_B = sizeof(float) * size_B;

float h_A[4][6] = {{3.1, 1.9, 13.4, -16.8, -0.1, -1.5},

				{2.7, 2.0, 16.4, -16.4, -1.6, -3.0},

				{3.1, 1.8, 15.8, -16.5, -1.3, -2.8},

				{3.3, 1.8, 14.6, -16.7, -1.1, -2.0}},

	h_B[4][6] = {{-2.0, -9.0, 5.0, 6.0, 15.0, 7.0},

			{-15.0, 9.0, 8.0, 6.0, 8.0, -4.0},

			{-8.0, -4.0, 6.0, 3.0, 2.0, 8.0},

			{-7.0, 6.0, 4.0, 2.0, -8.0, 7.0}};

			

// allocate device memory

float* d_A;

cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));

float* d_B;

cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

// copy host memory to device

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,

                          cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,

                          cudaMemcpyHostToDevice) );

// allocate device memory for result

unsigned int size_C = WC * HC;

unsigned int mem_size_C = sizeof(float) * size_C;

float* d_C;

cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));

// allocate host memory for the result

float h_C[4][4] = {{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0}};

// create and start timer

unsigned int timer = 0;

cutilCheckError(cutCreateTimer(&timer));

cutilCheckError(cutStartTimer(timer));

// setup execution parameters

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(WC / threads.x, HC / threads.y);

// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// check if kernel execution generated an error

cutilCheckMsg("Kernel execution failed");

// copy result from device to host

cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C,

                          cudaMemcpyDeviceToHost) );

// stop and destroy timer

cutilCheckError(cutStopTimer(timer));

printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));

cutilCheckError(cutDeleteTimer(timer));

// compute reference solution

float* reference = (float*) malloc(mem_size_C);

computeGold(reference, (const float*)h_A, (const float*)h_B, HA, WA, WB);

int i, j;

 for(i = 0; i < 4; i++)

{

	for( j = 0; j < 4; j++)

	printf("%f ", h_C[i][j]);

	putchar('\n'); 

}

// check result

CUTBoolean res = cutCompareL2fe(reference, (const float*)h_C, size_C, 1e-6f);

printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");

if (res!=1) printDiff(reference, (float*)h_C, WC, HC);

// clean up memory

free(h_A);

free(h_B);

free(h_C);

free(reference);

cutilSafeCall(cudaFree(d_A));

cutilSafeCall(cudaFree(d_B));

cutilSafeCall(cudaFree(d_C));

cudaThreadExit();

}

void printDiff(float *data1, float *data2, int width, int height)

{

int i,j,k;

int error_count=0;

for (j=0; j<height; j++) {

for (i=0; i<width; i++) {

  k = j*width+i;

  if (data1[k] != data2[k]) {

     printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f n", i,j, data1[k], data2[k]);

     error_count++;

  }

}

}

printf(" nTotal Errors = %d n", error_count);

}[/codebox]

Any help is greatly appreciated :)

Tom W

Hi,

where do you define these values?

WC, HC, BLOCK_SIZE, threads.x and threads.y

I’m using make, which references matrixMul.h (which I’ve edited from the original).

matrixMul.h:

[codebox]#ifndef MATRIXMUL_H

#define MATRIXMUL_H

// Thread block size

#define BLOCK_SIZE 16

// Matrix dimensions

// (chosen as multiples of the thread block size for simplicity)

#define WA 6 // Matrix A width

#define HA 4 // Matrix A height

#define WB 6 // Matrix B width

#define HB 4 // Matrix B height

#define WC 4 // Matrix C width

#define HC 4 // Matrix C height

#endif // MATRIXMUL_H[/codebox]

Aren’t threads.x and threads.y the x and y components of threads defined above? That part is unchanged from the original code in the examples.

Tom W

in integer arithmetic, x / 16 == 0 when x < 16, so you’re trying to launch 0 blocks
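
For illustration, here is a minimal sketch of one way around that (my own assumption about the intent, not code from the SDK): round the grid dimensions up with ceiling division so a matrix smaller than BLOCK_SIZE still launches one block. Note the kernel would then also need bounds checks, since each block covers 16x16 elements; the alternative is to pad the matrix dimensions up to multiples of BLOCK_SIZE, which is what the SDK sample assumes.

[codebox]// Sketch only: ceiling division so the grid never rounds down to zero blocks.
// With WC = HC = 4 and BLOCK_SIZE = 16 this launches a 1x1 grid; the kernel
// itself would still need guards against indexing past a 4x4 matrix.
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid((WC + threads.x - 1) / threads.x,
          (HC + threads.y - 1) / threads.y);

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);[/codebox]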

Ah I see, thanks for pointing that out.

Tom W

Bye!

OK, everything compiles now, but I get a segfault when I try to copy from host to device memory. (I adjusted the code to all be in one .cu file and edited the Makefile accordingly.)

[codebox]/* Matrix multiplication: C = A * B.

 * Host code.

*/

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// Thread block size

#define BLOCK_SIZE 16

// Matrix dimensions

// (chosen as multiples of the thread block size for simplicity)

#define WA (6 * BLOCK_SIZE) // Matrix A width

#define HA (4 * BLOCK_SIZE) // Matrix A height

#define WB (4 * BLOCK_SIZE) // Matrix B width

#define HB WA // Matrix B height

#define WC WB // Matrix C width

#define HC HA // Matrix C height

#define CHECK_BANK_CONFLICTS 0

#if CHECK_BANK_CONFLICTS

#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))

#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))

#else

#define AS(i, j) As[i][j]

#define BS(i, j) Bs[i][j]

#endif

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void runTest(int argc, char** argv);

void printDiff(float*, float*, int, int);

__global__ void matrixMul( float* C, float* A, float* B, int wA, int wB);

void computeGold(float*, const float*, const float*, unsigned int, unsigned int, unsigned int);

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main(int argc, char** argv)

{

runTest(argc, argv);

cutilExit(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

void

runTest(int argc, char** argv)

{

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

    cutilDeviceInit(argc, argv);

else

    cudaSetDevice( cutGetMaxGflopsDeviceId() );

// allocate host memory for matrices A and B

unsigned int size_A = WA * HA;

unsigned int mem_size_A = sizeof(float) * size_A;

unsigned int size_B = WB * HB;

unsigned int mem_size_B = sizeof(float) * size_B;

float h_A[4][6] = {{3.1, 1.9, 13.4, -16.8, -0.1, -1.5},

				{2.7, 2.0, 16.4, -16.4, -1.6, -3.0},

				{3.1, 1.8, 15.8, -16.5, -1.3, -2.8},

				{3.3, 1.8, 14.6, -16.7, -1.1, -2.0}},

	h_B[6][4] = {{-2.0, -15.0, -8.0, -7.0},

			{-9.0, 9.0, -4.0, 6.0},

			{5.0,  8.0, 6.0, 4.0},

			{6.0, 6.0, 3.0, 2.0},

			{15.0, 8.0, 2.0, -8.0},

			{7.0, -4.0, 8.0, 7.0}};

			

int i, j;

			

// allocate device memory

float* d_A;

cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));

float* d_B;

cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

// copy host memory to device

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,

                          cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,

                          cudaMemcpyHostToDevice) );

			

	for(i = 0; i < 4; i++)

{

	for( j = 0; j < 6; j++)

	printf("%f ", h_A[i][j]);

	putchar('\n'); 

}

// allocate device memory for result

unsigned int size_C = WC * HC;

unsigned int mem_size_C = sizeof(float) * size_C;

float* d_C;

cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));

// allocate host memory for the result

float h_C[4][4] = {{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0}};

// create and start timer

unsigned int timer = 0;

cutilCheckError(cutCreateTimer(&timer));

cutilCheckError(cutStartTimer(timer));

// setup execution parameters

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(WC / threads.x, HC / threads.y);

// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// check if kernel execution generated an error

cutilCheckMsg("Kernel execution failed");

// copy result from device to host

cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C,

                          cudaMemcpyDeviceToHost) );

// stop and destroy timer

cutilCheckError(cutStopTimer(timer));

printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));

cutilCheckError(cutDeleteTimer(timer));

// compute reference solution

float* reference = (float*) malloc(mem_size_C);

computeGold(reference, (const float*)h_A, (const float*)h_B, HA, WA, WB);



 for(i = 0; i < 4; i++)

{

	for( j = 0; j < 4; j++)

	printf("%f ", h_C[i][j]);

	putchar('\n'); 

}

// check result

CUTBoolean res = cutCompareL2fe(reference, (const float*)h_C, size_C, 1e-6f);

printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");

if (res!=1) printDiff(reference, (float*)h_C, WC, HC);

// clean up memory

free(h_A);

free(h_B);

free(h_C);

free(reference);

cutilSafeCall(cudaFree(d_A));

cutilSafeCall(cudaFree(d_B));

cutilSafeCall(cudaFree(d_C));

cudaThreadExit();

}

void

computeGold(float* C, const float* A, const float* B, unsigned int hA, unsigned int wA, unsigned int wB)

{

for (unsigned int i = 0; i < hA; ++i)

    for (unsigned int j = 0; j < wB; ++j) {

        float sum = 0;

        for (unsigned int k = 0; k < wA; ++k) {

            float a = A[i * wA + k];

            float b = B[k * wB + j];

            sum += a * b;

        }

        C[i * wB + j] = (float)sum;

    }

}

////////////////////////////////////////////////////////////////////////////////

//! Matrix multiplication on the device: C = A * B

//! wA is A's width and wB is B's width

////////////////////////////////////////////////////////////////////////////////

__global__ void

matrixMul( float* C, float* A, float* B, int wA, int wB)

{

// Block index

int bx = blockIdx.x;

int by = blockIdx.y;

// Thread index

int tx = threadIdx.x;

int ty = threadIdx.y;

// Index of the first sub-matrix of A processed by the block

int aBegin = wA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block

int aEnd   = aBegin + wA - 1;

// Step size used to iterate through the sub-matrices of A

int aStep  = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block

int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the sub-matrices of B

int bStep  = BLOCK_SIZE * wB;

// Csub is used to store the element of the block sub-matrix

// that is computed by the thread

float Csub = 0;

// Loop over all the sub-matrices of A and B

// required to compute the block sub-matrix

for (int a = aBegin, b = bBegin;

         a <= aEnd;

         a += aStep, b += bStep) {

// Declaration of the shared memory array As used to

    // store the sub-matrix of A

    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

// Declaration of the shared memory array Bs used to

    // store the sub-matrix of B

    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from device memory

    // to shared memory; each thread loads

    // one element of each matrix

    AS(ty, tx) = A[a + wA * ty + tx];

    BS(ty, tx) = B[b + wB * ty + tx];

// Synchronize to make sure the matrices are loaded

    __syncthreads();

// Multiply the two matrices together;

    // each thread computes one element

    // of the block sub-matrix

    for (int k = 0; k < BLOCK_SIZE; ++k)

        Csub += AS(ty, k) * BS(k, tx);

// Synchronize to make sure that the preceding

    // computation is done before loading two new

    // sub-matrices of A and B in the next iteration

    __syncthreads();

}

// Write the block sub-matrix to device memory;

// each thread writes one element

int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

C[c + wB * ty + tx] = Csub;

}

void printDiff(float *data1, float *data2, int width, int height)

{

int i,j,k;

int error_count=0;

for (j=0; j<height; j++) {

for (i=0; i<width; i++) {

  k = j*width+i;

  if (data1[k] != data2[k]) {

     printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f n", i,j, data1[k], data2[k]);

     error_count++;

  }

}

}

printf(" nTotal Errors = %d n", error_count);

}[/codebox]

The segfault occurs somewhere in this code block:

[codebox]// copy host memory to device

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,

                          cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,

                          cudaMemcpyHostToDevice) );[/codebox]

Again any help is greatly appreciated :)

Tom W

I think the problem is you are trying to copy a 2D array into a linear region.
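
One additional thing that may be worth checking (my own guess, not confirmed above): h_A is declared as a 4x6 stack array (24 floats), while mem_size_A is computed from the padded dimensions (WA * HA * sizeof(float)), so the cudaMemcpy reads far past the end of the host array; the same applies to h_B. Below is a minimal sketch of one way to fix that, assuming the intent is for the host buffers to match the padded device buffers; a_values and b_values are hypothetical names for the original 4x6 and 6x4 literal data.

[codebox]// Sketch only: host buffers sized to match the device buffers.
float* h_A = (float*) malloc(mem_size_A);   // WA * HA floats
float* h_B = (float*) malloc(mem_size_B);   // WB * HB floats

// zero-pad everything, then fill the top-left corner with the real data
memset(h_A, 0, mem_size_A);
memset(h_B, 0, mem_size_B);

for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 6; ++j)
        h_A[i * WA + j] = a_values[i][j];   // a_values: original 4x6 data (hypothetical name)

for (int i = 0; i < 6; ++i)
    for (int j = 0; j < 4; ++j)
        h_B[i * WB + j] = b_values[i][j];   // b_values: original 6x4 data (hypothetical name)

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));[/codebox]

h_C would need the same treatment (WC * HC floats) before the device-to-host copy, and heap-allocating the buffers this way also makes the later free(h_A), free(h_B) and free(h_C) calls valid.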