Noob question: Kernel configuration failed, invalid configuration argument

Hi, I’m familiarising myself with CUDA (and re-familiarising myself with C) by trying to edit the matrixMul example from the CUDA SDK.

I’m receiving an error: "Kernel execution failed in <matrixMul.c>, line 119: invalid configuration argument."

The offending kernel configuration is:

[codebox]// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);[/codebox]

My whole code is:

[codebox]/* Matrix multiplication: C = A * B.

 * Host code.

*/

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// includes, kernels

#include <matrixMul_kernel.cu>

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void runTest(int argc, char** argv);

void printDiff(float*, float*, int, int);

extern "C"

void computeGold(float*, const float*, const float*, unsigned int, unsigned int, unsigned int);

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main(int argc, char** argv)

{

runTest(argc, argv);

cutilExit(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

void

runTest(int argc, char** argv)

{

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

    cutilDeviceInit(argc, argv);

else

    cudaSetDevice( cutGetMaxGflopsDeviceId() );

// allocate host memory for matrices A and B

unsigned int size_A = WA * HA;

unsigned int mem_size_A = sizeof(float) * size_A;

unsigned int size_B = WB * HB;

unsigned int mem_size_B = sizeof(float) * size_B;

float h_A[4][6] = {{3.1, 1.9, 13.4, -16.8, -0.1, -1.5},

				{2.7, 2.0, 16.4, -16.4, -1.6, -3.0},

				{3.1, 1.8, 15.8, -16.5, -1.3, -2.8},

				{3.3, 1.8, 14.6, -16.7, -1.1, -2.0}},

	h_B[4][6] = {{-2.0, -9.0, 5.0, 6.0, 15.0, 7.0},

			{-15.0, 9.0, 8.0, 6.0, 8.0, -4.0},

			{-8.0, -4.0, 6.0, 3.0, 2.0, 8.0},

			{-7.0, 6.0, 4.0, 2.0, -8.0, 7.0}};

			

// allocate device memory

float* d_A;

cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));

float* d_B;

cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

// copy host memory to device

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,

                          cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,

                          cudaMemcpyHostToDevice) );

// allocate device memory for result

unsigned int size_C = WC * HC;

unsigned int mem_size_C = sizeof(float) * size_C;

float* d_C;

cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));

// allocate host memory for the result

float h_C[4][4] = {{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0}};

// create and start timer

unsigned int timer = 0;

cutilCheckError(cutCreateTimer(&timer));

cutilCheckError(cutStartTimer(timer));

// setup execution parameters

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(WC / threads.x, HC / threads.y);

// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// check if kernel execution generated an error

cutilCheckMsg("Kernel execution failed");

// copy result from device to host

cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C,

                          cudaMemcpyDeviceToHost) );

// stop and destroy timer

cutilCheckError(cutStopTimer(timer));

printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));

cutilCheckError(cutDeleteTimer(timer));

// compute reference solution

float* reference = (float*) malloc(mem_size_C);

computeGold(reference, (const float*)h_A, (const float*)h_B, HA, WA, WB);

int i, j;

 for(i = 0; i < 4; i++)

{

	for( j = 0; j < 4; j++)

	printf("%f ", h_C[i][j]);

	putchar('\n'); 

}

// check result

CUTBoolean res = cutCompareL2fe(reference, (const float*)h_C, size_C, 1e-6f);

printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");

if (res!=1) printDiff(reference, (float*)h_C, WC, HC);

// clean up memory

free(h_A);

free(h_B);

free(h_C);

free(reference);

cutilSafeCall(cudaFree(d_A));

cutilSafeCall(cudaFree(d_B));

cutilSafeCall(cudaFree(d_C));

cudaThreadExit();

}

void printDiff(float *data1, float *data2, int width, int height)

{

int i,j,k;

int error_count=0;

for (j=0; j<height; j++) {

for (i=0; i<width; i++) {

  k = j*width+i;

  if (data1[k] != data2[k]) {

     printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f n", i,j, data1[k], data2[k]);

     error_count++;

  }

}

}

printf(" nTotal Errors = %d n", error_count);

}[/codebox]

Any help is greatly appreciated :)

Tom W

Hi,

where do you define these values?

WC, HC, BLOCK_SIZE, threads.x and threads.y

I’m using make, which references matrixMul.h (which I’ve edited from the original).

matrixMul.h:

[codebox]#ifndef MATRIXMUL_H

#define MATRIXMUL_H

// Thread block size

#define BLOCK_SIZE 16

// Matrix dimensions

// (chosen as multiples of the thread block size for simplicity)

#define WA 6 // Matrix A width

#define HA 4 // Matrix A height

#define WB 6 // Matrix B width

#define HB 4 // Matrix B height

#define WC 4 // Matrix C width

#define HC 4 // Matrix C height

#endif // MATRIXMUL_H[/codebox]

Aren’t threads.x and threads.y the x and y components of threads defined above? That part is unchanged from the original code in the examples.

Tom W

in integer arithmetic, x / 16 == 0 when x < 16, so you’re trying to launch 0 blocks
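
For illustration, here is a minimal sketch of one way around that (my own assumption about the intent, not code from the SDK): round the grid dimensions up with ceiling division so a matrix smaller than BLOCK_SIZE still launches one block. Note the kernel would then also need bounds checks, since each block covers 16x16 elements; the alternative is to pad the matrix dimensions up to multiples of BLOCK_SIZE, which is what the SDK sample assumes.

[codebox]// Sketch only: ceiling division so the grid never rounds down to zero blocks.
// With WC = HC = 4 and BLOCK_SIZE = 16 this launches a 1x1 grid; the kernel
// itself would still need guards against indexing past a 4x4 matrix.
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid((WC + threads.x - 1) / threads.x,
          (HC + threads.y - 1) / threads.y);

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);[/codebox]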

Ah I see, thanks for pointing that out.

Tom W

Bye!

OK, everything compiles now, but I get a segfault when I try to copy from host to device memory. (I adjusted the code to all be in one .cu file and edited the Makefile accordingly.)

[codebox]/* Matrix multiplication: C = A * B.

 * Host code.

*/

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// Thread block size

#define BLOCK_SIZE 16

// Matrix dimensions

// (chosen as multiples of the thread block size for simplicity)

#define WA (6 * BLOCK_SIZE) // Matrix A width

#define HA (4 * BLOCK_SIZE) // Matrix A height

#define WB (4 * BLOCK_SIZE) // Matrix B width

#define HB WA // Matrix B height

#define WC WB // Matrix C width

#define HC HA // Matrix C height

#define CHECK_BANK_CONFLICTS 0

#if CHECK_BANK_CONFLICTS

#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))

#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))

#else

#define AS(i, j) As[i][j]

#define BS(i, j) Bs[i][j]

#endif

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void runTest(int argc, char** argv);

void printDiff(float*, float*, int, int);

__global__ void matrixMul( float* C, float* A, float* B, int wA, int wB);

void computeGold(float*, const float*, const float*, unsigned int, unsigned int, unsigned int);

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main(int argc, char** argv)

{

runTest(argc, argv);

cutilExit(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

void

runTest(int argc, char** argv)

{

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

    cutilDeviceInit(argc, argv);

else

    cudaSetDevice( cutGetMaxGflopsDeviceId() );

// allocate host memory for matrices A and B

unsigned int size_A = WA * HA;

unsigned int mem_size_A = sizeof(float) * size_A;

unsigned int size_B = WB * HB;

unsigned int mem_size_B = sizeof(float) * size_B;

float h_A[4][6] = {{3.1, 1.9, 13.4, -16.8, -0.1, -1.5},

				{2.7, 2.0, 16.4, -16.4, -1.6, -3.0},

				{3.1, 1.8, 15.8, -16.5, -1.3, -2.8},

				{3.3, 1.8, 14.6, -16.7, -1.1, -2.0}},

	h_B[6][4] = {{-2.0, -15.0, -8.0, -7.0},

			{-9.0, 9.0, -4.0, 6.0},

			{5.0,  8.0, 6.0, 4.0},

			{6.0, 6.0, 3.0, 2.0},

			{15.0, 8.0, 2.0, -8.0},

			{7.0, -4.0, 8.0, 7.0}};

			

int i, j;

			

// allocate device memory

float* d_A;

cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));

float* d_B;

cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

// copy host memory to device

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,

                          cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,

                          cudaMemcpyHostToDevice) );

			

	for(i = 0; i < 4; i++)

{

	for( j = 0; j < 6; j++)

	printf("%f ", h_A[i][j]);

	putchar('\n'); 

}

// allocate device memory for result

unsigned int size_C = WC * HC;

unsigned int mem_size_C = sizeof(float) * size_C;

float* d_C;

cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));

// allocate host memory for the result

float h_C[4][4] = {{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0},

			{0, 0, 0, 0}};

// create and start timer

unsigned int timer = 0;

cutilCheckError(cutCreateTimer(&timer));

cutilCheckError(cutStartTimer(timer));

// setup execution parameters

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(WC / threads.x, HC / threads.y);

// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// check if kernel execution generated an error

cutilCheckMsg("Kernel execution failed");

// copy result from device to host

cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C,

                          cudaMemcpyDeviceToHost) );

// stop and destroy timer

cutilCheckError(cutStopTimer(timer));

printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));

cutilCheckError(cutDeleteTimer(timer));

// compute reference solution

float* reference = (float*) malloc(mem_size_C);

computeGold(reference, (const float*)h_A, (const float*)h_B, HA, WA, WB);



 for(i = 0; i < 4; i++)

{

	for( j = 0; j < 4; j++)

	printf("%f ", h_C[i][j]);

	putchar('\n'); 

}

// check result

CUTBoolean res = cutCompareL2fe(reference, (const float*)h_C, size_C, 1e-6f);

printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");

if (res!=1) printDiff(reference, (float*)h_C, WC, HC);

// clean up memory

free(h_A);

free(h_B);

free(h_C);

free(reference);

cutilSafeCall(cudaFree(d_A));

cutilSafeCall(cudaFree(d_B));

cutilSafeCall(cudaFree(d_C));

cudaThreadExit();

}

void

computeGold(float* C, const float* A, const float* B, unsigned int hA, unsigned int wA, unsigned int wB)

{

for (unsigned int i = 0; i < hA; ++i)

    for (unsigned int j = 0; j < wB; ++j) {

        float sum = 0;

        for (unsigned int k = 0; k < wA; ++k) {

            float a = A[i * wA + k];

            float b = B[k * wB + j];

            sum += a * b;

        }

        C[i * wB + j] = (float)sum;

    }

}

////////////////////////////////////////////////////////////////////////////////

//! Matrix multiplication on the device: C = A * B

//! wA is A's width and wB is B's width

////////////////////////////////////////////////////////////////////////////////

__global__ void

matrixMul( float* C, float* A, float* B, int wA, int wB)

{

// Block index

int bx = blockIdx.x;

int by = blockIdx.y;

// Thread index

int tx = threadIdx.x;

int ty = threadIdx.y;

// Index of the first sub-matrix of A processed by the block

int aBegin = wA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block

int aEnd   = aBegin + wA - 1;

// Step size used to iterate through the sub-matrices of A

int aStep  = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block

int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the sub-matrices of B

int bStep  = BLOCK_SIZE * wB;

// Csub is used to store the element of the block sub-matrix

// that is computed by the thread

float Csub = 0;

// Loop over all the sub-matrices of A and B

// required to compute the block sub-matrix

for (int a = aBegin, b = bBegin;

         a <= aEnd;

         a += aStep, b += bStep) {

// Declaration of the shared memory array As used to

    // store the sub-matrix of A

    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

// Declaration of the shared memory array Bs used to

    // store the sub-matrix of B

    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from device memory

    // to shared memory; each thread loads

    // one element of each matrix

    AS(ty, tx) = A[a + wA * ty + tx];

    BS(ty, tx) = B[b + wB * ty + tx];

// Synchronize to make sure the matrices are loaded

    __syncthreads();

// Multiply the two matrices together;

    // each thread computes one element

    // of the block sub-matrix

    for (int k = 0; k < BLOCK_SIZE; ++k)

        Csub += AS(ty, k) * BS(k, tx);

// Synchronize to make sure that the preceding

    // computation is done before loading two new

    // sub-matrices of A and B in the next iteration

    __syncthreads();

}

// Write the block sub-matrix to device memory;

// each thread writes one element

int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

C[c + wB * ty + tx] = Csub;

}

void printDiff(float *data1, float *data2, int width, int height)

{

int i,j,k;

int error_count=0;

for (j=0; j<height; j++) {

for (i=0; i<width; i++) {

  k = j*width+i;

  if (data1[k] != data2[k]) {

     printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f n", i,j, data1[k], data2[k]);

     error_count++;

  }

}

}

printf(" nTotal Errors = %d n", error_count);

}[/codebox]

The segfault occurs somewhere in this code block:

[codebox]// copy host memory to device

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,

                          cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,

                          cudaMemcpyHostToDevice) );[/codebox]

Again any help is greatly appreciated :)

Tom W

I think the problem is you are trying to copy a 2D array into a linear region.
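
One additional thing that may be worth checking (my own guess, not confirmed above): h_A is declared as a 4x6 stack array (24 floats), while mem_size_A is computed from the padded dimensions (WA * HA * sizeof(float)), so the cudaMemcpy reads far past the end of the host array; the same applies to h_B. Below is a minimal sketch of one way to fix that, assuming the intent is for the host buffers to match the padded device buffers; a_values and b_values are hypothetical names for the original 4x6 and 6x4 literal data.

[codebox]// Sketch only: host buffers sized to match the device buffers.
float* h_A = (float*) malloc(mem_size_A);   // WA * HA floats
float* h_B = (float*) malloc(mem_size_B);   // WB * HB floats

// zero-pad everything, then fill the top-left corner with the real data
memset(h_A, 0, mem_size_A);
memset(h_B, 0, mem_size_B);

for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 6; ++j)
        h_A[i * WA + j] = a_values[i][j];   // a_values: original 4x6 data (hypothetical name)

for (int i = 0; i < 6; ++i)
    for (int j = 0; j < 4; ++j)
        h_B[i * WB + j] = b_values[i][j];   // b_values: original 6x4 data (hypothetical name)

cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));[/codebox]

h_C would need the same treatment (WC * HC floats) before the device-to-host copy, and heap-allocating the buffers this way also makes the later free(h_A), free(h_B) and free(h_C) calls valid.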