Hi I’m familiarising myself with CUDA (and re-familiarising myself with C) by trying to edit the matrixMul example from CUDA SDK.
I’m receiving an error of "Kernel execution failed in <matrixMul.c>, line 119: invalid configuration argument.
The offending kernel configuration is:
[codebox]// execute the kernel
matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);[/codebox]
My whole code is:
[codebox]/* Matrix multiplication: C = A * B.
- Host code.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
// includes, kernels
#include <matrixMul_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);
void printDiff(float*, float*, int, int);
extern “C”
void computeGold(float*, const float*, const float*, unsigned int, unsigned int, unsigned int);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
runTest(argc, argv);
cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char** argv)
{
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float h_A[4][6] = {{3.1, 1.9, 13.4, -16.8, -0.1, -1.5},
{2.7, 2.0, 16.4, -16.4, -1.6, -3.0},
{3.1, 1.8, 15.8, -16.5, -1.3, -2.8},
{3.3, 1.8, 14.6, -16.7, -1.1, -2.0}},
h_B[4][6] = {{-2.0, -9.0, 5.0, 6.0, 15.0, 7.0},
{-15.0, 9.0, 8.0, 6.0, 8.0, -4.0},
{-8.0, -4.0, 6.0, 3.0, 2.0, 8.0},
{-7.0, 6.0, 4.0, 2.0, -8.0, 7.0}};
// allocate device memory
float* d_A;
cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));
float* d_B;
cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));
// copy host memory to device
cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,
cudaMemcpyHostToDevice) );
cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,
cudaMemcpyHostToDevice) );
// allocate device memory for result
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* d_C;
cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));
// allocate host memory for the result
float h_C[4][4] = {{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0}};
// create and start timer
unsigned int timer = 0;
cutilCheckError(cutCreateTimer(&timer));
cutilCheckError(cutStartTimer(timer));
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);
// execute the kernel
matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
// copy result from device to host
cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C,
cudaMemcpyDeviceToHost) );
// stop and destroy timer
cutilCheckError(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
cutilCheckError(cutDeleteTimer(timer));
// compute reference solution
float* reference = (float*) malloc(mem_size_C);
computeGold(reference, (const float*)h_A, (const float*)h_B, HA, WA, WB);
int i, j;
for(i = 0; i < 4; i++)
{
for( j = 0; j < 4; j++)
printf("%f ", h_C[i][j]);
putchar('\n');
}
// check result
CUTBoolean res = cutCompareL2fe(reference, (const float*)h_C, size_C, 1e-6f);
printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");
if (res!=1) printDiff(reference, (float*)h_C, WC, HC);
// clean up memory
free(h_A);
free(h_B);
free(h_C);
free(reference);
cutilSafeCall(cudaFree(d_A));
cutilSafeCall(cudaFree(d_B));
cutilSafeCall(cudaFree(d_C));
cudaThreadExit();
}
void printDiff(float *data1, float *data2, int width, int height)
{
int i,j,k;
int error_count=0;
for (j=0; j<height; j++) {
for (i=0; i<width; i++) {
k = j*width+i;
if (data1[k] != data2[k]) {
printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f n", i,j, data1[k], data2[k]);
error_count++;
}
}
}
printf(" nTotal Errors = %d n", error_count);
}[/codebox]
Any help is greatly appreciated :)
Tom W