Hi all,
I’m performing a very simple operation that’s falling over on me: matrix addition.
I’m using the NVIDIA SDK code sample.
I keep getting this error:
“cutilCheckMsg() CUTIL CUDA error: kernel launch failure in file <./matAdd.cu>, line 24 : invalid configuration argument.”
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include "cublas.h"
#include <cuda.h>
#include <cutil_inline.h>
typedef struct{
long w;
long h;
float* data;
}Matrix;
#include "./matAdd.cu"
void read_matrix_col_major(Matrix M,const char *fn)
{
long i,j;
char line[M.w*50]; //enough space
FILE *IN;
IN=fopen(fn,"r");
i=0;
while(fgets(line,sizeof(line),IN)!=NULL)
{
char *result;
result=strtok(line,",");
j=0;
while(result)
{
M.data[j*M.h+i]=(float)atof(result);
result=strtok(NULL,",");
j++;
}
i++;
}
fclose(IN);
}
void printUL_col_major(Matrix A) //prints the 10x10 upper-left corner of A
{
//prints the 10x10 upper-left sub-matrix
int i,j;
for(i=0;i<A.w;i++)
{
for(j=0;j<A.h;j++)
{
printf("%f ",A.data[i*A.h+j]);
if(j>10)
break;
}
printf("\n");
if(i>10)
break;
}
printf("\n\n");
}
int main()
{
long num_dims,num_hid;
num_dims=9600;
num_hid=4800;
Matrix X;
X.h=num_hid;X.w=num_dims;
X.data=(float*)calloc(X.h*X.w,sizeof(float));
Matrix Y;
Y.h=num_hid;Y.w=num_dims;
Y.data=(float*)calloc(Y.h*Y.w,sizeof(float));
Matrix Z;
Z.h=num_hid;Z.w=num_dims;
Z.data=(float*)calloc(Z.h*Z.w,sizeof(float));
read_matrix_col_major(X,"./X.txt");
read_matrix_col_major(Y,"./Y.txt");
Matrix X_d;
X_d.h=X.h;X_d.w=X.w;
cudaMalloc((void**)&X_d.data,X_d.h*X_d.w*sizeof(float));
cudaMemcpy(X_d.data,X.data,X.h*X.w*sizeof(float),cudaMemcpyHostToDevice);
Matrix Y_d;
Y_d.h=Y.h;Y_d.w=Y.w;
cudaMalloc((void**)&Y_d.data,Y_d.h*Y_d.w*sizeof(float));
cudaMemcpy(Y_d.data,Y.data,Y.h*Y.w*sizeof(float),cudaMemcpyHostToDevice);
Matrix Z_d;
Z_d.h=Z.h;Z_d.w=Z.w;
cudaMalloc((void**)&Z_d.data,Z_d.h*Z_d.w*sizeof(float));
cudaMemcpy(Z_d.data,Z.data,Z.h*Z.w*sizeof(float),cudaMemcpyHostToDevice);
//print matrices before operation:
cudaMemcpy(X.data,X_d.data,X_d.h*X_d.w*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(Y.data,Y_d.data,Y_d.h*Y_d.w*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(Z.data,Z_d.data,Z_d.h*Z_d.w*sizeof(float),cudaMemcpyDeviceToHost);
printf("X:\n");
printUL_col_major(X);
printf("Y:\n");
printUL_col_major(Y);
printf("Z:\n");
printUL_col_major(Z);
//perform the addition:
printf("performing sum operation:\n");
matAdd(Z_d,X_d,Y_d);
//print resulting operation:
cudaMemcpy(Z.data,Z_d.data,Z_d.h*Z_d.w*sizeof(float),cudaMemcpyDeviceToHost);
printf("Z:\n");
printUL_col_major(Z);
return 0;
}
And here’s the matAdd.cu kernel:
__global__ void matAdd_kernel(float *A,float *B,float *C,int N)
{
int block_id=blockIdx.x+gridDim.x*blockIdx.y;
int thread_id=blockDim.x*block_id+threadIdx.x;
if(thread_id<N)
{
C[thread_id]=A[thread_id]+B[thread_id];
}
}
void matAdd(Matrix C,Matrix A,Matrix B)
{
if(!(A.h==B.h && A.w==B.w && C.h==A.h && C.w==A.w))
{
printf("Error: matAdd.cu\nMatrix addition attempted.\nMatrix dimensions do not agree!\n");
printf("Tried A+B, where A: %ldx%ld, B: %ldx%ld\n",A.h,A.w,B.h,B.w);
exit(0);
}
int N=A.h*A.w;
int threadsPerBlock=256;
int blocksPerGrid=(N+threadsPerBlock-1)/threadsPerBlock;
matAdd_kernel<<<blocksPerGrid,threadsPerBlock>>>(A.data,B.data,C.data,N);
cutilCheckMsg("kernel launch failure");
}
Everything’s compiling fine, it’s just I keep getting a “invalid configuration argument” at run-time.
When I take out the line “cutilCheckMsg(“kernel launch failure”);”, it runs fine, but the matrix Z is empty… ;(
I’m using an Tesla C1060 with 4GB of RAM.
X is 4800x9600 and Y is 4800x9600…
Is this too big?
Also, here’s some output: