Matrix Addition Failing Simple matrix addition acting up...

Hi all,

I’m performing a very simple operation that’s falling over on me: matrix addition.

I’m using the NVIDIA SDK code sample.

I keep getting this error:

“cutilCheckMsg() CUTIL CUDA error: kernel launch failure in file <./matAdd.cu>, line 24 : invalid configuration argument.”

Here is the code:

#include <stdio.h>

#include <stdlib.h>

#include <sys/time.h>

#include <sys/types.h>

#include <unistd.h>

#include "cublas.h"

#include <cuda.h>

#include <cutil_inline.h>

typedef struct{

		long w;

		long h;

		float* data;

}Matrix;

#include "./matAdd.cu"

void read_matrix_col_major(Matrix M,const char *fn)

{

		long i,j;

		char line[M.w*50];   //enough space

		FILE *IN;

		IN=fopen(fn,"r");

		i=0;

		while(fgets(line,sizeof(line),IN)!=NULL)

		{

				char *result;

				result=strtok(line,",");

				j=0;

				while(result)

				{

						M.data[j*M.h+i]=(float)atof(result);

						result=strtok(NULL,",");

						j++;

				}

				i++;

		}

		fclose(IN);

}

void printUL_col_major(Matrix A)   //prints the 10x10 upper-left corner of A

{

	//prints the 10x10 upper-left sub-matrix

		int i,j;

		for(i=0;i<A.w;i++)

		{

				for(j=0;j<A.h;j++)

				{

						printf("%f ",A.data[i*A.h+j]);

						if(j>10)

								break;

				}

				printf("\n");

				if(i>10)

						break;

		}

		printf("\n\n");

}

int main()

{

		long num_dims,num_hid;

		num_dims=9600;

		num_hid=4800;

		Matrix X;

		X.h=num_hid;X.w=num_dims;

		X.data=(float*)calloc(X.h*X.w,sizeof(float));

		Matrix Y;

		Y.h=num_hid;Y.w=num_dims;

		Y.data=(float*)calloc(Y.h*Y.w,sizeof(float));

		Matrix Z;

		Z.h=num_hid;Z.w=num_dims;

		Z.data=(float*)calloc(Z.h*Z.w,sizeof(float));

		read_matrix_col_major(X,"./X.txt");

		read_matrix_col_major(Y,"./Y.txt");

		Matrix X_d;

		X_d.h=X.h;X_d.w=X.w;

		cudaMalloc((void**)&X_d.data,X_d.h*X_d.w*sizeof(float));

		cudaMemcpy(X_d.data,X.data,X.h*X.w*sizeof(float),cudaMemcpyHostToDevice);

		Matrix Y_d;

		Y_d.h=Y.h;Y_d.w=Y.w;

		cudaMalloc((void**)&Y_d.data,Y_d.h*Y_d.w*sizeof(float));

		cudaMemcpy(Y_d.data,Y.data,Y.h*Y.w*sizeof(float),cudaMemcpyHostToDevice);

		Matrix Z_d;

		Z_d.h=Z.h;Z_d.w=Z.w;

		cudaMalloc((void**)&Z_d.data,Z_d.h*Z_d.w*sizeof(float));

		cudaMemcpy(Z_d.data,Z.data,Z.h*Z.w*sizeof(float),cudaMemcpyHostToDevice);

		//print matrices before operation:

		cudaMemcpy(X.data,X_d.data,X_d.h*X_d.w*sizeof(float),cudaMemcpyDeviceToHost);

		cudaMemcpy(Y.data,Y_d.data,Y_d.h*Y_d.w*sizeof(float),cudaMemcpyDeviceToHost);

		cudaMemcpy(Z.data,Z_d.data,Z_d.h*Z_d.w*sizeof(float),cudaMemcpyDeviceToHost);

		printf("X:\n");

		printUL_col_major(X);

		printf("Y:\n");

		printUL_col_major(Y);

		printf("Z:\n");

		printUL_col_major(Z);

		//perform the addition:

		printf("performing sum operation:\n");

		matAdd(Z_d,X_d,Y_d);

		//print resulting operation:

		cudaMemcpy(Z.data,Z_d.data,Z_d.h*Z_d.w*sizeof(float),cudaMemcpyDeviceToHost);

		printf("Z:\n");

		printUL_col_major(Z);

		return 0;

}

And here’s the matAdd.cu kernel:

__global__ void matAdd_kernel(float *A,float *B,float *C,int N)

{

		int block_id=blockIdx.x+gridDim.x*blockIdx.y;

		int thread_id=blockDim.x*block_id+threadIdx.x;

		if(thread_id<N)

		{

				C[thread_id]=A[thread_id]+B[thread_id];

		}

}

void matAdd(Matrix C,Matrix A,Matrix B)

{

		if(!(A.h==B.h && A.w==B.w && C.h==A.h && C.w==A.w))

		{

				printf("Error: matAdd.cu\nMatrix addition attempted.\nMatrix dimensions do not agree!\n");

				printf("Tried A+B, where A: %ldx%ld, B: %ldx%ld\n",A.h,A.w,B.h,B.w);

				exit(0);

		}

		int N=A.h*A.w;

		int threadsPerBlock=256;

		int blocksPerGrid=(N+threadsPerBlock-1)/threadsPerBlock;

		matAdd_kernel<<<blocksPerGrid,threadsPerBlock>>>(A.data,B.data,C.data,N);

		cutilCheckMsg("kernel launch failure");

}

Everything’s compiling fine, it’s just I keep getting a “invalid configuration argument” at run-time.

When I take out the line “cutilCheckMsg(“kernel launch failure”);”, it runs fine, but the matrix Z is empty… ;(

I’m using an Tesla C1060 with 4GB of RAM.

X is 4800x9600 and Y is 4800x9600…

Is this too big?

Also, here’s some output:

No, but it’s too big for one dimension. Use a two-dimensional grid to get enough blocks.

You can check yourself in Appendix G.1 of the Programming Guide.

Hi thanks for that.

Sorry to be such a school boy, but could you suggest a fix? Do you know of any 2-D add kernels?

I tried this:

__global__ void matAdd_kernel(float *A,float *B,float *C,int N)

{

		int i=threadIdx.x;

		int j=threadIdx.y;

		C[j*N+i]=A[j*N+i]+B[j*N+i];

}

but I think it’s wrong…

Edit: CUDA Fail… :( sorry…

Try this:

void matAdd(Matrix C,Matrix A,Matrix B)

{

		if(!(A.h==B.h && A.w==B.w && C.h==A.h && C.w==A.w))

		{

				printf("Error: matAdd.cu\nMatrix addition attempted.\nMatrix dimensions do not agree!\n");

				printf("Tried A+B, where A: %ldx%ld, B: %ldx%ld\n",A.h,A.w,B.h,B.w);

				exit(0);

		}

		int threadsPerBlock = 64;

		dim3 blocksPerGrid;

		blocksPerGrid.x = (A.w+threadsPerBlock-1)/threadsPerBlock;

		blocksPerGrid.y = A.h;

		blocksPerGrid.z = 1;

		matAdd_kernel<<<blocksPerGrid,threadsPerBlock>>>(A.data, B.data, C.data, A.w*A.h);

		cutilCheckMsg("kernel launch failure");

}

Hey,

Thanks for that…

It seems to only work on one column…

Here’s the output I got:

At least it’s something and not all zeros now.

Any more ideas?

Edit: 9500x4800x4 is 550MB and the 8800 only has 512MB… Still looking for a solution to the above problem though…

When I try to run this on my desktop’s GT8800, I get this CUTIL CUDA error:

I can’t see anything else wrong. Are you really using my modified matAdd() together with the unmodified remaining code from your first post?