Matrix Addition Failing Simple matrix addition acting up...

HickoryDock · July 16, 2010, 2:15pm

Hi all,

I’m performing a very simple operation that’s falling over on me: matrix addition.

I’m using the NVIDIA SDK code sample.

I keep getting this error:

“cutilCheckMsg() CUTIL CUDA error: kernel launch failure in file <./matAdd.cu>, line 24 : invalid configuration argument.”

Here is the code:

#include <stdio.h>

#include <stdlib.h>

#include <sys/time.h>

#include <sys/types.h>

#include <unistd.h>

#include "cublas.h"

#include <cuda.h>

#include <cutil_inline.h>

// Dense single-precision matrix descriptor; it is passed by value, so copies share the same data buffer.
// The code in this file addresses elements column-major: entry (row, col) is data[col*h + row].
typedef struct{

		// number of columns
		long w;

		// number of rows (also the stride between consecutive columns in data)
		long h;

		// w*h floats; host memory for X/Y/Z (calloc) and device memory for X_d/Y_d/Z_d (cudaMalloc)
		float* data;

}Matrix;

#include "./matAdd.cu"

// Fills the host matrix M from the comma-separated text file fn.
//
// Text line i of the file supplies row i; its k-th comma-separated value is stored at
// M.data[k*M.h+i], i.e. the matrix is kept column-major in memory.
//
//   M  - destination; M.data must point to at least M.w*M.h floats of host memory.
//   fn - path of the text file to read.
//
// Rows beyond M.h and values beyond M.w are ignored rather than written past the end of
// M.data. A missing file or a failed buffer allocation is reported on stderr and ends the
// program with EXIT_FAILURE.
void read_matrix_col_major(Matrix M,const char *fn)
{
    if(M.w<=0 || M.h<=0 || M.data==NULL)
        return;    //nothing to fill

    FILE *IN=fopen(fn,"r");
    if(IN==NULL)
    {
        fprintf(stderr,"Error: read_matrix_col_major\nCould not open '%s' for reading\n",fn);
        exit(EXIT_FAILURE);
    }

    //50 characters per value is assumed to be enough space for one text line;
    //fgets takes an int, so the capacity is clamped to what an int can hold.
    const long max_cap=2147483647L;
    const int line_cap=(int)(M.w>max_cap/50 ? max_cap : M.w*50);

    //heap buffer: a full-width line is several hundred KB, too much to put on the stack
    char *line=(char*)malloc((size_t)line_cap);
    if(line==NULL)
    {
        fclose(IN);
        fprintf(stderr,"Error: read_matrix_col_major\nOut of memory for a %d byte line buffer\n",line_cap);
        exit(EXIT_FAILURE);
    }

    long i=0;
    while(i<M.h && fgets(line,line_cap,IN)!=NULL)
    {
        long j=0;
        char *result=strtok(line,",");
        while(result!=NULL && j<M.w)
        {
            M.data[j*M.h+i]=(float)atof(result);
            result=strtok(NULL,",");
            j++;
        }
        i++;
    }

    free(line);
    fclose(IN);
}

// Prints a corner of the column-major host matrix A to stdout.
//
// At most the first 12 columns are visited and, for each, at most the first 12 rows.
// Every output line holds the entries of ONE matrix column (A.data[col*A.h+row]),
// so the printed block is the transpose of the upper-left corner.
// Two extra newlines follow the block.
void printUL_col_major(Matrix A)
{
    const long shown_cols=(A.w<12) ? A.w : 12;
    const long shown_rows=(A.h<12) ? A.h : 12;

    for(long col=0;col<shown_cols;++col)
    {
        const float *column=A.data+col*A.h;    //start of this column in memory
        for(long row=0;row<shown_rows;++row)
            printf("%f ",column[row]);
        printf("\n");
    }
    printf("\n\n");
}

// Ends the program with a readable message when a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t err,const char *what)
{
    if(err!=cudaSuccess)
    {
        fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Returns a zero-filled h x w host matrix; ends the program when the allocation fails.
static Matrix alloc_host_matrix(long h,long w)
{
    Matrix M;
    M.h=h;M.w=w;
    M.data=(float*)calloc(M.h*M.w,sizeof(float));
    if(M.data==NULL)
    {
        fprintf(stderr,"Error: could not allocate a %ldx%ld host matrix\n",h,w);
        exit(EXIT_FAILURE);
    }
    return M;
}

// Allocates a device matrix with the shape of src and copies src's contents into it.
static Matrix upload_matrix(Matrix src,const char *what)
{
    Matrix dst;
    dst.h=src.h;dst.w=src.w;
    const size_t bytes=src.h*src.w*sizeof(float);
    check_cuda(cudaMalloc((void**)&dst.data,bytes),what);
    check_cuda(cudaMemcpy(dst.data,src.data,bytes,cudaMemcpyHostToDevice),what);
    return dst;
}

// Reads X and Y from ./X.txt and ./Y.txt, computes Z = X + Y on the GPU and prints a
// corner of each matrix before the addition and of Z afterwards.
// Every allocation and CUDA call is checked, and all buffers are released before returning.
int main()
{
    const long num_dims=9600;
    const long num_hid=4800;

    Matrix X=alloc_host_matrix(num_hid,num_dims);
    Matrix Y=alloc_host_matrix(num_hid,num_dims);
    Matrix Z=alloc_host_matrix(num_hid,num_dims);    //all zeros until the sum is copied back

    read_matrix_col_major(X,"./X.txt");
    read_matrix_col_major(Y,"./Y.txt");

    Matrix X_d=upload_matrix(X,"upload of X");
    Matrix Y_d=upload_matrix(Y,"upload of Y");
    Matrix Z_d=upload_matrix(Z,"upload of Z");

    const size_t bytes=Z.h*Z.w*sizeof(float);

    //print matrices before operation:
    //the data is copied back from the device first, so the printout shows what the GPU holds
    check_cuda(cudaMemcpy(X.data,X_d.data,bytes,cudaMemcpyDeviceToHost),"download of X");
    check_cuda(cudaMemcpy(Y.data,Y_d.data,bytes,cudaMemcpyDeviceToHost),"download of Y");
    check_cuda(cudaMemcpy(Z.data,Z_d.data,bytes,cudaMemcpyDeviceToHost),"download of Z");

    printf("X:\n");
    printUL_col_major(X);
    printf("Y:\n");
    printUL_col_major(Y);
    printf("Z:\n");
    printUL_col_major(Z);

    //perform the addition:
    printf("performing sum operation:\n");
    matAdd(Z_d,X_d,Y_d);

    //print resulting operation:
    //this blocking copy also reports errors raised while the kernel was running
    check_cuda(cudaMemcpy(Z.data,Z_d.data,bytes,cudaMemcpyDeviceToHost),"download of the sum");
    printf("Z:\n");
    printUL_col_major(Z);

    check_cuda(cudaFree(X_d.data),"release of X on the device");
    check_cuda(cudaFree(Y_d.data),"release of Y on the device");
    check_cuda(cudaFree(Z_d.data),"release of Z on the device");
    free(X.data);
    free(Y.data);
    free(Z.data);

    return 0;
}

And here’s the matAdd.cu kernel:

// Element-wise sum: C[k] = A[k] + B[k] for every k below N.
//
// Launch layout: only threadIdx.x / blockDim.x are used, so blocks are expected to be
// one-dimensional. The grid may be one- or two-dimensional; blocks are numbered row by
// row (blockIdx.y*gridDim.x + blockIdx.x) and each thread handles exactly one element.
// Threads whose flat index is N or larger do nothing, so the grid may overshoot N.
__global__ void matAdd_kernel(float *A,float *B,float *C,int N)
{
    const int flat_block=gridDim.x*blockIdx.y+blockIdx.x;
    const int idx=blockDim.x*flat_block+threadIdx.x;

    if(idx>=N)
        return;    //past the end of the data

    C[idx]=A[idx]+B[idx];
}

// Host wrapper for matAdd_kernel: verifies that A, B and C all have the same height and
// width, then launches one thread per element so that C = A + B element-wise.
// The data pointers are handed to the kernel unchanged, so they are dereferenced on the GPU.
void matAdd(Matrix C,Matrix A,Matrix B)

{

		if(!(A.h==B.h && A.w==B.w && C.h==A.h && C.w==A.w))

		{

				printf("Error: matAdd.cu\nMatrix addition attempted.\nMatrix dimensions do not agree!\n");

				printf("Tried A+B, where A: %ldx%ld, B: %ldx%ld\n",A.h,A.w,B.h,B.w);

				// NOTE(review): this is an error path, yet the process exits with status 0.
				exit(0);

		}

		// Total element count; the long product A.h*A.w is narrowed to int here.
		int N=A.h*A.w;

		int threadsPerBlock=256;

		// Ceiling division: enough 256-thread blocks to cover all N elements.
		// NOTE(review): this is a ONE-dimensional grid. For the 4800x9600 matrices built in
		// main(), N is 46,080,000 and blocksPerGrid comes out as 180,000. The Programming
		// Guide appendix cited in this thread limits each grid dimension to 65,535 blocks on
		// this class of GPU, which is why the launch below is rejected with
		// "invalid configuration argument". The kernel already folds blockIdx.y into its
		// index, so spreading the blocks over a two-dimensional grid resolves it.
		int blocksPerGrid=(N+threadsPerBlock-1)/threadsPerBlock;

		matAdd_kernel<<<blocksPerGrid,threadsPerBlock>>>(A.data,B.data,C.data,N);

		// Reports a failed launch; in the output shown in this thread it prints the file,
		// the line and the CUDA error text.
		cutilCheckMsg("kernel launch failure");

}

Everything’s compiling fine, it’s just I keep getting a “invalid configuration argument” at run-time.

When I take out the line “cutilCheckMsg(“kernel launch failure”);”, it runs fine, but the matrix Z is empty… ;(

I’m using a Tesla C1060 with 4GB of RAM.

X is 4800x9600 and Y is 4800x9600…

Is this too big?

Also, here’s some output:

./a.out

X:

-0.023930 0.047744 -0.074694 0.053555 -0.032298 0.038762 -0.068890 0.088894 -0.044989 0.005679 -0.054846 0.064743

-0.070026 0.059445 -0.078712 0.001957 -0.050910 0.067603 -0.089646 0.076562 -0.039840 0.052980 -0.074809 0.037390

-0.042785 0.087303 -0.005369 0.017769 -0.075572 0.075981 -0.064457 0.067737 -0.045192 0.046887 -0.030999 0.006888

-0.040708 0.020566 -0.089926 0.082820 -0.010478 0.021086 -0.086581 0.095966 -0.054339 0.068906 -0.060855 0.087460

-0.059717 0.038708 -0.026613 0.053984 -0.088490 0.066764 -0.005617 0.091969 -0.018239 0.097972 -0.073692 0.010064

-0.052374 0.048555 -0.037706 0.043377 -0.071556 0.075888 -0.002523 0.037950 -0.065693 0.078094 -0.011694 0.039196

-0.092799 0.011099 -0.056766 0.091866 -0.059577 0.029236 -0.063502 0.091717 -0.030844 0.079273 -0.087244 0.048310

-0.089582 0.004614 -0.002560 0.058306 -0.006922 0.097391 -0.099892 0.039699 -0.036129 0.038520 -0.084387 0.012408

-0.054143 0.048351 -0.006309 0.002902 -0.073858 0.012903 -0.089030 0.041077 -0.034445 0.030259 -0.071056 0.002762

-0.048605 0.047165 -0.082960 0.096326 -0.066084 0.029297 -0.070599 0.034394 -0.044475 0.075287 -0.063274 0.023137

-0.033560 0.030527 -0.019907 0.078961 -0.052821 0.088959 -0.043210 0.061808 -0.020862 0.058320 -0.028586 0.079149

-0.087878 0.034127 -0.040097 0.092205 -0.033817 0.099641 -0.002590 0.012473 -0.050764 0.093213 -0.065811 0.075233

Y:

-0.023930 0.047744 -0.074694 0.053555 -0.032298 0.038762 -0.068890 0.088894 -0.044989 0.005679 -0.054846 0.064743

-0.070026 0.059445 -0.078712 0.001957 -0.050910 0.067603 -0.089646 0.076562 -0.039840 0.052980 -0.074809 0.037390

-0.042785 0.087303 -0.005369 0.017769 -0.075572 0.075981 -0.064457 0.067737 -0.045192 0.046887 -0.030999 0.006888

-0.040708 0.020566 -0.089926 0.082820 -0.010478 0.021086 -0.086581 0.095966 -0.054339 0.068906 -0.060855 0.087460

-0.059717 0.038708 -0.026613 0.053984 -0.088490 0.066764 -0.005617 0.091969 -0.018239 0.097972 -0.073692 0.010064

-0.052374 0.048555 -0.037706 0.043377 -0.071556 0.075888 -0.002523 0.037950 -0.065693 0.078094 -0.011694 0.039196

-0.092799 0.011099 -0.056766 0.091866 -0.059577 0.029236 -0.063502 0.091717 -0.030844 0.079273 -0.087244 0.048310

-0.089582 0.004614 -0.002560 0.058306 -0.006922 0.097391 -0.099892 0.039699 -0.036129 0.038520 -0.084387 0.012408

-0.054143 0.048351 -0.006309 0.002902 -0.073858 0.012903 -0.089030 0.041077 -0.034445 0.030259 -0.071056 0.002762

-0.048605 0.047165 -0.082960 0.096326 -0.066084 0.029297 -0.070599 0.034394 -0.044475 0.075287 -0.063274 0.023137

-0.033560 0.030527 -0.019907 0.078961 -0.052821 0.088959 -0.043210 0.061808 -0.020862 0.058320 -0.028586 0.079149

-0.087878 0.034127 -0.040097 0.092205 -0.033817 0.099641 -0.002590 0.012473 -0.050764 0.093213 -0.065811 0.075233

Z:

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

performing sum operation:

cutilCheckMsg() CUTIL CUDA error: kernel launch failure in file <./matAdd.cu>, line 24 : invalid configuration argument.

tera · July 16, 2010, 2:27pm

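No, the matrices fit in memory, but the launch is too big for one dimension: 4800x9600 elements at 256 threads per block needs 180,000 blocks, and a grid is limited to 65,535 blocks per dimension. Use a two-dimensional grid to get enough blocks.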

You can check yourself in Appendix G.1 of the Programming Guide.

HickoryDock · July 16, 2010, 2:42pm

Hi thanks for that.

Sorry to be such a school boy, but could you suggest a fix? Do you know of any 2-D add kernels?

I tried this:

// Attempted two-dimensional variant of the addition kernel (its author suspects it is wrong).
// NOTE(review): only threadIdx is used; blockIdx and blockDim never enter the address, so
// every block of the grid computes the same indices.
// NOTE(review): there is no bounds check, and N is used as the row stride (j*N) although
// the launches shown in this thread pass the total element count A.w*A.h as N.
__global__ void matAdd_kernel(float *A,float *B,float *C,int N)

{

		int i=threadIdx.x;

		// With a one-dimensional block (e.g. <<<grid,64>>>) threadIdx.y is always 0, so only
		// elements 0..blockDim.x-1 would ever be written.
		int j=threadIdx.y;

		C[j*N+i]=A[j*N+i]+B[j*N+i];

}

but I think it’s wrong…

Edit: CUDA Fail… :( sorry…

tera · July 16, 2010, 3:01pm

Try this:

// Host wrapper for matAdd_kernel: computes C = A + B element-wise on the GPU.
//
//   C, A, B - matrices of identical height and width whose data pointers refer to device
//             memory (they are passed to the kernel unchanged).
//
// The blocks are spread over a two-dimensional grid so that neither grid dimension exceeds
// 65,535, the per-dimension limit of compute capability 1.x devices. The grid may contain
// more threads than elements; matAdd_kernel ignores the surplus through its index guard.
//
// Mismatching dimensions, or a matrix too large to be indexed with the kernel's int index,
// are reported and end the program with EXIT_FAILURE. An empty matrix is a no-op.
void matAdd(Matrix C,Matrix A,Matrix B)
{
    if(!(A.h==B.h && A.w==B.w && C.h==A.h && C.w==A.w))
    {
        printf("Error: matAdd.cu\nMatrix addition attempted.\nMatrix dimensions do not agree!\n");
        printf("Tried A+B, where A: %ldx%ld, B: %ldx%ld\n",A.h,A.w,B.h,B.w);
        exit(EXIT_FAILURE);    //an error must not look like success to the caller's shell
    }

    if(A.w<=0 || A.h<=0)
        return;    //nothing to add; a zero-sized grid would be rejected by the launch

    const int threadsPerBlock = 64;
    const long long maxGridDim = 65535;       //blocks per grid dimension
    const long long maxIndex = 2147483647LL;  //largest value of the kernel's int index

    //the element count must fit the kernel's int parameter N
    if(A.w>maxIndex/A.h)
    {
        printf("Error: matAdd.cu\nMatrix of %ldx%ld is too large for one kernel launch\n",A.h,A.w);
        exit(EXIT_FAILURE);
    }
    const long long total = (long long)A.w*A.h;
    const long long blocks = (total+threadsPerBlock-1)/threadsPerBlock;

    //fill the x dimension first, then add as many rows of blocks as needed
    dim3 blocksPerGrid;
    blocksPerGrid.x = (unsigned int)(blocks<maxGridDim ? blocks : maxGridDim);
    blocksPerGrid.y = (unsigned int)((blocks+blocksPerGrid.x-1)/blocksPerGrid.x);
    blocksPerGrid.z = 1;

    //the surplus threads of the last grid row must not overflow the kernel's int index either
    const long long launched = (long long)blocksPerGrid.x*blocksPerGrid.y*threadsPerBlock;
    if(launched>maxIndex)
    {
        printf("Error: matAdd.cu\nMatrix of %ldx%ld is too large for one kernel launch\n",A.h,A.w);
        exit(EXIT_FAILURE);
    }

    matAdd_kernel<<<blocksPerGrid,threadsPerBlock>>>(A.data, B.data, C.data, (int)total);
    cutilCheckMsg("kernel launch failure");
}

HickoryDock · July 16, 2010, 3:09pm

(Quoting tera:) Try this:

// Host wrapper for matAdd_kernel: computes C = A + B element-wise on the GPU.
//
//   C, A, B - matrices of identical height and width whose data pointers refer to device
//             memory (they are passed to the kernel unchanged).
//
// The blocks are spread over a two-dimensional grid so that neither grid dimension exceeds
// 65,535, the per-dimension limit of compute capability 1.x devices. The grid may contain
// more threads than elements; matAdd_kernel ignores the surplus through its index guard.
//
// Mismatching dimensions, or a matrix too large to be indexed with the kernel's int index,
// are reported and end the program with EXIT_FAILURE. An empty matrix is a no-op.
void matAdd(Matrix C,Matrix A,Matrix B)
{
    if(!(A.h==B.h && A.w==B.w && C.h==A.h && C.w==A.w))
    {
        printf("Error: matAdd.cu\nMatrix addition attempted.\nMatrix dimensions do not agree!\n");
        printf("Tried A+B, where A: %ldx%ld, B: %ldx%ld\n",A.h,A.w,B.h,B.w);
        exit(EXIT_FAILURE);    //an error must not look like success to the caller's shell
    }

    if(A.w<=0 || A.h<=0)
        return;    //nothing to add; a zero-sized grid would be rejected by the launch

    const int threadsPerBlock = 64;
    const long long maxGridDim = 65535;       //blocks per grid dimension
    const long long maxIndex = 2147483647LL;  //largest value of the kernel's int index

    //the element count must fit the kernel's int parameter N
    if(A.w>maxIndex/A.h)
    {
        printf("Error: matAdd.cu\nMatrix of %ldx%ld is too large for one kernel launch\n",A.h,A.w);
        exit(EXIT_FAILURE);
    }
    const long long total = (long long)A.w*A.h;
    const long long blocks = (total+threadsPerBlock-1)/threadsPerBlock;

    //fill the x dimension first, then add as many rows of blocks as needed
    dim3 blocksPerGrid;
    blocksPerGrid.x = (unsigned int)(blocks<maxGridDim ? blocks : maxGridDim);
    blocksPerGrid.y = (unsigned int)((blocks+blocksPerGrid.x-1)/blocksPerGrid.x);
    blocksPerGrid.z = 1;

    //the surplus threads of the last grid row must not overflow the kernel's int index either
    const long long launched = (long long)blocksPerGrid.x*blocksPerGrid.y*threadsPerBlock;
    if(launched>maxIndex)
    {
        printf("Error: matAdd.cu\nMatrix of %ldx%ld is too large for one kernel launch\n",A.h,A.w);
        exit(EXIT_FAILURE);
    }

    matAdd_kernel<<<blocksPerGrid,threadsPerBlock>>>(A.data, B.data, C.data, (int)total);
    cutilCheckMsg("kernel launch failure");
}

Hey,

Thanks for that…

It seems to only work on one column…

Here’s the output I got:

X:

-0.023930 0.047744 -0.074694 0.053555 -0.032298 0.038762 -0.068890 0.088894 -0.044989 0.005679 -0.054846 0.064743

-0.070026 0.059445 -0.078712 0.001957 -0.050910 0.067603 -0.089646 0.076562 -0.039840 0.052980 -0.074809 0.037390

-0.042785 0.087303 -0.005369 0.017769 -0.075572 0.075981 -0.064457 0.067737 -0.045192 0.046887 -0.030999 0.006888

-0.040708 0.020566 -0.089926 0.082820 -0.010478 0.021086 -0.086581 0.095966 -0.054339 0.068906 -0.060855 0.087460

-0.059717 0.038708 -0.026613 0.053984 -0.088490 0.066764 -0.005617 0.091969 -0.018239 0.097972 -0.073692 0.010064

-0.052374 0.048555 -0.037706 0.043377 -0.071556 0.075888 -0.002523 0.037950 -0.065693 0.078094 -0.011694 0.039196

-0.092799 0.011099 -0.056766 0.091866 -0.059577 0.029236 -0.063502 0.091717 -0.030844 0.079273 -0.087244 0.048310

-0.089582 0.004614 -0.002560 0.058306 -0.006922 0.097391 -0.099892 0.039699 -0.036129 0.038520 -0.084387 0.012408

-0.054143 0.048351 -0.006309 0.002902 -0.073858 0.012903 -0.089030 0.041077 -0.034445 0.030259 -0.071056 0.002762

-0.048605 0.047165 -0.082960 0.096326 -0.066084 0.029297 -0.070599 0.034394 -0.044475 0.075287 -0.063274 0.023137

-0.033560 0.030527 -0.019907 0.078961 -0.052821 0.088959 -0.043210 0.061808 -0.020862 0.058320 -0.028586 0.079149

-0.087878 0.034127 -0.040097 0.092205 -0.033817 0.099641 -0.002590 0.012473 -0.050764 0.093213 -0.065811 0.075233

Y:

-0.023930 0.047744 -0.074694 0.053555 -0.032298 0.038762 -0.068890 0.088894 -0.044989 0.005679 -0.054846 0.064743

-0.070026 0.059445 -0.078712 0.001957 -0.050910 0.067603 -0.089646 0.076562 -0.039840 0.052980 -0.074809 0.037390

-0.042785 0.087303 -0.005369 0.017769 -0.075572 0.075981 -0.064457 0.067737 -0.045192 0.046887 -0.030999 0.006888

-0.040708 0.020566 -0.089926 0.082820 -0.010478 0.021086 -0.086581 0.095966 -0.054339 0.068906 -0.060855 0.087460

-0.059717 0.038708 -0.026613 0.053984 -0.088490 0.066764 -0.005617 0.091969 -0.018239 0.097972 -0.073692 0.010064

-0.052374 0.048555 -0.037706 0.043377 -0.071556 0.075888 -0.002523 0.037950 -0.065693 0.078094 -0.011694 0.039196

-0.092799 0.011099 -0.056766 0.091866 -0.059577 0.029236 -0.063502 0.091717 -0.030844 0.079273 -0.087244 0.048310

-0.089582 0.004614 -0.002560 0.058306 -0.006922 0.097391 -0.099892 0.039699 -0.036129 0.038520 -0.084387 0.012408

-0.054143 0.048351 -0.006309 0.002902 -0.073858 0.012903 -0.089030 0.041077 -0.034445 0.030259 -0.071056 0.002762

-0.048605 0.047165 -0.082960 0.096326 -0.066084 0.029297 -0.070599 0.034394 -0.044475 0.075287 -0.063274 0.023137

-0.033560 0.030527 -0.019907 0.078961 -0.052821 0.088959 -0.043210 0.061808 -0.020862 0.058320 -0.028586 0.079149

-0.087878 0.034127 -0.040097 0.092205 -0.033817 0.099641 -0.002590 0.012473 -0.050764 0.093213 -0.065811 0.075233

Z:

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

performing sum operation:

Z:

-0.047860 0.095488 -0.149388 0.107110 -0.064596 0.077524 -0.137780 0.177788 -0.089978 0.011358 -0.109692 0.129486

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

At least it’s something and not all zeros now.

Any more ideas?

HickoryDock · July 16, 2010, 4:57pm

Edit: the three 9600x4800 float matrices take 9600x4800x4 bytes each, about 550MB in total, and the 8800 only has 512MB… Still looking for a solution to the above problem though…

When I try to run this on my desktop’s GT8800, I get this CUTIL CUDA error:

X:

-0.023930 0.047744 -0.074694 0.053555 -0.032298 0.038762 -0.068890 0.088894 -0.044989 0.005679 -0.054846 0.064743

-0.070026 0.059445 -0.078712 0.001957 -0.050910 0.067603 -0.089646 0.076562 -0.039840 0.052980 -0.074809 0.037390

-0.042785 0.087303 -0.005369 0.017769 -0.075572 0.075981 -0.064457 0.067737 -0.045192 0.046887 -0.030999 0.006888

-0.040708 0.020566 -0.089926 0.082820 -0.010478 0.021086 -0.086581 0.095966 -0.054339 0.068906 -0.060855 0.087460

-0.059717 0.038708 -0.026613 0.053984 -0.088490 0.066764 -0.005617 0.091969 -0.018239 0.097972 -0.073692 0.010064

-0.052374 0.048555 -0.037706 0.043377 -0.071556 0.075888 -0.002523 0.037950 -0.065693 0.078094 -0.011694 0.039196

-0.092799 0.011099 -0.056766 0.091866 -0.059577 0.029236 -0.063502 0.091717 -0.030844 0.079273 -0.087244 0.048310

-0.089582 0.004614 -0.002560 0.058306 -0.006922 0.097391 -0.099892 0.039699 -0.036129 0.038520 -0.084387 0.012408

-0.054143 0.048351 -0.006309 0.002902 -0.073858 0.012903 -0.089030 0.041077 -0.034445 0.030259 -0.071056 0.002762

-0.048605 0.047165 -0.082960 0.096326 -0.066084 0.029297 -0.070599 0.034394 -0.044475 0.075287 -0.063274 0.023137

-0.033560 0.030527 -0.019907 0.078961 -0.052821 0.088959 -0.043210 0.061808 -0.020862 0.058320 -0.028586 0.079149

-0.087878 0.034127 -0.040097 0.092205 -0.033817 0.099641 -0.002590 0.012473 -0.050764 0.093213 -0.065811 0.075233

Y:

-0.023930 0.047744 -0.074694 0.053555 -0.032298 0.038762 -0.068890 0.088894 -0.044989 0.005679 -0.054846 0.064743

-0.070026 0.059445 -0.078712 0.001957 -0.050910 0.067603 -0.089646 0.076562 -0.039840 0.052980 -0.074809 0.037390

-0.042785 0.087303 -0.005369 0.017769 -0.075572 0.075981 -0.064457 0.067737 -0.045192 0.046887 -0.030999 0.006888

-0.040708 0.020566 -0.089926 0.082820 -0.010478 0.021086 -0.086581 0.095966 -0.054339 0.068906 -0.060855 0.087460

-0.059717 0.038708 -0.026613 0.053984 -0.088490 0.066764 -0.005617 0.091969 -0.018239 0.097972 -0.073692 0.010064

-0.052374 0.048555 -0.037706 0.043377 -0.071556 0.075888 -0.002523 0.037950 -0.065693 0.078094 -0.011694 0.039196

-0.092799 0.011099 -0.056766 0.091866 -0.059577 0.029236 -0.063502 0.091717 -0.030844 0.079273 -0.087244 0.048310

-0.089582 0.004614 -0.002560 0.058306 -0.006922 0.097391 -0.099892 0.039699 -0.036129 0.038520 -0.084387 0.012408

-0.054143 0.048351 -0.006309 0.002902 -0.073858 0.012903 -0.089030 0.041077 -0.034445 0.030259 -0.071056 0.002762

-0.048605 0.047165 -0.082960 0.096326 -0.066084 0.029297 -0.070599 0.034394 -0.044475 0.075287 -0.063274 0.023137

-0.033560 0.030527 -0.019907 0.078961 -0.052821 0.088959 -0.043210 0.061808 -0.020862 0.058320 -0.028586 0.079149

-0.087878 0.034127 -0.040097 0.092205 -0.033817 0.099641 -0.002590 0.012473 -0.050764 0.093213 -0.065811 0.075233

Z:

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

performing sum operation:

Z:

./matAdd.cu(37) : cutilCheckMsg() CUTIL CUDA error : kernel launch failure : memory size or pointer value too large to fit in 32 bit.

tera · July 16, 2010, 8:00pm

I can’t see anything else wrong. Are you really using my modified matAdd() together with the unmodified remaining code from your first post?