I can't get the correct answer in a 3D array calculation

Hello.
I just started CUDA.
I want to calculate 10×10×10 array .
C[i][j][k]=A[i][j][k]+B[i][j][k]
So I had tested this code.
But C[i][j][k] became -431602080.000 for all i,j,k.
What should I do to get the correct answer?

#define LENGTH 10
#define WIDTH 10
#define DEPTH 10
#define GETINDEX(x,y,z) (x+y*DEPTH+z*WIDTH*DEPTH)

// Element-wise 3D addition C = A + B, one thread per element.
// BUG(review): there is no bounds check on (len, wid, dep) — any thread whose
// global index falls outside the 10x10x10 volume reads and writes out of
// bounds, which is what produces the "unspecified launch failure" discussed
// later in this thread.
__global__ void
matrixCalc(float* inMatA, float* inMatB, float* inMatC) {
	int len = blockIdx.x*blockDim.x + threadIdx.x;
	int wid = blockIdx.y*blockDim.y + threadIdx.y;
	int dep = blockIdx.z*blockDim.z + threadIdx.z;
	inMatC[GETINDEX(len, wid, dep)] = inMatA[GETINDEX(len, wid, dep)] + inMatB[GETINDEX(len, wid, dep)];
}

int main(int argc, char** argv) {
		// Compute the matrix size in bytes.
		// NOTE(review): GETINDEX(LENGTH,WIDTH,DEPTH) = 1110 elements, more than
		// the 10*10*10 = 1000 actually used; an over-allocation, not itself fatal.
	int matrixSize = sizeof(float) * GETINDEX(LENGTH,WIDTH,DEPTH);

	// Host-side matrix pointers.
	float* hMatA;
	float* hMatB;
	float* hMatC;

	// Allocate the host input matrices.
	hMatA = (float*)malloc(matrixSize);
	hMatB = (float*)malloc(matrixSize);
	
	
	// Initialize: A = 2, B = 1 everywhere, so every C element should be 3.
	int len, row,wid;
	for(row=0;row<DEPTH;row++){
		for (wid = 0; wid < WIDTH; wid++) {
			for (len = 0; len < LENGTH; len++) {
				hMatA[GETINDEX(len,wid,row)] = 2;
				hMatB[GETINDEX(len, wid, row)] = 1;
			}
		}
	}
	float* dMatA;
	float* dMatB;
	float* dMatC;

	// BUG(review): cudaMalloc takes a size in BYTES; this passes the element
	// count (1110) instead of matrixSize, so the device buffers are too small.
	cudaMalloc((void**)&dMatA, GETINDEX(LENGTH, WIDTH, DEPTH));
	cudaMalloc((void**)&dMatB, GETINDEX(LENGTH, WIDTH, DEPTH));
	cudaMalloc((void**)&dMatC, GETINDEX(LENGTH, WIDTH, DEPTH));

	cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice);
	cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice);

	dim3 block(LENGTH, WIDTH,DEPTH);
	// BUG(review): matrixSize is a byte count (~4440), so this launches ~444
	// blocks per dimension; combined with the kernel's missing bounds check,
	// threads index far outside the arrays. One 10x10x10 block would suffice.
	dim3 grid(matrixSize / LENGTH, matrixSize/ WIDTH,matrixSize/DEPTH);

	// NOTE(review): no error checking after the launch — a failure here only
	// surfaces at the next CUDA API call, since CUDA errors are sticky.
	matrixCalc << <grid, block >> >(dMatA, dMatB, dMatC);
	cudaThreadSynchronize();  // NOTE(review): deprecated; cudaDeviceSynchronize() is the modern call

	hMatC = (float*)malloc(matrixSize);
	cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost);

	// Print the result, one 2D slice per depth value.
	for (row = 0; row<DEPTH; row++) {
		for (len = 0; len < LENGTH; len++) {
			for (wid = 0; wid < WIDTH; wid++) printf("%f,  ",hMatC[GETINDEX(len,wid,row)]);
			printf("\n");
		}
		printf("\n\n");
	}
	
	free(hMatA);
	free(hMatB);
	free(hMatC);
	cudaFree(dMatA);
	cudaFree(dMatB);
	cudaFree(dMatC);
	cudaThreadExit();  // NOTE(review): deprecated; cudaDeviceReset() is the modern call
	
}

This code has a number of errors in it. Any time you are having trouble with a CUDA code, you should do proper CUDA error checking, and also run your code with cuda-memcheck.

Not sure what “proper CUDA error checking” is ? Google “proper CUDA error checking” and take the first hit, read it, and apply it to your code.

Not sure what cuda-memcheck is? Google “cuda-memcheck”

You should do these things before asking others for help. Even if you don’t understand the error output, it will be useful for others trying to help you.

Problems in your code:

  1. GETINDEX(LENGTH, WIDTH, DEPTH) calculates out to 1110. This is more than the actual matrix sizes needed, but this is not a critical problem.

  2. cudaMalloc, like malloc, takes a size in bytes. Therefore you should pass a size parameter to each of your cudaMalloc operations that is the same as the size parameter for your malloc operations:

cudaMalloc((void**)&dMatA, matrixSize);
	cudaMalloc((void**)&dMatB, matrixSize);
	cudaMalloc((void**)&dMatC, matrixSize);
  1. You are launching way too many blocks. Each block is 10x10x10 which is acceptable. But the grid calculation is very large:
dim3 block(LENGTH, WIDTH,DEPTH);
	dim3 grid(matrixSize / LENGTH, matrixSize/ WIDTH,matrixSize/DEPTH);

You are launching ~4000/10 = ~400 blocks in each dimension! You only actually need 1 block in this case. But even that is not a critical problem if you handle the extra blocks correctly in your kernel with a proper thread check:

if ((len < LENGTH) && (wid < WIDTH) && (dep < DEPTH))
          inMatC[GETINDEX(len, wid, dep)] = inMatA[GETINDEX(len, wid, dep)] + inMatB[GETINDEX(len, wid, dep)];

With changes like those, I was able to get your code working without error:

$ cat t74.cu
#include <stdio.h>
#define LENGTH 10
#define WIDTH 10
#define DEPTH 10
#define GETINDEX(x,y,z) (x+y*DEPTH+z*WIDTH*DEPTH)

// Element-wise addition C = A + B over the 10x10x10 volume, one thread per
// element. The bounds check makes over-sized grids safe: threads whose global
// index falls outside the volume simply do nothing.
__global__ void
matrixCalc(float* inMatA, float* inMatB, float* inMatC) {
        int len = blockIdx.x*blockDim.x + threadIdx.x;
        int wid = blockIdx.y*blockDim.y + threadIdx.y;
        int dep = blockIdx.z*blockDim.z + threadIdx.z;
        if ((len < LENGTH) && (wid < WIDTH) && (dep < DEPTH))
          inMatC[GETINDEX(len, wid, dep)] = inMatA[GETINDEX(len, wid, dep)] + inMatB[GETINDEX(len, wid, dep)];
}

int main(int argc, char** argv) {
                // Compute the matrix size in bytes.
                // NOTE(review): GETINDEX(LENGTH,WIDTH,DEPTH) = 1110 elements,
                // more than the 1000 used; harmless over-allocation.
        int matrixSize = sizeof(float) * GETINDEX(LENGTH,WIDTH,DEPTH);

        // Host-side matrix pointers.
        float* hMatA;
        float* hMatB;
        float* hMatC;

        // Allocate the host input matrices.
        hMatA = (float*)malloc(matrixSize);
        hMatB = (float*)malloc(matrixSize);

// Initialize: A = 2, B = 1 everywhere, so every C element should be 3.
        int len, row,wid;
        for(row=0;row<DEPTH;row++){
                for (wid = 0; wid < WIDTH; wid++) {
                        for (len = 0; len < LENGTH; len++) {
                                hMatA[GETINDEX(len,wid,row)] = 2;
                                hMatB[GETINDEX(len, wid, row)] = 1;
                        }
                }
        }
        float* dMatA;
        float* dMatB;
        float* dMatC;

        // Fixed from the original: cudaMalloc takes BYTES, so pass matrixSize,
        // not the element count.
        cudaMalloc((void**)&dMatA, matrixSize);
        cudaMalloc((void**)&dMatB, matrixSize);
        cudaMalloc((void**)&dMatC, matrixSize);

        cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice);

        printf("size = %d\n", GETINDEX(LENGTH,WIDTH,DEPTH));
        dim3 block(LENGTH, WIDTH,DEPTH);
        // NOTE(review): this grid is still far larger than needed (one
        // 10x10x10 block would suffice); the bounds check in the kernel makes
        // the extra blocks harmless.
        dim3 grid(matrixSize / LENGTH, matrixSize/ WIDTH,matrixSize/DEPTH);

        matrixCalc << <grid, block >> >(dMatA, dMatB, dMatC);
        cudaThreadSynchronize();  // NOTE(review): deprecated; prefer cudaDeviceSynchronize()

        hMatC = (float*)malloc(matrixSize);
        cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost);

        // Print the result, one 2D slice per depth value.
        for (row = 0; row<DEPTH; row++) {
                for (len = 0; len < LENGTH; len++) {
                        for (wid = 0; wid < WIDTH; wid++) printf("%f,  ",hMatC[GETINDEX(len,wid,row)]);
                        printf("\n");
                }
                printf("\n\n");
        }

        free(hMatA);
        free(hMatB);
        free(hMatC);
        cudaFree(dMatA);
        cudaFree(dMatB);
        cudaFree(dMatC);

}
nvidia@nvidia-DiGiTS-Dev-Box:~/bobc$ nvcc -arch=sm_61 -o t74 t74.cu
nvidia@nvidia-DiGiTS-Dev-Box:~/bobc$ cuda-memcheck ./t74
========= CUDA-MEMCHECK
size = 1110
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

========= ERROR SUMMARY: 0 errors
$

txbob,
thank you for advising me.
I didn’t know about both “proper CUDA error checking” and “cuda-memcheck”,so I googled.
I could understand about “proper CUDA error checking”, but I couldn’t understand “cuda-memcheck” because of my english is not good.
Could you teach me how to use “cuda-memcheck”?

Anyway, I cannot get the correct answer even when I copy and paste your code. So I applied gpuErrchk to the cudaMemcpy calls like this.

// Wrap any CUDA runtime call: reports failures with file/line context and
// (by default) aborts; on success it prints a confirmation with the line number.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
	// Success path first: confirm and return.
	if (code == cudaSuccess)
	{
		printf("no err on %d\n", line);
		return;
	}
	// Failure: report the error string and location, then optionally exit.
	fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
	if (abort) exit(code);
}

And in main,

gpuErrchk(cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice));

//(after call kernel)
        gpuErrchk(cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost));

It says there are no errors for MatA and MatB, but MatC had an error named “unspecified launch failure”.
I couldn’t understand what that means.

Could you teach me how can I do?

Simplest usage of cuda-memcheck:

cuda-memcheck [executable-name]

“unspecified launch failure”:

A CUDA kernel failed while running on the GPU. If you do not have proper error checking for the kernel itself, the error will be reported on the next CUDA API call, here cudaMempcy(), since errors are sticky. Try this:

// Macro to catch CUDA errors in kernel launches.
// Place it immediately after a <<<...>>> launch: the first check catches
// synchronous (pre-launch/configuration) errors via cudaGetLastError(); the
// second synchronizes the device and catches asynchronous execution errors,
// e.g. "unspecified launch failure" from an out-of-bounds access.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

[...]
matrixCalc << <grid, block >> >(dMatA, dMatB, dMatC);
CHECK_LAUNCH_ERROR();
[...]

njuffa,
Thank you for your advice.
When I write “cuda-memcheck” in the code, it causes an error.

I did CHECK_LAUNCH_ERROR() like your code, and it says “unspecified launch failure in line 108”. But in line 108, there is CHECK_LAUNCH_ERROR().
What is that meaning?

Since CHECK_LAUNCH_ERROR() immediately follows the launch of the kernel matrixCalc(), that means that kernel experienced an “unspecified launch error”, and CHECK_LAUNCH_ERROR() caught this. This error is the equivalent of a segfault in host code, meaning you have an out-of-bounds access in your kernel, that is, a bug (or several bugs).

cuda-memcheck is a tool (like gdb, cuda-gdb, or nvprof) that you run from the operating system command line, this is not something you stick in your source code. So if your CUDA program is in ‘foo.cu’, and compiles into an executable (binary) file ‘foo’ (or ‘foo.exe’ if you are on Windows), then, on the command line:

cuda-memcheck foo

Check out the cool documentation: http://docs.nvidia.com/cuda/cuda-memcheck/

It succeeds when I change the grid size,

#define LENGTH 40
#define WIDTH 40
#define DEPTH 40
[...]
dim3 grid(matrixSize / LENGTH, matrixSize / WIDTH, matrixSize / DEPTH);

to

#define LENGTH 10
#define WIDTH 10
#define DEPTH 10
dim3 grid(1, 1, 1);

Then I get the correct answer: hMatC = 3.0000 for each [x,y,z].

I think grid is too big (it is pointed out by txbob first time…), so

int len = blockIdx.x*blockDim.x + threadIdx.x;
	int wid = blockIdx.y*blockDim.y + threadIdx.y;
	int dep = blockIdx.z*blockDim.z + threadIdx.z;

became too big and it cause of out-of-bounds access.
But when

#define LENGTH 40
#define WIDTH 40
#define DEPTH 40

then it failure,and CHECK_LAUNCH_ERROR() says “invalid configuration argument”.
Why it became invalid?

Because you can’t have a threadblock of 40,40,40 thread dimensions.

The maximum is 1024 threads per block, which is the product of the dimensions.

Hmmm…I forgotten about that.
I want to change value of LENGTH,WIDTH,DEPTH,and run this program independent from these value.
Could you teach me what shall I do?

Pass LENGTH, WIDTH, DEPTH as additional parameters to your kernel, or put them into device variables (preferably in constant memory for best performance).

// I made a few small changes. How about this?

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>

#define LENGTH 10
#define WIDTH 10
#define DEPTH 10

// Flatten a 3D coordinate (x fastest-varying) into a linear array offset.
__host__ __device__ inline size_t getIndex(int x, int y, int z) {
  const int planeStride = WIDTH * DEPTH;  // elements per z-slice
  return x + DEPTH * y + planeStride * z;
}

// Element-wise 3D addition C = A + B over an xsize x ysize x zsize volume,
// one thread per element. Threads whose global index falls outside the volume
// exit immediately, so any grid/block shape that covers the volume is valid.
__global__ void matrixCalc(float* inMatA, float* inMatB, float* inMatC, int xsize, int ysize, int zsize) {
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  const int z = blockIdx.z*blockDim.z + threadIdx.z;
  if (x >= xsize || y >= ysize || z >= zsize) return;  // guard the overhang
  const size_t idx = getIndex(x, y, z);
  inMatC[idx] = inMatA[idx] + inMatB[idx];
}

int main(int argc, char** argv) {
  // Compute the matrix size in bytes (LENGTH*WIDTH*DEPTH elements).
  int matrixSize = sizeof(float) * LENGTH*WIDTH*DEPTH;

  // Host-side matrix pointers.
  float* hMatA;
  float* hMatB;
  float* hMatC;

  // Allocate the host input matrices.
  hMatA = (float*)malloc(matrixSize);
  hMatB = (float*)malloc(matrixSize);

  // Initialize: A = 2, B = 1 everywhere, so every C element should be 3.
  int len, row,wid;
  for(row=0;row<DEPTH;row++){
    for (wid = 0; wid < WIDTH; wid++) {
      for (len = 0; len < LENGTH; len++) {
        hMatA[getIndex(len, wid, row)] = 2;
        hMatB[getIndex(len, wid, row)] = 1;
      }
    }
  }

  float* dMatA;
  float* dMatB;
  float* dMatC;

  // NOTE(review): the return codes of the CUDA calls below are unchecked;
  // wrapping them in an error-check macro would surface failures early.
  cudaMalloc((void**)&dMatA, matrixSize);
  cudaMalloc((void**)&dMatB, matrixSize);
  cudaMalloc((void**)&dMatC, matrixSize);

  cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice);
  cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice);

  // Fixed 32x8x4 = 1024-thread blocks (the per-block maximum); the grid is
  // sized by ceiling division so any LENGTH/WIDTH/DEPTH is covered, and the
  // kernel's bounds check discards the overhang threads.
  dim3 block(32, 8, 4);
  dim3 grid((LENGTH+31)/32, (WIDTH+7)/8, (DEPTH+3)/4);

  matrixCalc <<<grid,block>>>(dMatA, dMatB, dMatC, LENGTH, WIDTH, DEPTH);

  cudaDeviceSynchronize();

  hMatC = (float*)malloc(matrixSize);
  cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost);

  // Print the result, one 2D slice per depth value.
  for (row = 0; row<DEPTH; row++) {
    for (len = 0; len < LENGTH; len++) {
      for (wid = 0; wid < WIDTH; wid++) {
        printf("%f,  ",hMatC[getIndex(len,wid,row)]);
      }
      printf("\n");
    }
    printf("\n\n");
  }

  free(hMatA);
  free(hMatB);
  free(hMatC);

  cudaFree(dMatA);
  cudaFree(dMatB);
  cudaFree(dMatC);

  cudaDeviceReset();

}

It succeeded!!
Thank you so much, episteme!
And thank you all advising me!