I can't get the correct answer in a 3D array calculation

Hello.
I just started CUDA.
I want to calculate 10×10×10 array .
C[i][j][k]=A[i][j][k]+B[i][j][k]
So I had tested this code.
But C[i][j][k] became -431602080.000 for all i,j,k.
What should I do to get the correct answer?

#define LENGTH 10
#define WIDTH 10
#define DEPTH 10
#define GETINDEX(x,y,z) (x+y*DEPTH+z*WIDTH*DEPTH)

// Element-wise 3D addition C = A + B, one thread per element.
// BUG(review): there is no bounds check on (len, wid, dep) — any thread whose
// global index falls outside the 10x10x10 volume reads and writes out of
// bounds, which is what produces the "unspecified launch failure" discussed
// later in this thread.
__global__ void
matrixCalc(float* inMatA, float* inMatB, float* inMatC) {
	int len = blockIdx.x*blockDim.x + threadIdx.x;
	int wid = blockIdx.y*blockDim.y + threadIdx.y;
	int dep = blockIdx.z*blockDim.z + threadIdx.z;
	inMatC[GETINDEX(len, wid, dep)] = inMatA[GETINDEX(len, wid, dep)] + inMatB[GETINDEX(len, wid, dep)];
}

int main(int argc, char** argv) {
		// Compute the matrix size in bytes.
		// NOTE(review): GETINDEX(LENGTH,WIDTH,DEPTH) = 1110 elements, more than
		// the 10*10*10 = 1000 actually used; an over-allocation, not itself fatal.
	int matrixSize = sizeof(float) * GETINDEX(LENGTH,WIDTH,DEPTH);

	// Host-side matrix pointers.
	float* hMatA;
	float* hMatB;
	float* hMatC;

	// Allocate the host input matrices.
	hMatA = (float*)malloc(matrixSize);
	hMatB = (float*)malloc(matrixSize);
	
	
	// Initialize: A = 2, B = 1 everywhere, so every C element should be 3.
	int len, row,wid;
	for(row=0;row<DEPTH;row++){
		for (wid = 0; wid < WIDTH; wid++) {
			for (len = 0; len < LENGTH; len++) {
				hMatA[GETINDEX(len,wid,row)] = 2;
				hMatB[GETINDEX(len, wid, row)] = 1;
			}
		}
	}
	float* dMatA;
	float* dMatB;
	float* dMatC;

	// BUG(review): cudaMalloc takes a size in BYTES; this passes the element
	// count (1110) instead of matrixSize, so the device buffers are too small.
	cudaMalloc((void**)&dMatA, GETINDEX(LENGTH, WIDTH, DEPTH));
	cudaMalloc((void**)&dMatB, GETINDEX(LENGTH, WIDTH, DEPTH));
	cudaMalloc((void**)&dMatC, GETINDEX(LENGTH, WIDTH, DEPTH));

	cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice);
	cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice);

	dim3 block(LENGTH, WIDTH,DEPTH);
	// BUG(review): matrixSize is a byte count (~4440), so this launches ~444
	// blocks per dimension; combined with the kernel's missing bounds check,
	// threads index far outside the arrays. One 10x10x10 block would suffice.
	dim3 grid(matrixSize / LENGTH, matrixSize/ WIDTH,matrixSize/DEPTH);

	// NOTE(review): no error checking after the launch — a failure here only
	// surfaces at the next CUDA API call, since CUDA errors are sticky.
	matrixCalc << <grid, block >> >(dMatA, dMatB, dMatC);
	cudaThreadSynchronize();  // NOTE(review): deprecated; cudaDeviceSynchronize() is the modern call

	hMatC = (float*)malloc(matrixSize);
	cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost);

	// Print the result, one 2D slice per depth value.
	for (row = 0; row<DEPTH; row++) {
		for (len = 0; len < LENGTH; len++) {
			for (wid = 0; wid < WIDTH; wid++) printf("%f,  ",hMatC[GETINDEX(len,wid,row)]);
			printf("\n");
		}
		printf("\n\n");
	}
	
	free(hMatA);
	free(hMatB);
	free(hMatC);
	cudaFree(dMatA);
	cudaFree(dMatB);
	cudaFree(dMatC);
	cudaThreadExit();  // NOTE(review): deprecated; cudaDeviceReset() is the modern call
	
}

This code has a number of errors in it. Any time you are having trouble with a CUDA code, you should do proper CUDA error checking, and also run your code with cuda-memcheck.

Not sure what “proper CUDA error checking” is ? Google “proper CUDA error checking” and take the first hit, read it, and apply it to your code.

Not sure what cuda-memcheck is? Google “cuda-memcheck”

You should do these things before asking others for help. Even if you don’t understand the error output, it will be useful for others trying to help you.

Problems in your code:

  1. GETINDEX(LENGTH, WIDTH, DEPTH) calculates out to 1110. This is more than the actual matrix sizes needed, but this is not a critical problem.

  2. cudaMalloc, like malloc, takes a size in bytes. Therefore you should pass a size parameter to each of your cudaMalloc operations that is the same as the size parameter for your malloc operations:

cudaMalloc((void**)&dMatA, matrixSize);
	cudaMalloc((void**)&dMatB, matrixSize);
	cudaMalloc((void**)&dMatC, matrixSize);
  1. You are launching way too many blocks. Each block is 10x10x10 which is acceptable. But the grid calculation is very large:
dim3 block(LENGTH, WIDTH,DEPTH);
	dim3 grid(matrixSize / LENGTH, matrixSize/ WIDTH,matrixSize/DEPTH);

You are launching ~4000/10 = ~400 blocks in each dimension! You only actually need 1 block in this case. But even that is not a critical problem if you handle the extra blocks correctly in your kernel with a proper thread check:

if ((len < LENGTH) && (wid < WIDTH) && (dep < DEPTH))
          inMatC[GETINDEX(len, wid, dep)] = inMatA[GETINDEX(len, wid, dep)] + inMatB[GETINDEX(len, wid, dep)];

With changes like those, I was able to get your code working without error:

$ cat t74.cu
#include <stdio.h>
#define LENGTH 10
#define WIDTH 10
#define DEPTH 10
#define GETINDEX(x,y,z) (x+y*DEPTH+z*WIDTH*DEPTH)

// Element-wise addition C = A + B over the 10x10x10 volume, one thread per
// element. The bounds check makes over-sized grids safe: threads whose global
// index falls outside the volume simply do nothing.
__global__ void
matrixCalc(float* inMatA, float* inMatB, float* inMatC) {
        int len = blockIdx.x*blockDim.x + threadIdx.x;
        int wid = blockIdx.y*blockDim.y + threadIdx.y;
        int dep = blockIdx.z*blockDim.z + threadIdx.z;
        if ((len < LENGTH) && (wid < WIDTH) && (dep < DEPTH))
          inMatC[GETINDEX(len, wid, dep)] = inMatA[GETINDEX(len, wid, dep)] + inMatB[GETINDEX(len, wid, dep)];
}

int main(int argc, char** argv) {
                // Compute the matrix size in bytes.
                // NOTE(review): GETINDEX(LENGTH,WIDTH,DEPTH) = 1110 elements,
                // more than the 1000 used; harmless over-allocation.
        int matrixSize = sizeof(float) * GETINDEX(LENGTH,WIDTH,DEPTH);

        // Host-side matrix pointers.
        float* hMatA;
        float* hMatB;
        float* hMatC;

        // Allocate the host input matrices.
        hMatA = (float*)malloc(matrixSize);
        hMatB = (float*)malloc(matrixSize);

// Initialize: A = 2, B = 1 everywhere, so every C element should be 3.
        int len, row,wid;
        for(row=0;row<DEPTH;row++){
                for (wid = 0; wid < WIDTH; wid++) {
                        for (len = 0; len < LENGTH; len++) {
                                hMatA[GETINDEX(len,wid,row)] = 2;
                                hMatB[GETINDEX(len, wid, row)] = 1;
                        }
                }
        }
        float* dMatA;
        float* dMatB;
        float* dMatC;

        // Fixed from the original: cudaMalloc takes BYTES, so pass matrixSize,
        // not the element count.
        cudaMalloc((void**)&dMatA, matrixSize);
        cudaMalloc((void**)&dMatB, matrixSize);
        cudaMalloc((void**)&dMatC, matrixSize);

        cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice);

        printf("size = %d\n", GETINDEX(LENGTH,WIDTH,DEPTH));
        dim3 block(LENGTH, WIDTH,DEPTH);
        // NOTE(review): this grid is still far larger than needed (one
        // 10x10x10 block would suffice); the bounds check in the kernel makes
        // the extra blocks harmless.
        dim3 grid(matrixSize / LENGTH, matrixSize/ WIDTH,matrixSize/DEPTH);

        matrixCalc << <grid, block >> >(dMatA, dMatB, dMatC);
        cudaThreadSynchronize();  // NOTE(review): deprecated; prefer cudaDeviceSynchronize()

        hMatC = (float*)malloc(matrixSize);
        cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost);

        // Print the result, one 2D slice per depth value.
        for (row = 0; row<DEPTH; row++) {
                for (len = 0; len < LENGTH; len++) {
                        for (wid = 0; wid < WIDTH; wid++) printf("%f,  ",hMatC[GETINDEX(len,wid,row)]);
                        printf("\n");
                }
                printf("\n\n");
        }

        free(hMatA);
        free(hMatB);
        free(hMatC);
        cudaFree(dMatA);
        cudaFree(dMatB);
        cudaFree(dMatC);

}
nvidia@nvidia-DiGiTS-Dev-Box:~/bobc$ nvcc -arch=sm_61 -o t74 t74.cu
nvidia@nvidia-DiGiTS-Dev-Box:~/bobc$ cuda-memcheck ./t74
========= CUDA-MEMCHECK
size = 1110
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,
3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,  3.000000,

========= ERROR SUMMARY: 0 errors
$

txbob,
thank you for advising me.
I didn’t know about both “proper CUDA error checking” and “cuda-memcheck”,so I googled.
I could understand about “proper CUDA error checking”, but I couldn’t understand “cuda-memcheck” because of my english is not good.
Could you teach me how to use “cuda-memcheck”?

Anyway, I cannot get the correct answer even when I copy and paste your code. So I applied gpuErrchk to the cudaMemcpy calls like this.

// Wrap any CUDA runtime call: reports failures with file/line context and
// (by default) aborts; on success it prints a confirmation with the line number.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
	// Success path first: confirm and return.
	if (code == cudaSuccess)
	{
		printf("no err on %d\n", line);
		return;
	}
	// Failure: report the error string and location, then optionally exit.
	fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
	if (abort) exit(code);
}

And in main,

gpuErrchk(cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice));

//(after call kernel)
        gpuErrchk(cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost));

It says there are no errors for MatA and MatB, but MatC had an error named “unspecified launch failure”.
I couldn’t understand what that means.

Could you teach me how can I do?

Simplest usage of cuda-memcheck:

cuda-memcheck [executable-name]

“unspecified launch failure”:

A CUDA kernel failed while running on the GPU. If you do not have proper error checking for the kernel itself, the error will be reported on the next CUDA API call, here cudaMempcy(), since errors are sticky. Try this:

// Macro to catch CUDA errors in kernel launches.
// Place it immediately after a <<<...>>> launch: the first check catches
// synchronous (pre-launch/configuration) errors via cudaGetLastError(); the
// second synchronizes the device and catches asynchronous execution errors,
// e.g. "unspecified launch failure" from an out-of-bounds access.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

[...]
matrixCalc << <grid, block >> >(dMatA, dMatB, dMatC);
CHECK_LAUNCH_ERROR();
[...]

njuffa,
Thank you for your advice.
When I write “cuda-memcheck” in the code, it causes an error.

I did CHECK_LAUNCH_ERROR() like your code, and it says “unspecified launch failure in line 108”. But in line 108, there is CHECK_LAUNCH_ERROR().
What is that meaning?

Since CHECK_LAUNCH_ERROR() immediately follows the launch of the kernel matrixCalc(), that means that kernel experienced an “unspecified launch error”, and CHECK_LAUNCH_ERROR() caught this. This error is the equivalent of a segfault in host code, meaning you have an out-of-bounds access in your kernel, that is, a bug (or several bugs).

cuda-memcheck is a tool (like gdb, cuda-gdb, or nvprof) that you run from the operating system command line, this is not something you stick in your source code. So if your CUDA program is in ‘foo.cu’, and compiles into an executable (binary) file ‘foo’ (or ‘foo.exe’ if you are on Windows), then, on the command line:

cuda-memcheck foo

Check out the cool documentation: http://docs.nvidia.com/cuda/cuda-memcheck/

It succeeds when I change the grid size,

#define LENGTH 40
#define WIDTH 40
#define DEPTH 40
[...]
dim3 grid(matrixSize / LENGTH, matrixSize / WIDTH, matrixSize / DEPTH);

to

#define LENGTH 10
#define WIDTH 10
#define DEPTH 10
dim3 grid(1, 1, 1);

Then I get the correct answer: hMatC = 3.0000 for each [x,y,z].

I think grid is too big (it is pointed out by txbob first time…), so

int len = blockIdx.x*blockDim.x + threadIdx.x;
	int wid = blockIdx.y*blockDim.y + threadIdx.y;
	int dep = blockIdx.z*blockDim.z + threadIdx.z;

became too big and it cause of out-of-bounds access.
But when

#define LENGTH 40
#define WIDTH 40
#define DEPTH 40

then it failure,and CHECK_LAUNCH_ERROR() says “invalid configuration argument”.
Why it became invalid?

Because you can’t have a threadblock of 40,40,40 thread dimensions.

The maximum is 1024 threads per block, which is the product of the dimensions.

Hmmm…I forgotten about that.
I want to change value of LENGTH,WIDTH,DEPTH,and run this program independent from these value.
Could you teach me what shall I do?

Pass LENGTH, WIDTH, DEPTH as additional parameters to your kernel, or put them into device variables (preferably in constant memory for best performance).

// I made a few small changes. How about this?

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>

#define LENGTH 10
#define WIDTH 10
#define DEPTH 10

// Flatten a 3D coordinate (x fastest-varying) into a linear array offset.
__host__ __device__ inline size_t getIndex(int x, int y, int z) {
  const int planeStride = WIDTH * DEPTH;  // elements per z-slice
  return x + DEPTH * y + planeStride * z;
}

// Element-wise 3D addition C = A + B over an xsize x ysize x zsize volume,
// one thread per element. Threads whose global index falls outside the volume
// exit immediately, so any grid/block shape that covers the volume is valid.
__global__ void matrixCalc(float* inMatA, float* inMatB, float* inMatC, int xsize, int ysize, int zsize) {
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  const int z = blockIdx.z*blockDim.z + threadIdx.z;
  if (x >= xsize || y >= ysize || z >= zsize) return;  // guard the overhang
  const size_t idx = getIndex(x, y, z);
  inMatC[idx] = inMatA[idx] + inMatB[idx];
}

int main(int argc, char** argv) {
  // Compute the matrix size in bytes (LENGTH*WIDTH*DEPTH elements).
  int matrixSize = sizeof(float) * LENGTH*WIDTH*DEPTH;

  // Host-side matrix pointers.
  float* hMatA;
  float* hMatB;
  float* hMatC;

  // Allocate the host input matrices.
  hMatA = (float*)malloc(matrixSize);
  hMatB = (float*)malloc(matrixSize);

  // Initialize: A = 2, B = 1 everywhere, so every C element should be 3.
  int len, row,wid;
  for(row=0;row<DEPTH;row++){
    for (wid = 0; wid < WIDTH; wid++) {
      for (len = 0; len < LENGTH; len++) {
        hMatA[getIndex(len, wid, row)] = 2;
        hMatB[getIndex(len, wid, row)] = 1;
      }
    }
  }

  float* dMatA;
  float* dMatB;
  float* dMatC;

  // NOTE(review): the return codes of the CUDA calls below are unchecked;
  // wrapping them in an error-check macro would surface failures early.
  cudaMalloc((void**)&dMatA, matrixSize);
  cudaMalloc((void**)&dMatB, matrixSize);
  cudaMalloc((void**)&dMatC, matrixSize);

  cudaMemcpy(dMatA, hMatA, matrixSize, cudaMemcpyHostToDevice);
  cudaMemcpy(dMatB, hMatB, matrixSize, cudaMemcpyHostToDevice);

  // Fixed 32x8x4 = 1024-thread blocks (the per-block maximum); the grid is
  // sized by ceiling division so any LENGTH/WIDTH/DEPTH is covered, and the
  // kernel's bounds check discards the overhang threads.
  dim3 block(32, 8, 4);
  dim3 grid((LENGTH+31)/32, (WIDTH+7)/8, (DEPTH+3)/4);

  matrixCalc <<<grid,block>>>(dMatA, dMatB, dMatC, LENGTH, WIDTH, DEPTH);

  cudaDeviceSynchronize();

  hMatC = (float*)malloc(matrixSize);
  cudaMemcpy(hMatC, dMatC, matrixSize, cudaMemcpyDeviceToHost);

  // Print the result, one 2D slice per depth value.
  for (row = 0; row<DEPTH; row++) {
    for (len = 0; len < LENGTH; len++) {
      for (wid = 0; wid < WIDTH; wid++) {
        printf("%f,  ",hMatC[getIndex(len,wid,row)]);
      }
      printf("\n");
    }
    printf("\n\n");
  }

  free(hMatA);
  free(hMatB);
  free(hMatC);

  cudaFree(dMatA);
  cudaFree(dMatB);
  cudaFree(dMatC);

  cudaDeviceReset();

}

It succeeded!!
Thank you so much, episteme!
And thank you all advising me!