Hello zuda:
I wrote a simple program for myself.
I hope this gives a hit.
Please just change inside of “compare()” function.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define M (8)
#define N 50
__device__ int dA[M][M][M];
__device__ unsigned int dResult[2];
__global__ void compare()
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int k = threadIdx.z + blockDim.z * blockIdx.z;
if (dA[k][j][i] > N) {
// atomicInc((unsigned int *)&dResult[1], (M*M*M));
dResult[1]=dResult[1]+1;
}
}
int main(void)
{
int i, j, k;
int A[M][M][M];
int Result[2];
dim3 ThreadPerBlock(M, M, M);
dim3 BlockPerGrid(1, 1, 1);
cudaError_t cuda_ret;
fprintf(stderr, "%d * %d * %d * %d = %d\n", sizeof(int), M, M, M,
sizeof(int) * M * M *M);
Result[0] = Result[1] = 0;
for (k = 0; k < M; k++) {
for (j = 0; j < M; j++) {
for (i = 0; i < M; i++) {
A[k][j][i] = 100 - rand() % 100;
if (A[k][j][i] > N)
Result[0]=Result[0]+1;
}
}
}
printf("Before\n");
for (k = 0; k < M; k++) {
for (j = 0; j < M; j++) {
for (i = 0; i < M; i++) {
printf("%9d", A[k][j][i]);
}
printf("\n");
}
printf("\n");
}
printf("\n");
printf("From host, Greater than %d: %d/%d\n", N, Result[0], M * M * M);
size_t size = M * M * M * sizeof(int);
cuda_ret = cudaMemcpyToSymbol(dA, A, size, 0, cudaMemcpyDefault);
if (cuda_ret != cudaSuccess) {
fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
__FILE__);
exit(1);
}
cuda_ret =
cudaMemcpyToSymbol(dResult, Result, (size_t)(sizeof(int) * 2), 0, cudaMemcpyDefault);
if (cuda_ret != cudaSuccess) {
fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
__FILE__);
exit(1);
}
compare <<< BlockPerGrid, ThreadPerBlock >>> ();
if (cuda_ret != cudaGetLastError()) {
fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
__FILE__);
exit(1);
}
cuda_ret =
cudaMemcpyFromSymbol(Result, dResult, (size_t)(sizeof(int) * 2), 0, cudaMemcpyDefault);
if (cuda_ret != cudaSuccess) {
fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
__FILE__);
exit(1);
}
printf("From CUDA, Greater than %d: %d/%d\n", N, Result[1], M * M * M);
exit(0);
}
$ nvcc -gencode=arch=compute_52,code=compute_52 -O2 cuda-3d-array-atom.cu -o cuda-3d-array-atom
$ ./cuda-3d-array-atom
From host, Greater than 50: 264/512
From CUDA, Greater than 50: 1/512