method atomic and concurent write [résolu]

Hi,

I have a question about concurent write in Cuda. For a problem where there are some concurrent write, we must use atomic function to insure the correctness of the concurrent data. But the atomic functions impact drasticaly the performance of the program. And in my case, I’m not sure it is necessary.

I just want to test if in my array, there some value > N. To do this with atomic function, I just test if (array[i] > N) then atomicAdd(testRes, true).
But in this case, all my thread just set my data testRes to the same value = true, only if the condition I want to test is true. So, without atomic function, could my result be incoherent ?
Could testRes don’t change value while there are some value in my array > N ?

I test my kernel without atomic function, and it seems it works, but I don’t know if it correctness is insured for all the case.

Appendix G. COMPUTE CAPABILITIES of the programming guide discusses shared memory for each compute capability, and in particular what happens in the case of multiple threads writing to the same address

based on that, i would like to think that the case you describe is safe, particularly for the newer cc’s

you could also warp vote (__any(), etc), and have a single thread per warp write to the address; this already should reduce the number of atomics from one per thread to one per warp

Thanks,

So I will use my test without atomic function if it is safe

Hello zuda:

I wrote a simple program for myself.
I hope this gives a hit.
Please just change inside of “compare()” function.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>

#define M (8)
#define N 50

__device__ int dA[M][M][M];
__device__ unsigned int dResult[2];

__global__ void compare()
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int j = threadIdx.y + blockDim.y * blockIdx.y;
    int k = threadIdx.z + blockDim.z * blockIdx.z;

    if (dA[k][j][i] > N) {
//      atomicInc((unsigned int *)&dResult[1], (M*M*M));
        dResult[1]=dResult[1]+1;
     }

}

int main(void)
{
    int i, j, k;
    int A[M][M][M];
    int Result[2];

    dim3 ThreadPerBlock(M, M, M);
    dim3 BlockPerGrid(1, 1, 1);
    cudaError_t cuda_ret;

    fprintf(stderr, "%d * %d * %d * %d = %d\n", sizeof(int), M, M, M,
            sizeof(int) * M * M *M);

    Result[0] = Result[1] = 0;

    for (k = 0; k < M; k++) {
        for (j = 0; j < M; j++) {
            for (i = 0; i < M; i++) {
                A[k][j][i] = 100 - rand() % 100;
                if (A[k][j][i] > N)
                    Result[0]=Result[0]+1;
            }
        }
    }

    printf("Before\n");
    for (k = 0; k < M; k++) {
        for (j = 0; j < M; j++) {
            for (i = 0; i < M; i++) {
                printf("%9d", A[k][j][i]);
            }
            printf("\n");
        }
        printf("\n");
    }
    printf("\n");
    printf("From host, Greater than %d: %d/%d\n", N, Result[0], M * M * M);

    size_t size = M * M * M * sizeof(int);

    cuda_ret = cudaMemcpyToSymbol(dA, A, size, 0, cudaMemcpyDefault);
    if (cuda_ret != cudaSuccess) {
        fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
                cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
                __FILE__);
        exit(1);
    }
    cuda_ret =
        cudaMemcpyToSymbol(dResult, Result, (size_t)(sizeof(int) * 2), 0, cudaMemcpyDefault);
    if (cuda_ret != cudaSuccess) {
        fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
                cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
                __FILE__);
        exit(1);
    }

    compare <<< BlockPerGrid, ThreadPerBlock >>> ();
    if (cuda_ret != cudaGetLastError()) {
        fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
                cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
                __FILE__);
        exit(1);
    }

    cuda_ret =
        cudaMemcpyFromSymbol(Result, dResult, (size_t)(sizeof(int) * 2), 0, cudaMemcpyDefault);
    if (cuda_ret != cudaSuccess) {
        fprintf(stderr, "[cudaError] %s (%d) at line:%d, %s\n",
                cudaGetErrorString(cuda_ret), cuda_ret, __LINE__,
                __FILE__);
        exit(1);
    }


    printf("From CUDA, Greater than %d: %d/%d\n", N, Result[1], M * M * M);

    exit(0);

}

nvcc -gencode=arch=compute_52,code=compute_52 -O2 cuda-3d-array-atom.cu -o cuda-3d-array-atom ./cuda-3d-array-atom
From host, Greater than 50: 264/512
From CUDA, Greater than 50: 1/512