Unexpected result running basic kernel code

I have the following code:

#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

struct debug_data {
    float value;
    int index;
    float address_value;
    int address_index;
    float old;
};

__device__ __forceinline__ int atomicMaxFloatWithIndex(float* addr, float value, int* maxIndex, int valueIndex, int arraySize, debug_data* debugArray) {
    float old;

    old = (value >= 0) ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) :
            __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));


    // Determine the index of the maximum value atomically
    int prevMaxIndex = atomicExch(maxIndex, (old == value) ?  *maxIndex : valueIndex);

    
    if (valueIndex < arraySize) {
        debugArray[valueIndex].value = value;
        debugArray[valueIndex].index = valueIndex;
        debugArray[valueIndex].address_value = *addr;
        debugArray[valueIndex].address_index = prevMaxIndex;
        debugArray[valueIndex].old = old;
    }

    return *maxIndex;
}

__global__ void reduceKernel(float* data, int* indexArr, int* maxIndices, int arraySize, debug_data* debugArray) {
    int tid = threadIdx.x;
    int index = indexArr[tid];
    float value = data[index];
    int* maxIndex = &maxIndices[tid];

    atomicMaxFloatWithIndex(&data[tid], value, maxIndex, index,  arraySize, debugArray);
}


int main(void) {
    const int arraySize = 10;
    float x[arraySize] = {10.0, 13.0, 8.0, 3.0, 5.0, 9.0, 10.0, 6.0, 19.0, 20.0};
    int idx[arraySize] = {0, 0, 1, 2, 3, 3, 1, 3, 0, 0};
    int maxIndicesSize = arraySize;
    int* h_maxIndices = new int[maxIndicesSize];
    int* d_maxIndices;
    debug_data* d_debugArray;



    float* d_x;
    int* d_idx;

    cudaMalloc((void**)&d_x, arraySize * sizeof(float));
    cudaMalloc((void**)&d_idx, arraySize * sizeof(int));
    cudaMalloc((void**)&d_maxIndices, maxIndicesSize * sizeof(int));
    cudaMalloc((void**)&d_debugArray, arraySize * sizeof(debug_data));
    cudaMemset(d_maxIndices, 0, maxIndicesSize * sizeof(int));


    cudaMemcpy(d_x, x, arraySize * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_idx, idx, arraySize * sizeof(int), cudaMemcpyHostToDevice);

    reduceKernel<<<1, arraySize>>>(d_x, d_idx, d_maxIndices, arraySize, d_debugArray);

    // Copy the updated d_x back to the host
    cudaMemcpy(x, d_x, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    // Copy the initialized d_maxIndices back to the host
    cudaMemcpy(h_maxIndices, d_maxIndices, maxIndicesSize * sizeof(int), cudaMemcpyDeviceToHost);

    debug_data h_debugArray[arraySize];
    cudaMemcpy(h_debugArray, d_debugArray, arraySize * sizeof(debug_data), cudaMemcpyDeviceToHost);


    // Print x and maxIndices
    printf("x: ");
    for (int i = 0; i < arraySize; i++) {
        printf("%.2f ", x[i]);
    }
    printf("\n");

    printf("maxIndices: ");
    for (int i = 0; i < maxIndicesSize; i++) {
        printf("%d ", h_maxIndices[i]);
    }
    printf("\n");

    for (int i = 0; i < arraySize; i++) {
        printf("Debug Data[%d]: value = %f, index = %d, address_value = %f, address_index = %d, old = %f\n",
            i, h_debugArray[i].value, h_debugArray[i].index, h_debugArray[i].address_value, h_debugArray[i].address_index, h_debugArray[i].old);
    }

    cudaFree(d_x);
    cudaFree(d_idx);
    // Don't forget to free the device memory when you're done
    cudaFree(d_maxIndices);
    delete[] h_maxIndices;  // Free the host array
    cudaFree(d_debugArray);

    return 0;
}

But I get this as a result:

x: 10.00 13.00 13.00 8.00 5.00 9.00 13.00 6.00 19.00 20.00 
maxIndices: 0 0 1 2 3 3 1 3 0 0 
Debug Data[0]: value = 10.000000, index = 0, address_value = 10.000000, address_index = 0, old = 10.000000
Debug Data[1]: value = 13.000000, index = 1, address_value = 13.000000, address_index = 0, old = 8.000000
Debug Data[2]: value = 8.000000, index = 2, address_value = 8.000000, address_index = 0, old = 3.000000
Debug Data[3]: value = 3.000000, index = 3, address_value = 5.000000, address_index = 0, old = 5.000000
Debug Data[4]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[5]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[6]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[7]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[8]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[9]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000

I expect x to become:

x = {20,10,3,4,0,0,0,0,0,0}

But this is unfortunately not the case. I cannot figure out why, am I doing something wrong?

Your expectations are not reasonable.

Each thread is doing exactly one atomic, and each thread atomic is operating on a separate location in the x array.

Thread 0 does an atomic update to location 0. Thread 1 does an atomic update to location 1, etc.

So considering thread 0, the location it is updating is location 0. The value originally in that location is 10. The value supplied to the atomic update is:

The index supplied to thread 0 is 0. Therefore, thread 0 attempts to update location 0, using the value in location 0. Therefore no change will be made to location 0. Location 0 started with 10, it will end up with 10. Your expectation of a 20 showing up there is not sensible.

No other thread does an atomic update to location 0.

Finally, another point to be made, although its not the crux of the issue at the moment:

CUDA specifies no particular order of thread execution. The proper mental model is that threads may execute in any order, regardless of whatever else you may have heard on this topic.

Since you have each thread reading from a location in the x array:

float value = data[index];

that another thread is writing to via the atomic, you have a race condition. Your code is broken because any ordering is possible. When a thread reads data[index], its’ possible that another thread has written to it, and its also possible that another thread has not yet written to it, but will later. It’s almost certainly the case that this is not your algorithm intent, and could not produce sensible or predictable results.

I changed the code to:

#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

struct debug_data {
    float value;
    int index;
    float address_value;
    int address_index;
    float old;
};

__device__ __forceinline__ int atomicMaxFloatWithIndex(float* addr, float value, int* maxIndex, int valueIndex, int arraySize, debug_data* debugArray) {
    float old;

    old = (value >= 0) ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) :
            __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));


    // Determine the index of the maximum value atomically
    int prevMaxIndex = atomicExch(maxIndex, (old == value) ?  *maxIndex : valueIndex);

    
    if (valueIndex < arraySize) {
        debugArray[valueIndex].value = value;
        debugArray[valueIndex].index = valueIndex;
        debugArray[valueIndex].address_value = *addr;
        debugArray[valueIndex].address_index = *maxIndex;
        debugArray[valueIndex].old = old;
    }

    return *maxIndex;
}

__global__ void reduceKernel(float* data,float* target ,int* indices,int* indexArr, int* maxIndices, int arraySize, debug_data* debugArray) {
    int tid = threadIdx.x;
    int index = indexArr[tid];
    float value = data[tid];
    int* maxIndex = &maxIndices[tid];
    int x_index = indices[tid];

    atomicMaxFloatWithIndex(&target[index], value, maxIndex, x_index,  arraySize, debugArray);
}


int main(void) {
    const int arraySize = 10;
    float x[arraySize] = {10.0, 13.0, 8.0, 3.0, 5.0, 9.0, 10.0, 6.0, 19.0, 20.0};
    int idx[arraySize] = {0, 0, 1, 2, 3, 3, 1, 3, 0, 0};
    int maxIndicesSize = arraySize;
    float t[arraySize] = {0.0};  // Initialize 't' with zeros
    int indices[arraySize]; // Initialize 'indices' array
    int* h_maxIndices = new int[maxIndicesSize];
    float* d_t;  // Device memory for t
    int* d_maxIndices;
    int* d_indices;
    debug_data* d_debugArray;

    // Initialize 'indices' array
    for (int i = 0; i < arraySize; i++) {
        indices[i] = i;
    }


    float* d_x;
    int* d_idx;

    cudaMalloc((void**)&d_t, arraySize * sizeof(float)); // Allocate device memory for t
    cudaMemcpy(d_t, t, arraySize * sizeof(float), cudaMemcpyHostToDevice); // Copy t to device
    cudaMalloc((void**)&d_x, arraySize * sizeof(float));
    cudaMalloc((void**)&d_idx, arraySize * sizeof(int));
    cudaMalloc((void**)&d_maxIndices, maxIndicesSize * sizeof(int));
    cudaMalloc((void**)&d_indices, arraySize * sizeof(int)); // Allocate device memory for indices
    cudaMalloc((void**)&d_debugArray, arraySize * sizeof(debug_data));
    cudaMemset(d_maxIndices, 0, maxIndicesSize * sizeof(int));

    // Copy the 'indices' array to the device
    cudaMemcpy(d_indices, indices, arraySize * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, arraySize * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_idx, idx, arraySize * sizeof(int), cudaMemcpyHostToDevice);

    reduceKernel<<<1, arraySize>>>(d_x,d_t,d_indices ,d_idx, d_maxIndices, arraySize, d_debugArray);

    // Copy the updated d_x back to the host
    cudaMemcpy(t, d_t, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    // Copy the initialized d_maxIndices back to the host
    cudaMemcpy(h_maxIndices, d_maxIndices, maxIndicesSize * sizeof(int), cudaMemcpyDeviceToHost);

    debug_data h_debugArray[arraySize];
    cudaMemcpy(h_debugArray, d_debugArray, arraySize * sizeof(debug_data), cudaMemcpyDeviceToHost);


    // Print x and maxIndices
    printf("t: ");
    for (int i = 0; i < arraySize; i++) {
        printf("%.2f ", t[i]);
    }
    printf("\n");

    printf("maxIndices: ");
    for (int i = 0; i < maxIndicesSize; i++) {
        printf("%d ", h_maxIndices[i]);
    }
    printf("\n");

    for (int i = 0; i < arraySize; i++) {
        printf("Debug Data[%d]: value = %f, index = %d, address_value = %f, address_index = %d, old = %f\n",
            i, h_debugArray[i].value, h_debugArray[i].index, h_debugArray[i].address_value, h_debugArray[i].address_index, h_debugArray[i].old);
    }

    cudaFree(d_x);
    cudaFree(d_idx);
    // Don't forget to free the device memory when you're done
    cudaFree(d_maxIndices);
    delete[] h_maxIndices;  // Free the host array
    cudaFree(d_debugArray);

    return 0;
}

Now it does give the desired result. And the result is quite deterministic.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.