I have the following code:
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
struct debug_data {
float value;
int index;
float address_value;
int address_index;
float old;
};
__device__ __forceinline__ int atomicMaxFloatWithIndex(float* addr, float value, int* maxIndex, int valueIndex, int arraySize, debug_data* debugArray) {
float old;
old = (value >= 0) ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));
// Determine the index of the maximum value atomically
int prevMaxIndex = atomicExch(maxIndex, (old == value) ? *maxIndex : valueIndex);
if (valueIndex < arraySize) {
debugArray[valueIndex].value = value;
debugArray[valueIndex].index = valueIndex;
debugArray[valueIndex].address_value = *addr;
debugArray[valueIndex].address_index = prevMaxIndex;
debugArray[valueIndex].old = old;
}
return *maxIndex;
}
__global__ void reduceKernel(float* data, int* indexArr, int* maxIndices, int arraySize, debug_data* debugArray) {
int tid = threadIdx.x;
int index = indexArr[tid];
float value = data[index];
int* maxIndex = &maxIndices[tid];
atomicMaxFloatWithIndex(&data[tid], value, maxIndex, index, arraySize, debugArray);
}
int main(void) {
const int arraySize = 10;
float x[arraySize] = {10.0, 13.0, 8.0, 3.0, 5.0, 9.0, 10.0, 6.0, 19.0, 20.0};
int idx[arraySize] = {0, 0, 1, 2, 3, 3, 1, 3, 0, 0};
int maxIndicesSize = arraySize;
int* h_maxIndices = new int[maxIndicesSize];
int* d_maxIndices;
debug_data* d_debugArray;
float* d_x;
int* d_idx;
cudaMalloc((void**)&d_x, arraySize * sizeof(float));
cudaMalloc((void**)&d_idx, arraySize * sizeof(int));
cudaMalloc((void**)&d_maxIndices, maxIndicesSize * sizeof(int));
cudaMalloc((void**)&d_debugArray, arraySize * sizeof(debug_data));
cudaMemset(d_maxIndices, 0, maxIndicesSize * sizeof(int));
cudaMemcpy(d_x, x, arraySize * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_idx, idx, arraySize * sizeof(int), cudaMemcpyHostToDevice);
reduceKernel<<<1, arraySize>>>(d_x, d_idx, d_maxIndices, arraySize, d_debugArray);
// Copy the updated d_x back to the host
cudaMemcpy(x, d_x, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
// Copy the initialized d_maxIndices back to the host
cudaMemcpy(h_maxIndices, d_maxIndices, maxIndicesSize * sizeof(int), cudaMemcpyDeviceToHost);
debug_data h_debugArray[arraySize];
cudaMemcpy(h_debugArray, d_debugArray, arraySize * sizeof(debug_data), cudaMemcpyDeviceToHost);
// Print x and maxIndices
printf("x: ");
for (int i = 0; i < arraySize; i++) {
printf("%.2f ", x[i]);
}
printf("\n");
printf("maxIndices: ");
for (int i = 0; i < maxIndicesSize; i++) {
printf("%d ", h_maxIndices[i]);
}
printf("\n");
for (int i = 0; i < arraySize; i++) {
printf("Debug Data[%d]: value = %f, index = %d, address_value = %f, address_index = %d, old = %f\n",
i, h_debugArray[i].value, h_debugArray[i].index, h_debugArray[i].address_value, h_debugArray[i].address_index, h_debugArray[i].old);
}
cudaFree(d_x);
cudaFree(d_idx);
// Don't forget to free the device memory when you're done
cudaFree(d_maxIndices);
delete[] h_maxIndices; // Free the host array
cudaFree(d_debugArray);
return 0;
}
But I get this as a result:
x: 10.00 13.00 13.00 8.00 5.00 9.00 13.00 6.00 19.00 20.00
maxIndices: 0 0 1 2 3 3 1 3 0 0
Debug Data[0]: value = 10.000000, index = 0, address_value = 10.000000, address_index = 0, old = 10.000000
Debug Data[1]: value = 13.000000, index = 1, address_value = 13.000000, address_index = 0, old = 8.000000
Debug Data[2]: value = 8.000000, index = 2, address_value = 8.000000, address_index = 0, old = 3.000000
Debug Data[3]: value = 3.000000, index = 3, address_value = 5.000000, address_index = 0, old = 5.000000
Debug Data[4]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[5]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[6]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[7]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[8]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
Debug Data[9]: value = 0.000000, index = 0, address_value = 0.000000, address_index = 0, old = 0.000000
I expect x to become:
x = {20,10,3,4,0,0,0,0,0,0}
But this is unfortunately not the case. I cannot figure out why, am I doing something wrong?