Hi, I am trying to use cub::DeviceRadixSort with the simple test program below:
I found that whenever num_items is larger than 4800 (e.g., 4900), the program fails to give the correct result.
When num_items is 4800, the output looks like this (which is expected)
However, when num_items is set to 4900, the output array is not what I expect:
#include <cub/cub.cuh>
#include <cstdio>
// Evaluate a CUDA runtime / CUB call and, if it did not return cudaSuccess,
// print the error with its source location and make main() return 1.
// Every call below is wrapped: with the return codes ignored, a failed
// allocation or kernel launch leaves the output buffers unsorted and the
// program prints them as if nothing had gone wrong.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Sorts num_items (key, value) pairs with cub::DeviceRadixSort::SortPairs and
// prints the sorted keys followed by the permuted values.
// Returns 0 on success, 1 if any CUDA or CUB call reports an error.
int main()
{
    const int num_items = 4800;                       // e.g., 7
    const size_t bytes = num_items * sizeof(uint32_t);

    uint32_t *d_keys_in = nullptr;                    // e.g., [8, 6, 7, 5, 3, 0, 9]
    uint32_t *d_keys_out = nullptr;                   // e.g., [ ... ]
    uint32_t *d_values_in = nullptr;                  // e.g., [0, 1, 2, 3, 4, 5, 6]
    uint32_t *d_values_out = nullptr;                 // e.g., [ ... ]

    // Managed allocations: filled on the host here, read by the sort, and
    // read back on the host for printing.
    CUDA_CHECK(cudaMallocManaged(&d_keys_in, bytes));
    CUDA_CHECK(cudaMallocManaged(&d_keys_out, bytes));
    CUDA_CHECK(cudaMallocManaged(&d_values_in, bytes));
    CUDA_CHECK(cudaMallocManaged(&d_values_out, bytes));

    // Keys are descending (num_items .. 1); values record the original index.
    for (int i = 0; i < num_items; i++)
    {
        d_keys_in[i] = static_cast<uint32_t>(num_items - i);
        d_values_in[i] = static_cast<uint32_t>(i);
    }

    // First call with a null workspace only reports the required size in
    // temp_storage_bytes; no sorting is performed.
    void *d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    CUDA_CHECK(cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                               d_keys_in, d_keys_out,
                                               d_values_in, d_values_out, num_items));

    CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));

    // Second call performs the sort.
    CUDA_CHECK(cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                               d_keys_in, d_keys_out,
                                               d_values_in, d_values_out, num_items));

    // Launch-configuration errors surface through cudaGetLastError(); errors
    // raised while the kernels run surface at the synchronizing call.
    // NOTE(review): wrong output only above a certain num_items is consistent
    // with one of the sort kernels failing to launch or run (for example a
    // binary built for a different GPU architecture) - confirm from the
    // message printed here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // The buffers hold uint32_t, so print them as unsigned.
    for (int i = 0; i < num_items; i++)
        printf("%u ", d_keys_out[i]);
    printf("\n");
    for (int i = 0; i < num_items; i++)
        printf("%u ", d_values_out[i]);
    printf("\n");

    // Release the workspace too; the original leaked it.
    CUDA_CHECK(cudaFree(d_temp_storage));
    CUDA_CHECK(cudaFree(d_keys_in));
    CUDA_CHECK(cudaFree(d_keys_out));
    CUDA_CHECK(cudaFree(d_values_in));
    CUDA_CHECK(cudaFree(d_values_out));
    return 0;
}
I wonder if I’m using this function the wrong way. My CUDA version is 12.6 and my nvcc version is 12.5.
Thanks in advance.