Hi all,
I’m seeing different behavior from this test code depending on the platform it runs on:
#include <cstdio>
#include <cub/device/device_select.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
// Unary predicate functor: reports whether a value lies strictly below a
// threshold fixed at construction time. Callable from host and device code.
struct LessThan {
    // Threshold that operator() compares its argument against.
    int compare;

    // Stores `threshold` as the comparison bound.
    __host__ __device__ __forceinline__
    LessThan(int threshold)
        : compare(threshold)
    {
    }

    // Returns true when `a` is strictly less than the stored threshold.
    __host__ __device__ __forceinline__
    bool operator()(const int &a) const
    {
        const bool is_below = a < this->compare;
        return is_below;
    }
};
// Kernel: stores the marker value 1234 into the int that `x` points to.
// There is no thread-index guard, so every launched thread performs the
// same store.
__global__ void set_num_selected_out (int *x)
{
    const int marker = 1234;
    x[0] = marker;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_call_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Uploads a small input array to the device, then calls
// cub::DeviceSelect::If with a null d_temp_storage pointer (the CUB
// temp-storage size query) and prints the resulting status and
// temp_storage_bytes.
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main (int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // Must be a compile-time constant: a runtime-sized array cannot take an
    // initializer list in standard C++.
    constexpr int num_items = 8;
    const int h_in[num_items] = {0, 2, 3, 9, 5, 2, 81, 8};
    const size_t data_bytes = num_items * sizeof(int);

    // Start from null so the cleanup at the end is safe on every path.
    int *d_in = nullptr;
    int *d_out = nullptr;
    int *d_num_selected_out = nullptr;

    bool ok =
        cuda_call_ok(cudaMalloc((void**)&d_in, data_bytes), "cudaMalloc(d_in)") &&
        cuda_call_ok(cudaMemcpy(d_in, h_in, data_bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(h_in -> d_in)") &&
        cuda_call_ok(cudaMalloc((void**)&d_out, data_bytes), "cudaMalloc(d_out)") &&
        cuda_call_ok(cudaMalloc((void**)&d_num_selected_out, sizeof(int)),
                     "cudaMalloc(d_num_selected_out)");

    if (ok) {
        LessThan select_op(7);
        void *d_temp_storage = nullptr;
        size_t temp_storage_bytes = 0;

        // The call hands its status back as the return value; looking only at
        // cudaGetLastError() afterwards can miss a failure reported this way.
        cudaError_t status = cub::DeviceSelect::If(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items, select_op);
        if (status == cudaSuccess) {
            status = cudaGetLastError();
        }

        printf("Error: %s\n", cudaGetErrorString(status));
        // %zu is the printf conversion for size_t; %d expects an int.
        printf("temp_storage_bytes: %zu\n", temp_storage_bytes);

        ok = (status == cudaSuccess);
    }

    // Release device memory; cudaFree accepts a null pointer.
    const bool freed_count = cuda_call_ok(cudaFree(d_num_selected_out),
                                          "cudaFree(d_num_selected_out)");
    const bool freed_out = cuda_call_ok(cudaFree(d_out), "cudaFree(d_out)");
    const bool freed_in = cuda_call_ok(cudaFree(d_in), "cudaFree(d_in)");

    return (ok && freed_count && freed_out && freed_in) ? 0 : 1;
}
I have two test systems: an x86 host with A100 GPUs and a Grace-Hopper (H100) system. On the first one, I use the HPC SDK 24.3 module (NVIDIA HPC SDK 24.3 Release | NVIDIA Developer). I compile with
nvcc test_cub.cu -o test.x
When I run this I get
Error: no error
temp_storage_bytes: 767
So far so good. On the Grace-Hopper system, I downloaded the SDK from the same location as above, but obviously the ARM version. I compiled it in the same way and got
Error: no error
temp_storage_bytes: 0
I tried some previous SDK versions, and at least with version 23.3 the results agree. So the GH100 itself doesn’t seem to be the problem; maybe it’s a regression?