CUB complex number reduction iterator_traits problem

Hello all,

I have implemented a reduction over complex numbers using CUDA Thrust, but would now like to implement a version using the CUB library. I am using CUDA 11.4 and have the following code in the file myReductions.cu:

#include <iostream>
#include <cub/cub.cuh>
#include <cuComplex.h>

// Binary functor for the reduction: complex multiplication of two cuComplex
// values, using .x as the real part and .y as the imaginary part.
struct ComplexProdFloat {
    // Returns a * b. Both operands are taken by const reference and are left
    // unmodified, so the functor has no side effects on the values it is given
    // and can also be invoked on const operands.
    __device__ __forceinline__
    cuComplex operator()(const cuComplex &a, const cuComplex &b) const {
        cuComplex result;
        result.x = a.x * b.x - a.y * b.y;
        result.y = a.x * b.y + a.y * b.x;
        return result;
    }
};

// Copies the host array `a` to the device and runs cub::DeviceReduce::Reduce
// over it with ComplexProdFloat as the reduction operator.
// PARAM: a is HOST-side allocated array
// PARAM: N is length of a array
cuComplex reduceCUB(cuComplex *a, const size_t N) {
    ComplexProdFloat prod;
    cuComplex init;
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    // NOTE(review): this seeds the reduction with 1+1i; the multiplicative
    // identity would be 1+0i -- confirm which one is intended.
    init.x = init.y = 1;

    // NOTE(review): only d_a is a pointer here; d_prod is declared as a plain
    // cuComplex value, yet its address is handed to cudaMalloc below.
    cuComplex *d_a, d_prod;
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_a), sizeof(cuComplex) * N));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_prod), sizeof(cuComplex)));
    CUDA_CHECK(cudaMemcpy(d_a, a, N * sizeof(cuComplex), cudaMemcpyHostToDevice));

    // This is where the error occurs when compiling
    // NOTE(review): d_test is not declared anywhere in this snippet. The
    // reported template arguments list the output argument as cuComplex rather
    // than cuComplex *, so presumably it is declared as a value like d_prod --
    // confirm against the full code.
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_test, N, prod, init);
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_temp_storage), temp_storage_bytes));

    ...

    // NOTE(review): this passes the address of the local variable
    // d_temp_storage to cudaFree, not the device pointer stored in it.
    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(&d_temp_storage)));

    // NOTE(review): h_sum is not declared in the visible part of the snippet.
    return h_sum;

The above code is just a snippet of the full code but exposes the point at which the compilation fails. This failure occurs at the line containing cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_test, N, prod, init);

The error that is output when compiling follows:

/usr/local/cuda/bin/../targets/x86_64-linux/include/cub/device/dispatch/dispatch_reduce.cuh(325): error: class "std::iterator_traits" has no member "value_type"
detected during:
processing of template argument list for "cub::DispatchReduce" based on template arguments <cuComplex *, cuComplex, OffsetT, ComplexProdFloat>
/usr/local/cuda/bin/../targets/x86_64-linux/include/cub/device/device_reduce.cuh(162):

I am hoping this is a simple oversight on my part regarding the use of CUB, but I am unsure. Can anyone assist by providing some hints/help?

Thanks

P.S. I am using C++17.

You might also want to post the issue on CUB’s Github.

I didn’t seem to have any trouble with what you have shown on CUDA 11.4. My attempt to build a complete code out of it:

$ cat t2114.cu
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cub/cub.cuh>
#include <cuComplex.h>
// Evaluates a call that yields a cudaError_t. On anything other than
// cudaSuccess it prints the file, line and CUDA error string to stderr and
// aborts, instead of silently discarding the status.
#define CUDA_CHECK(x)                                                          \
    do {                                                                       \
        cudaError_t cuda_check_err_ = (x);                                     \
        if (cuda_check_err_ != cudaSuccess) {                                  \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                         cudaGetErrorString(cuda_check_err_));                 \
            std::abort();                                                      \
        }                                                                      \
    } while (0)
// Binary functor for the reduction: complex multiplication of two cuComplex
// values, using .x as the real part and .y as the imaginary part.
struct ComplexProdFloat {
    // Returns a * b. Both operands are taken by const reference and are left
    // unmodified, so the functor has no side effects on the values it is given
    // and can also be invoked on const operands.
    __device__ __forceinline__
    cuComplex operator()(const cuComplex &a, const cuComplex &b) const {
        cuComplex result;
        result.x = a.x * b.x - a.y * b.y;
        result.y = a.x * b.y + a.y * b.x;
        return result;
    }
};

// Copies the host array `a` to the device, reduces it with
// cub::DeviceReduce::Reduce using ComplexProdFloat as the operator, and
// returns the single reduced value to the host. All device allocations made
// here are released before returning.
// PARAM: a is HOST-side allocated array
// PARAM: N is length of a array
// RETURNS: the reduction result copied back from device memory
cuComplex reduceCUB(cuComplex *a, const size_t N) {
    ComplexProdFloat prod;
    cuComplex init;
    cuComplex h_prod;

    // NOTE(review): this seeds the reduction with 1+1i, which is not the
    // multiplicative identity 1+0i, so the returned value is the product of
    // the inputs multiplied by (1+i). Kept unchanged; confirm whether 1+0i
    // was intended.
    init.x = init.y = 1;

    cuComplex *d_a = NULL;     // device copy of the input
    cuComplex *d_test = NULL;  // device slot receiving the single reduced value
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_a), sizeof(cuComplex) * N));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_test), sizeof(cuComplex)));
    CUDA_CHECK(cudaMemcpy(d_a, a, N * sizeof(cuComplex), cudaMemcpyHostToDevice));

    // First call with d_temp_storage == NULL: used here to obtain
    // temp_storage_bytes so the scratch buffer can be allocated.
    CUDA_CHECK(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_test, N, prod, init));
    CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    // Second call with the scratch buffer in place performs the reduction.
    CUDA_CHECK(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_test, N, prod, init));

    CUDA_CHECK(cudaMemcpy(&h_prod, d_test, sizeof(cuComplex), cudaMemcpyDeviceToHost));

    // Release every device buffer allocated above.
    CUDA_CHECK(cudaFree(d_temp_storage));
    CUDA_CHECK(cudaFree(d_test));
    CUDA_CHECK(cudaFree(d_a));
    return h_prod;
}

// Demo driver: fills a host array of N elements with 1+0i, reduces it with
// reduceCUB and prints the result as "real,imag".
int main(){
    int N = 2;
    cuComplex *h_a = new cuComplex[N];
    for (int i = 0; i < N; i++ ) {h_a[i].x = 1.0f; h_a[i].y = 0.0f;}
    cuComplex r = reduceCUB(h_a, N);
    delete[] h_a;  // host input is no longer needed once the reduction has returned
    std::cout << r.x << "," << r.y << std::endl;
    return 0;
}
$ nvcc -std=c++17 t2114.cu -o t2114
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Wed_Jul_14_19:41:19_PDT_2021
Cuda compilation tools, release 11.4, V11.4.100
Build cuda_11.4.r11.4/compiler.30188945_0
$ g++ --version
g++ (GCC) 7.3.1 20180303 (Red Hat 7.3.1-5)
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
$ compute-sanitizer ./t2114
========= COMPUTE-SANITIZER
1,1
========= ERROR SUMMARY: 0 errors
$

Please review it for the changes. I made changes to your code (it is incomplete, your usage of d_prod is incorrect and irrelevant, your cudaFree statement was incorrect, etc.) but nothing that I think would have a material impact on that compile error.

In the future, I’ll be less likely to go to this effort if you supply an incomplete code. Please supply a complete code for these questions, just as I have supplied a complete code in my answer. It will make the exchanges more efficient. Thanks.

1 Like

Thanks @Robert_Crovella for the help.

Sorry about the incomplete code. I was just trying to show where the error was occurring in the code.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.