Compilation error when using type_traits with an extended lambda expression

I’m compiling a project with CUDA 11 using the extended lambda expression feature, and I found that the following code fails to compile with NVCC (CUDA 11.8):

// test.cu
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <type_traits>

// Element-wise launcher kernel: every thread whose flat global index is
// below n calls func with that index; threads past the end do nothing.
// Expects a 1D grid of 1D blocks (only the .x components are used).
template<typename Lam>
__global__ void map(int n, Lam func) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    // Tail guard: the grid rarely divides n evenly.
    if (gid >= n) {
        return;
    }
    func(gid);
}
// In-place scalar division kernel: src[i] /= f for every i in [0, len).
// Expects a 1D grid of 1D blocks (only the .x components are used); threads
// whose flat index falls at or beyond len return without touching memory.
template<typename S,typename F>
__global__ void diveq(size_t len, S* src, F f) {
    // blockIdx.x and blockDim.x are 32-bit unsigned, so widen BEFORE the
    // multiplication; otherwise the product wraps at 2^32 and the size_t
    // index would silently alias earlier elements on very large launches.
    const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (tid >= len) return;
    src[tid] /= f;
}

// Thin, non-owning view over a contiguous run of `_len` elements of type T
// starting at `_ptr`. The struct never allocates or frees the storage.
template <typename T>
struct array_t
{
    // First element of the viewed range. It is dereferenced inside a
    // __device__ lambda below, so it is presumably a device-accessible
    // pointer — confirm against the caller.
    T *_ptr;
    // Number of elements in the viewed range.
    size_t _len;
    // Wraps an existing buffer; usable from both host and device code.
    __host__ __device__ array_t(T *ptr, size_t len) : _ptr(ptr), _len(len) {}
    // In-place scalar division: divides every element by `f` on the device
    // and returns *this. The second template parameter is a SFINAE guard that
    // removes this overload unless F is an arithmetic type.
    // Blocks the calling host thread until the device has finished.
    // NOTE(review): per the report accompanying this snippet, this
    // enable_if-constrained member template combined with the extended lambda
    // in its body is the construct nvcc 11.8 fails to compile.
    template <typename F, std::enable_if_t<std::is_arithmetic_v<F>, int> = 0>
    __host__ array_t& operator/=(F f)
    {
        // Copy the member into a local so the [=] capture below copies the
        // raw pointer itself instead of going through `this`.
        T *src = _ptr;
        size_t grid_size, block_size = 512;
        // Ceil-division so a partial final block still covers the tail.
        grid_size = (_len + block_size - 1) / block_size;
        // Extended (__device__) lambda; requires nvcc's --extended-lambda flag.
        auto ker = [=] __device__(int eid) { src[eid] /= f; };
        // map takes its element count as int, so _len (size_t) is narrowed here.
        map<<<grid_size, block_size>>>(_len, ker);
        // Wait for the kernel to finish; the returned status is not inspected.
        cudaDeviceSynchronize();
        return (*this);
    }
};

// Minimal driver for array_t: allocates 10000 floats on the device, wraps
// them in an array_t<float>, applies the in-place scalar division once, and
// releases the buffer. The buffer contents are never initialised or read
// back; the function only exercises the operator/= code path.
void testMain(void)
{
    const size_t count = 10000;
    float *pfloats = nullptr;
    // Bail out on allocation failure so no kernel is ever launched on (and
    // cudaFree is never handed) an indeterminate pointer.
    if (cudaMalloc(&pfloats, sizeof(float) * count) != cudaSuccess) {
        return;
    }
    array_t<float> farr(pfloats, count);
    farr /= 2.f;
    cudaFree(pfloats);
}

I ran the command nvcc --std=c++17 --extended-lambda --compile ./test.cu -o test.o and it gave me the following error:

./test.cu: In member function ‘array_t<T>& array_t<T>::operator/=(F)’:
./test.cu:26:102: error: ‘__T0’ was not declared in this scope; did you mean ‘__y0’?
   26 |         auto ker = [=] __device__(int eid) { src[eid] /= f; };
      |                                                                                                      ^   
      |                                                                                                      __y0
./test.cu: In instantiation of ‘array_t<T>& array_t<T>::operator/=(F) [with F = float; int <anonymous> = 0; T = float]’:
./test.cu:38:17:   required from here
./test.cu:26:12: error: could not convert ‘&((array_t<float>*)this)->*operator/=<<template arguments error> >’ from ‘<unresolved overloaded function type>’ to ‘array_t<float>& (array_t<float>::*)(float)’
   26 |         auto ker = [=] __device__(int eid) { src[eid] /= f; };
      |            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                                                                                                                                                                                                                   
./test.cu:26:12: error: could not convert ‘&((array_t<float>*)this)->*operator/=<<template arguments error> >’ from ‘<unresolved overloaded function type>’ to ‘array_t<float>& (array_t<float>::*)(float)’

However, if I replace std::enable_if_t<std::is_arithmetic_v<F>, int> = 0 with int K = 0, or if I replace the lambda expression and the map call with a direct launch of the templated __global__ function, diveq<<<grid_size,block_size>>>(_len, src, f), the code compiles. Why does this happen?

I suggest filing a bug.

Thanks for the help; I have submitted the bug.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.