I’m compiling a project using CUDA 11 with the feature of extended lambda expression, I found that the following code cannot pass the compilation using NVCC (CUDA 11.8)
// test.cu
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <type_traits>
template<typename Lam>
__global__ void map(int n, Lam func) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
func(tid);
}
}
template<typename S,typename F>
__global__ void diveq(size_t len, S* src, F f) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid >= len) return;
src[tid] /= f;
}
template <typename T>
struct array_t
{
T *_ptr;
size_t _len;
__host__ __device__ array_t(T *ptr, size_t len) : _ptr(ptr), _len(len) {}
template <typename F, std::enable_if_t<std::is_arithmetic_v<F>, int> = 0>
__host__ array_t& operator/=(F f)
{
T *src = _ptr;
size_t grid_size, block_size = 512;
grid_size = (_len + block_size - 1) / block_size;
auto ker = [=] __device__(int eid) { src[eid] /= f; };
map<<<grid_size, block_size>>>(_len, ker);
cudaDeviceSynchronize();
return (*this);
}
};
void testMain(void)
{
float *pfloats;
cudaMalloc(&pfloats, sizeof(float) * 10000);
array_t<float> farr(pfloats, 10000);
farr /= 2.f;
cudaFree(pfloats);
}
I run the command nvcc --std=c++17 --extended-lambda --compile ./test.cu -o test.o
and it gives me error :
./test.cu: In member function ‘array_t<T>& array_t<T>::operator/=(F)’:
./test.cu:26:102: error: ‘__T0’ was not declared in this scope; did you mean ‘__y0’?
26 | auto ker = [=] __device__(int eid) { src[eid] /= f; };
| ^
| __y0
./test.cu: In instantiation of ‘array_t<T>& array_t<T>::operator/=(F) [with F = float; int <anonymous> = 0; T = float]’:
./test.cu:38:17: required from here
./test.cu:26:12: error: could not convert ‘&((array_t<float>*)this)->*operator/=<<template arguments error> >’ from ‘<unresolved overloaded function type>’ to ‘array_t<float>& (array_t<float>::*)(float)’
26 | auto ker = [=] __device__(int eid) { src[eid] /= f; };
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
./test.cu:26:12: error: could not convert ‘&((array_t<float>*)this)->*operator/=<<template arguments error> >’ from ‘<unresolved overloaded function type>’ to ‘array_t<float>& (array_t<float>::*)(float)’
But if I substitute std::enable_if_t<std::is_arithmetic_v<F>, int> = 0
with int K = 0
, Or, if I replace the lambda expression and map function with a template __global__
function call diveq<<<grid_size,block_size>>>(_len, src, f)
, it passes the compilation,. Why does it happen?