I’m trying to use thrust::copy_if
function, but getting a runtime error. A reproducible example is given below
#include <iostream>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#define CHECK_CUDA(call) \
{ \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
__LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
struct IsTrue {
__host__ __device__ bool operator()(uint8_t v) { return v == 1; }
};
template <uint32_t block_size>
__global__ void test(uint8_t *oldData, uint8_t *data, uint32_t size,
uint16_t *output) {
__shared__ uint8_t d[block_size];
__shared__ uint8_t isChanged[block_size];
__shared__ uint16_t tPos[block_size];
__shared__ uint16_t cd[block_size];
tPos[threadIdx.x] = threadIdx.x;
d[threadIdx.x] = data[threadIdx.x];
isChanged[threadIdx.x] = data[threadIdx.x] != oldData[threadIdx.x];
__syncthreads();
uint16_t *pos = thrust::copy_if(thrust::device, tPos, tPos + size, isChanged,
cd, IsTrue());
uint32_t compactSize = pos - cd;
if (threadIdx.x < compactSize) {
output[threadIdx.x] = cd[threadIdx.x];
}
}
int main() {
uint8_t *oldData, *data;
uint16_t *output;
CHECK_CUDA(cudaMallocManaged(&oldData, 256));
CHECK_CUDA(cudaMallocManaged(&data, 256));
CHECK_CUDA(cudaMallocManaged(&output, sizeof(uint16_t) * 256));
for (int i = 0; i < 256; ++i) {
oldData[i] = rand() % 3;
data[i] = rand() % 3;
}
test<256><<<1, 256>>>(oldData, data, 128, output);
CHECK_CUDA(cudaDeviceSynchronize());
for (int i = 0; i < 256; ++i) {
if (output[i])
std::cout << i << "," << output[i] << std::endl;
}
}
When built with cuda-11.5
, using the following command,
nvcc -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -rdc=true ./thrust_single_copy_if.cu
It throws out the following error
Cuda error in file './thrust_single_copy_if.cu' in line 55 : operation not supported on global/shared address space.
I’ve tried on both v100
and a100
GPUs and was able to recreate the issue
But if I remove rdc=true
flag, It works without any issue. Removing this flag is not a feasible solution since in the actual use case, I’m compiling multiple cu source files.
Is there a limitation in using the thrust library in multiple compile units? If so is there an alternative solution that can be used to filter values from an array using a stencil similar to thrust::copy_if