Thrust conflict with pcl (cudaErrorIllegalAddress)

Error

After running any thrust function, I am not able to build a pcl::gpu::Octree::PointCloud, and got the error.

terminate called after throwing an instance of 'thrust::system::system_error'
  what():  parallel_for: failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered
Aborted (core dumped)

the code: src/thrust_pcl_conflict.cu

  1. If I don’t run any of thrust function, the code work well. (comment out run_thrust and run_thrust_stream in the main function)
  2. Both run_thrust and run_thrust_stream function will cause the error at the line octree_device.build();
  3. if the num_points set to 1000, the error will not occur, but 10000 would cause the error.
  4. I have tried to check the memory. Since thrust vector is inside the function, and nothing to be used or passed outside the function, it would be freed after the the usage of the function (run_thrust or run_thrust_stream). The memory usage print confirmed this, no any memory used by thrust after the usage of the function.
#include <iostream>
#include <pcl/point_cloud.h>
#include <pcl/point_types.h>
#include <pcl/gpu/octree/octree.hpp>
#include <pcl/gpu/containers/device_array.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

__global__ void initialize_points_pointxyz_kernel(int num_points, pcl::PointXYZ* output) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_points) {
        output[idx].x = static_cast<float>(idx);
        output[idx].y = static_cast<float>(idx);
        output[idx].z = static_cast<float>(idx);
    }
}

void initialize_points_pointxyz(int num_points, pcl::PointXYZ* d_points){
    int threads_per_block = 256;
    int num_blocks = (num_points + threads_per_block - 1) / threads_per_block;
    initialize_points_pointxyz_kernel<<<num_blocks, threads_per_block>>>(num_points, d_points);
    cudaDeviceSynchronize();
}

void run_thrust(int num_points){
    thrust::device_vector<float> d_vec(num_points, 0.0f);
    float error_flag = thrust::reduce(d_vec.begin(), d_vec.end(), -FLT_MAX, thrust::maximum<float>());
}

void run_thrust_stream(int num_points){
    cudaStream_t stream_thrust;
    cudaStreamCreate(&stream_thrust);
    thrust::device_vector<float> d_vec(num_points, 0.0f);
    float error_flag = thrust::reduce(thrust::cuda::par.on(stream_thrust), d_vec.begin(), d_vec.end(), -FLT_MAX, thrust::maximum<float>());
    cudaStreamSynchronize(stream_thrust);
    cudaStreamDestroy(stream_thrust);
}

void report_mem(size_t allocd, bool first=false)
{
    size_t freeMem, totalMem;
    cudaMemGetInfo(&freeMem, &totalMem);
    if (first) 
        std::cout << "Allocated | Total Memory | Free Memory "<< std::endl;
    std::cout << allocd << ", " << totalMem << ", " << freeMem << std::endl;
}

int main(){
    int num_points = 10000; // set to 1000, main function can run successfully
    report_mem(0, true); // 0, 17063280640, 16947806208
    
//     run_thrust(num_points); // comment out this and next line, main function can run successfully
    run_thrust_stream(num_points);
    report_mem(0); // 0, 17063280640, 16947806208

    pcl::PointXYZ* d_points_pointxyz;
    cudaMalloc(&d_points_pointxyz, num_points * sizeof(pcl::PointXYZ));
    initialize_points_pointxyz(num_points, d_points_pointxyz);
    report_mem(0); // 0, 17063280640, 16945709056
    
    pcl::gpu::Octree octree_device;
    pcl::gpu::Octree::PointCloud d_cloud_pointxyz(d_points_pointxyz, num_points);
    octree_device.setCloud(d_cloud_pointxyz);
    octree_device.build();
    report_mem(0);

    cudaFree(d_points_pointxyz);
    report_mem(0);
}

Compile: CmakeLists.txt

# Minimum CMake version required
cmake_minimum_required(VERSION 3.10)

# Project name and language
project(PCCudaFilter LANGUAGES C CXX CUDA)

# Specify C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Find package
find_package(PCL 1.14 REQUIRED)
find_package(CUDA REQUIRED)

# Include directories
include_directories(${CUDA_INCLUDE_DIRS})

# CREATE EXECUTABLE
add_executable(thrust_pcl_conflict src/thrust_pcl_conflict.cu) 
target_include_directories(thrust_pcl_conflict PRIVATE ${PCL_INCLUDE_DIRS})
target_link_libraries(thrust_pcl_conflict PRIVATE ${PCL_LIBRARIES})

nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Mon_Oct_24_19:12:58_PDT_2022
Cuda compilation tools, release 12.0, V12.0.76
Build cuda_12.0.r12.0/compiler.31968024_0

Hardware

Has been tested on A4000 16GB and P5000 16GB.

It could still be a memory issue such as fragmentation.

If it were my code, probably the first thing I would do is to run the test case under compute-sanitizer, in particular to see if any and what kind of errors are reported during the pcl call(s). In particular, to see if there are any CUDA-detectable errors that occur prior to the cudaErrorIllegalAddress. A device allocation failure, if not properly handled, could lead to an illegal address error.

Thanks for you advice, I have checked the output of compute-sanitizer ./thrust_pcl_conflict. Seems to be conflict on the shared memory. But I don’y have the knowledge to debug it.

========= COMPUTE-SANITIZER
Allocated | Total Memory | Free Memory 
0, 8361738240, 8215068672
0, 8361738240, 8215068672
0, 8361738240, 8212971520
========= Invalid __shared__ write of size 4 bytes
=========     at 0xe540 in void cub::DeviceRadixSortDownsweepKernel<cub::DeviceRadixSortPolicy<int, int, unsigned int>::Policy800, (bool)0, (bool)0, int, int, unsigned int>(const T4 *, T4 *, const T5 *, T5 *, T6 *, T6, int, int, cub::GridEvenShare<T6>)
=========     by thread (64,0,0) in block (0,0,0)
=========     Address 0x9d2b6050 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x305122]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:__cudart798 [0x2f41b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaLaunchKernel [0x8b31b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:void cub::DeviceRadixSortDownsweepKernel<cub::DeviceRadixSortPolicy<int, int, unsigned int>::Policy800, false, false, int, int, unsigned int>(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>) [0x2000f]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaError thrust::cuda_cub::launcher::triple_chevron::doit_host<void (*)(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>), int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int> >(void (*)(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>), int const* const&, int* const&, int const* const&, int* const&, unsigned int* const&, unsigned int const&, int const&, int const&, cub::GridEvenShare<unsigned int> const&) const [clone .isra.0] [0x20b63]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaError cub::DeviceRadixSort::SortPairs<int, int, int>(void*, unsigned long&, cub::DoubleBuffer<int>&, cub::DoubleBuffer<int>&, int, int, int, CUstream_st*) [0x26ced]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::device::OctreeImpl::build() [0x1d99b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::gpu::Octree::build() [0x14931]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:main [0xe324]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xddf5]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
========= 
========= Invalid __shared__ write of size 4 bytes
=========     at 0xe540 in void cub::DeviceRadixSortDownsweepKernel<cub::DeviceRadixSortPolicy<int, int, unsigned int>::Policy800, (bool)0, (bool)0, int, int, unsigned int>(const T4 *, T4 *, const T5 *, T5 *, T6 *, T6, int, int, cub::GridEvenShare<T6>)
=========     by thread (65,0,0) in block (0,0,0)
=========     Address 0x9d2b6054 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x305122]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:__cudart798 [0x2f41b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaLaunchKernel [0x8b31b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:void cub::DeviceRadixSortDownsweepKernel<cub::DeviceRadixSortPolicy<int, int, unsigned int>::Policy800, false, false, int, int, unsigned int>(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>) [0x2000f]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaError thrust::cuda_cub::launcher::triple_chevron::doit_host<void (*)(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>), int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int> >(void (*)(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>), int const* const&, int* const&, int const* const&, int* const&, unsigned int* const&, unsigned int const&, int const&, int const&, cub::GridEvenShare<unsigned int> const&) const [clone .isra.0] [0x20b63]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaError cub::DeviceRadixSort::SortPairs<int, int, int>(void*, unsigned long&, cub::DoubleBuffer<int>&, cub::DoubleBuffer<int>&, int, int, int, CUstream_st*) [0x26ced]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::device::OctreeImpl::build() [0x1d99b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::gpu::Octree::build() [0x14931]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:main [0xe324]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xddf5]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
========= 
========= Invalid __shared__ write of size 4 bytes
=========     at 0xe540 in void cub::DeviceRadixSortDownsweepKernel<cub::DeviceRadixSortPolicy<int, int, unsigned int>::Policy800, (bool)0, (bool)0, int, int, unsigned int>(const T4 *, T4 *, const T5 *, T5 *, T6 *, T6, int, int, cub::GridEvenShare<T6>)
=========     by thread (66,0,0) in block (0,0,0)
......
========= 
========= Invalid __shared__ write of size 4 bytes
=========     at 0xe560 in void cub::DeviceRadixSortDownsweepKernel<cub::DeviceRadixSortPolicy<int, int, unsigned int>::Policy800, (bool)0, (bool)0, int, int, unsigned int>(const T4 *, T4 *, const T5 *, T5 *, T6 *, T6, int, int, cub::GridEvenShare<T6>)
=========     by thread (157,0,0) in block (1,0,0)
=========     Address 0x1032c is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x305122]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:__cudart798 [0x2f41b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaLaunchKernel [0x8b31b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:void cub::DeviceRadixSortDownsweepKernel<cub::DeviceRadixSortPolicy<int, int, unsigned int>::Policy800, false, false, int, int, unsigned int>(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>) [0x2000f]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaError thrust::cuda_cub::launcher::triple_chevron::doit_host<void (*)(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>), int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int> >(void (*)(int const*, int*, int const*, int*, unsigned int*, unsigned int, int, int, cub::GridEvenShare<unsigned int>), int const* const&, int* const&, int const* const&, int* const&, unsigned int* const&, unsigned int const&, int const&, int const&, cub::GridEvenShare<unsigned int> const&) const [clone .isra.0] [0x20b63]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:cudaError cub::DeviceRadixSort::SortPairs<int, int, int>(void*, unsigned long&, cub::DoubleBuffer<int>&, cub::DoubleBuffer<int>&, int, int, int, CUstream_st*) [0x26ced]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::device::OctreeImpl::build() [0x1d99b]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::gpu::Octree::build() [0x14931]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:main [0xe324]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xddf5]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
========= 
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaStreamSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x441886]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:cudaStreamSynchronize [0x8b0fb]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::device::OctreeImpl::build() [0x1de65]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::gpu::Octree::build() [0x14931]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:main [0xe324]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xddf5]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
========= 
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaGetLastError.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x441886]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:cudaGetLastError [0x689a7]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::device::OctreeImpl::build() [0x1de6d]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:pcl::gpu::Octree::build() [0x14931]
=========                in /usr/local/lib/libpcl_gpu_octree.so.1.14
=========     Host Frame:main [0xe324]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xddf5]
=========                in /notebooks/PCCudaFilter/build/./thrust_pcl_conflict
========= 
========= Error: process didn't terminate successfully
========= Target application returned an error
========= ERROR SUMMARY: 128 errors

After updating to cuda newest version 12.6, the error solved as stated in the pcl issue [GPU Octree] Thrust conflict with GPU Octree (cudaErrorIllegalAddress) · Issue #6218 · PointCloudLibrary/pcl · GitHub.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.