Unspecified launch failure error when thrust::device is used in transform_reduce

For some reason whenever, I run transform_reduce with thrust::device I get an error message. The same exact function works, perfectly, if I just replace thrust::device with thrust::seq. This is demonstrated in my below working code snippet. I have tried debugging with cuda-memcheck and the error message I get is “an unknown error” and some other information that doesn’t really help. The confusing part is that the program runs fine in Visual Studio 2017 on my laptop which has a NVIDIA GeForce GT 750M; However, when I launch the application, from the command line, on the target GPUs which is a Tesla P100-PCIE, I encountered this problem. I have even considered that maybe one of the Tesla P100-PCIE GPUs was defective, so I run this code sample on several different Tesla P100-PCIE with the same result. As I am launching this application from a container that has access to 7 of theses GPUs. The installed version of CUDA for the Tesla P100-PCIE is 11.0.

Anyways, here are the compile instructions that I used:

singularity exec --nv /home/containers/cuda92.sif nvcc -arch=sm_50 -gencode=arch=compute_60,code=compute_60 -rdc=true -std=c++11  Testprogram.cu TimingGPU.cu -o Bonus
singularity exec --nv /home/containers/cuda92.sif /home/#/Cuda/GMP/Bonus

Or

singularity exec --nv /home/containers/cuda92.sif nvcc -G -Xcompiler -rdynamic -arch compute_50 -rdc=true -std=c++11  Testprogram.cu TimingGPU.cu -o Bonus
singularity exec --nv /home/containers/cuda92.sif /home/#/Cuda/GMP/Bonus

Here are the Error messages I get:

GPUassert: unspecified launch failure Testprogram.cu 174
GPUassert: an illegal memory access was encountered Testprogram.cu 174

And with running cuda-memcheck :

========= CUDA-MEMCHECK
========= Unknown Error
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/.singularity.d/libs/libcuda.so.1 [0x3eef63]
=========     Host Frame:/home/#/Cuda/GMP/Bonus [0x443b6]
=========     Host Frame:/home/#/Cuda/GMP/Bonus [0x4c52]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
=========     Host Frame:/home/#/Cuda/GMP/Bonus [0x42c9]
=========
========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
========= ERROR SUMMARY: 2 errors

And, here is a complete working code sample.

//CUDA Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>

//CUDA runtime
//#include <cuda.h>
//#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
//In VS not working on GPU on Cloud
//#include <helper_functions.h>
//#include <helper_cuda.h>

//C++ 
#include <string>
#include <iostream>     // std::cout, std::fixed
#include <iomanip>      // std::setprecision
#include <time.h>       // --- time
#include <stdlib.h>     // --- srand, rand
#include <cmath>
#include<algorithm>
#include <vector>
#include <assert.h>

//My stuff
#include "TimingGPU.cuh"
//#include "SetUpGPU.cuh"


//Used for CUDA error checking to the kernel launches
#define DEBUG           //Normal Call
#define DEBUG_CDP       //CDP


#define BLOCK_SIZE 64
#define GRID_SIZE 256
//
#define MAX_BLOCK_SIZE 1024


/// Source https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

//For CDP Dynamic Parallelism, Source https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define cdpErrchk(ans) { cdpAssert((ans), __FILE__, __LINE__); }
__device__ void cdpAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        printf("GPU kernel assert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) assert(0);
    }
}

struct square {
    __host__ __device__
        float operator()(float x) {
        return x * x;
    }
};


//avoid using CDP by using thrust::seq, This Works on Both Tesla P100-PCIE - Server and NVIDIA GeForce GT 750M - Windows/Visual Studio
__global__  void l2_norm_thrust_seq(const float * __restrict__ d_data,
    const int   * __restrict__ JDS_Section_start_B,
    const int   * __restrict__ d_vector_of_limits_B,
    size_t                     indexs_size_B_rows,
    float       * __restrict__ y) {
    int start_elem_B, size_of_B_i;
    int ThreadID = blockIdx.x * blockDim.x + threadIdx.x;
    //int cacheIndex = threadIdx.y * blockDim.x + threadIdx.x;
    if (ThreadID < indexs_size_B_rows) {
        start_elem_B = JDS_Section_start_B[ThreadID];
        size_of_B_i = d_vector_of_limits_B[ThreadID];

        //int col = ThreadID % size_of_B_i;
        int index_in_flatten_data = (start_elem_B + size_of_B_i);
        y[ThreadID] = sqrt(thrust::transform_reduce(thrust::seq, d_data + start_elem_B, d_data + index_in_flatten_data, square(), 0.0f, thrust::plus<float>()));


#ifdef DEBUG_CDP
        cdpErrchk(cudaPeekAtLastError());
        cdpErrchk(cudaDeviceSynchronize());
        printf("ThreadID = %d  index_in_flatten_data %d resuts %f  \n", ThreadID, index_in_flatten_data, y[ThreadID]);
#endif
    }
}

//Using CDP thrust::device Error Tesla P100-PCIE - Server and Runs on NVIDIA GeForce GT 750M - Windows/Visual Studio
__global__  void l2_norm_thrust_device(const float * __restrict__ d_data,
    int *JDS_Section_start_B,
    int *d_vector_of_limits_B,
    size_t indexs_size_B_rows,
    float * __restrict__ y) {
    int start_elem_B, size_of_B_i;
    int ThreadID = blockIdx.x * blockDim.x + threadIdx.x;

    //int cacheIndex = threadIdx.y * blockDim.x + threadIdx.x;
    if (ThreadID < indexs_size_B_rows) {
        start_elem_B = JDS_Section_start_B[ThreadID];
        size_of_B_i = d_vector_of_limits_B[ThreadID];

        //int col = ThreadID % size_of_B_i;
        int index_in_flatten_data = (start_elem_B + size_of_B_i);
        printf("ThreadID = %d  start_elem_B %d index_in_flatten_data %d  \n", ThreadID, start_elem_B, index_in_flatten_data);
        y[ThreadID] = sqrt(thrust::transform_reduce(thrust::device, d_data + start_elem_B, d_data + index_in_flatten_data, square(), 0.0f, thrust::plus<float>()));
        
#ifdef DEBUG_CDP
        cdpErrchk(cudaPeekAtLastError());
        cdpErrchk(cudaDeviceSynchronize());
        printf("ThreadID = %d  index_in_flatten_data %d resuts %f  \n", ThreadID, index_in_flatten_data, y[ThreadID]);
#endif
    }
}


int main() {

    TimingGPU timerGPU;
    //SetUpGPU setUp;

    std::vector<int>  h_terms_B{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 3, 5, 8, 4, 5 ,6, 7,10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20 };
    std::vector<float> h_data_B{ 1.69314718F, 1.69314718F, 1.69314718F, 1.69314718F, 1.69314718F,
        1.69314718F, 1.69314718F, 1.69314718F,1.69314718F,
        1.69314718F, 1.69314718F, 1.69314718F,
        3.38629436F, 1.69314718F, 1.69314718F, 3.38629436F,
        10.0,  11.0,  12.0,  13.0,  14.0,  15.0,  16.0,  17.0,  18.0,  19.0,  20.0 };
    size_t  term_value_size_B = h_terms_B.size();
    std::vector<int> h_indexs_start_B{ 0,9,12,16 };
    std::vector<int> h_vector_of_limits_B{ 9,3,4, 11 };
    size_t indexs_size_B = h_indexs_start_B.size();

    thrust::device_vector<int>  d_terms_B(h_terms_B);
    thrust::device_vector<float>  d_data_B(h_data_B);
    thrust::device_vector<int>  d_indexs_start_B(h_indexs_start_B);
    thrust::device_vector<int>  d_vector_of_limits(h_vector_of_limits_B);
    thrust::device_vector<float>  d_norm_results_Asa(indexs_size_B);

    timerGPU.StartCounter();
    dim3 dimBlock(4);
    dim3 dimGrid(1);
    printf("dimGrid %d dimBlock %d \n",1, 4);
    
    //Error
    l2_norm_thrust_device<< < 1, 4 >> >(thrust::raw_pointer_cast(d_data_B.data()),
                                                    thrust::raw_pointer_cast(d_indexs_start_B.data()), 
                                                    thrust::raw_pointer_cast(d_vector_of_limits.data()),
                                                    indexs_size_B, 
                                                    thrust::raw_pointer_cast(d_norm_results_Asa.data()));
    //Runs with no Errors                                           
        // l2_norm_thrust_seq<< < 1, 4 >> >(thrust::raw_pointer_cast(d_data_B.data()),
                                        // thrust::raw_pointer_cast(d_indexs_start_B.data()), 
                                        // thrust::raw_pointer_cast(d_vector_of_limits.data()),
                                        // indexs_size_B, 
                                        // thrust::raw_pointer_cast(d_norm_results_Asa.data()));                                    
                                            
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif

printf("Timing GPU = %f\n", timerGPU.GetCounter());

    system("pause");
}

Lastly, I did make a stack overflow post regarding this issue. It can be found at https://stackoverflow.com/questions/67422528/unspecified-launch-failure-error-with-thrustdevice . I hope that’s okay.

Any help or suggestions debugging this problem would be greatly appreciated.

I tried reproducing the problem using few different CUDA versions, a couple different GPU types, and a variety of different compile switches, all on linux. I was never able to see any error.

You might try advancing to a newer CUDA version.

Sorry, for the delayed response. I was able to run my code on a completely different GPU, so the problem is the container that I am using. I sincerely appreciate you taking the time to try to debug this problem and I’m currently working on trying to figure out if it’s a driver issue or CUDA version issue. The version of CUDA running on the container is 11; so, I might try downgrading.