For some reason, whenever I run transform_reduce with thrust::device inside a kernel I get an error; the exact same function works perfectly if I just replace thrust::device with thrust::seq. This is demonstrated in the complete code sample below. I have tried debugging with cuda-memcheck, and the message I get is "an unknown error" plus some other information that doesn't really help. The confusing part is that the program runs fine in Visual Studio 2017 on my laptop, which has an NVIDIA GeForce GT 750M; however, when I launch the application from the command line on the target GPUs, which are Tesla P100-PCIE cards, I hit this problem. I even considered that one of the Tesla P100-PCIE GPUs might be defective, so I ran the code sample on several different Tesla P100-PCIE cards with the same result. I am launching the application from a container that has access to 7 of these GPUs, and the installed CUDA version for the Tesla P100-PCIE machines is 11.0.
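(Side note: to double-check which GPU the container actually exposes, I assume a standard runtime-API query like the sketch below could be used to print each device's name and compute capability. This is just a generic sketch, not part of my test program.)
// Generic sketch (assumed, not part of my program): list the visible devices
// and their compute capability using the standard CUDA runtime API.
#include <cstdio>
#include <cuda_runtime_api.h>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("Device %d: %s (compute capability %d.%d)\n", i, prop.name, prop.major, prop.minor);
    }
    return 0;
}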
Anyway, here are the compile and run commands I used:
singularity exec --nv /home/containers/cuda92.sif nvcc -arch=sm_50 -gencode=arch=compute_60,code=compute_60 -rdc=true -std=c++11 Testprogram.cu TimingGPU.cu -o Bonus
singularity exec --nv /home/containers/cuda92.sif /home/#/Cuda/GMP/Bonus
Or
singularity exec --nv /home/containers/cuda92.sif nvcc -G -Xcompiler -rdynamic -arch compute_50 -rdc=true -std=c++11 Testprogram.cu TimingGPU.cu -o Bonus
singularity exec --nv /home/containers/cuda92.sif /home/#/Cuda/GMP/Bonus
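In case the architecture flags matter: the Tesla P100 is compute capability 6.0, so I assume a more direct compile line would look roughly like the one below (I have not confirmed that this changes anything, and I am not sure whether the explicit -lcudadevrt is actually required when nvcc compiles and links in one step):
singularity exec --nv /home/containers/cuda92.sif nvcc -arch=sm_60 -rdc=true -std=c++11 Testprogram.cu TimingGPU.cu -lcudadevrt -o Bonus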
Here are the error messages I get:
GPUassert: unspecified launch failure Testprogram.cu 174
GPUassert: an illegal memory access was encountered Testprogram.cu 174
And here is the output when running under cuda-memcheck:
========= CUDA-MEMCHECK
========= Unknown Error
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/.singularity.d/libs/libcuda.so.1 [0x3eef63]
========= Host Frame:/home/#/Cuda/GMP/Bonus [0x443b6]
========= Host Frame:/home/#/Cuda/GMP/Bonus [0x4c52]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
========= Host Frame:/home/#/Cuda/GMP/Bonus [0x42c9]
=========
========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
========= ERROR SUMMARY: 2 errors
And here is a complete code sample that reproduces the problem:
//CUDA Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>
//CUDA runtime
//#include <cuda.h>
//#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
//Available in the VS project, but not on the cloud GPU setup
//#include <helper_functions.h>
//#include <helper_cuda.h>
//C++
#include <string>
#include <iostream> // std::cout, std::fixed
#include <iomanip> // std::setprecision
#include <time.h> // --- time
#include <stdlib.h> // --- srand, rand
#include <cmath>
#include <algorithm>
#include <vector>
#include <assert.h>
//My stuff
#include "TimingGPU.cuh"
//#include "SetUpGPU.cuh"
//Used for CUDA error checking of the kernel launches
#define DEBUG //Normal Call
#define DEBUG_CDP //CDP
#define BLOCK_SIZE 64
#define GRID_SIZE 256
//
#define MAX_BLOCK_SIZE 1024
/// Source https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
//For CDP Dynamic Parallelism, Source https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define cdpErrchk(ans) { cdpAssert((ans), __FILE__, __LINE__); }
__device__ void cdpAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
printf("GPU kernel assert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) assert(0);
}
}
struct square {
__host__ __device__
float operator()(float x) {
return x * x;
}
};
//Avoids CDP by using thrust::seq. Works on both the Tesla P100-PCIE (server) and the NVIDIA GeForce GT 750M (Windows/Visual Studio)
__global__ void l2_norm_thrust_seq(const float * __restrict__ d_data,
const int * __restrict__ JDS_Section_start_B,
const int * __restrict__ d_vector_of_limits_B,
size_t indexs_size_B_rows,
float * __restrict__ y) {
int start_elem_B, size_of_B_i;
int ThreadID = blockIdx.x * blockDim.x + threadIdx.x;
//int cacheIndex = threadIdx.y * blockDim.x + threadIdx.x;
if (ThreadID < indexs_size_B_rows) {
start_elem_B = JDS_Section_start_B[ThreadID];
size_of_B_i = d_vector_of_limits_B[ThreadID];
//int col = ThreadID % size_of_B_i;
int index_in_flatten_data = (start_elem_B + size_of_B_i);
y[ThreadID] = sqrt(thrust::transform_reduce(thrust::seq, d_data + start_elem_B, d_data + index_in_flatten_data, square(), 0.0f, thrust::plus<float>()));
#ifdef DEBUG_CDP
cdpErrchk(cudaPeekAtLastError());
cdpErrchk(cudaDeviceSynchronize());
printf("ThreadID = %d index_in_flatten_data %d resuts %f \n", ThreadID, index_in_flatten_data, y[ThreadID]);
#endif
}
}
//Uses CDP via thrust::device. Fails on the Tesla P100-PCIE (server) but runs on the NVIDIA GeForce GT 750M (Windows/Visual Studio)
__global__ void l2_norm_thrust_device(const float * __restrict__ d_data,
int *JDS_Section_start_B,
int *d_vector_of_limits_B,
size_t indexs_size_B_rows,
float * __restrict__ y) {
int start_elem_B, size_of_B_i;
int ThreadID = blockIdx.x * blockDim.x + threadIdx.x;
//int cacheIndex = threadIdx.y * blockDim.x + threadIdx.x;
if (ThreadID < indexs_size_B_rows) {
start_elem_B = JDS_Section_start_B[ThreadID];
size_of_B_i = d_vector_of_limits_B[ThreadID];
//int col = ThreadID % size_of_B_i;
int index_in_flatten_data = (start_elem_B + size_of_B_i);
printf("ThreadID = %d start_elem_B %d index_in_flatten_data %d \n", ThreadID, start_elem_B, index_in_flatten_data);
y[ThreadID] = sqrt(thrust::transform_reduce(thrust::device, d_data + start_elem_B, d_data + index_in_flatten_data, square(), 0.0f, thrust::plus<float>()));
#ifdef DEBUG_CDP
cdpErrchk(cudaPeekAtLastError());
cdpErrchk(cudaDeviceSynchronize());
printf("ThreadID = %d index_in_flatten_data %d resuts %f \n", ThreadID, index_in_flatten_data, y[ThreadID]);
#endif
}
}
int main() {
TimingGPU timerGPU;
//SetUpGPU setUp;
std::vector<int> h_terms_B{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 3, 5, 8, 4, 5 ,6, 7,10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
std::vector<float> h_data_B{ 1.69314718F, 1.69314718F, 1.69314718F, 1.69314718F, 1.69314718F,
1.69314718F, 1.69314718F, 1.69314718F,1.69314718F,
1.69314718F, 1.69314718F, 1.69314718F,
3.38629436F, 1.69314718F, 1.69314718F, 3.38629436F,
10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0 };
size_t term_value_size_B = h_terms_B.size();
std::vector<int> h_indexs_start_B{ 0,9,12,16 };
std::vector<int> h_vector_of_limits_B{ 9,3,4, 11 };
size_t indexs_size_B = h_indexs_start_B.size();
thrust::device_vector<int> d_terms_B(h_terms_B);
thrust::device_vector<float> d_data_B(h_data_B);
thrust::device_vector<int> d_indexs_start_B(h_indexs_start_B);
thrust::device_vector<int> d_vector_of_limits(h_vector_of_limits_B);
thrust::device_vector<float> d_norm_results_Asa(indexs_size_B);
timerGPU.StartCounter();
dim3 dimBlock(4);
dim3 dimGrid(1);
printf("dimGrid %d dimBlock %d \n",1, 4);
//Error
l2_norm_thrust_device<<<1, 4>>>(thrust::raw_pointer_cast(d_data_B.data()),
thrust::raw_pointer_cast(d_indexs_start_B.data()),
thrust::raw_pointer_cast(d_vector_of_limits.data()),
indexs_size_B,
thrust::raw_pointer_cast(d_norm_results_Asa.data()));
//Runs with no Errors
// l2_norm_thrust_seq<<<1, 4>>>(thrust::raw_pointer_cast(d_data_B.data()),
// thrust::raw_pointer_cast(d_indexs_start_B.data()),
// thrust::raw_pointer_cast(d_vector_of_limits.data()),
// indexs_size_B,
// thrust::raw_pointer_cast(d_norm_results_Asa.data()));
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
printf("Timing GPU = %f\n", timerGPU.GetCounter());
system("pause");
}
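For comparison, the host-side equivalent of what the kernel computes (one thrust::transform_reduce per row) would look roughly like the sketch below if dropped into main() after the device vectors are filled. It reuses the square functor and the variable names from the sample above and is only meant to show the computation I expect, not a fix:
// Rough host-side sketch of the same per-row L2 norm, for comparison only.
// For each row i, reduce the sub-range [start, start + limit) of the flat data.
for (size_t i = 0; i < indexs_size_B; ++i) {
    int start = h_indexs_start_B[i];
    int limit = h_vector_of_limits_B[i];
    float sum_sq = thrust::transform_reduce(thrust::device,
                                            d_data_B.begin() + start,
                                            d_data_B.begin() + start + limit,
                                            square(),
                                            0.0f,
                                            thrust::plus<float>());
    printf("row %zu norm = %f\n", i, std::sqrt(sum_sq));
}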
Lastly, I also made a Stack Overflow post about this issue; it can be found at https://stackoverflow.com/questions/67422528/unspecified-launch-failure-error-with-thrustdevice . I hope that's okay.
Any help or suggestions for debugging this problem would be greatly appreciated.