Hi,dear all
I use thrust:sort() ,get error ,such as
First-chance exception at 0x000007FEFD6BA06D in xxx.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x000000000021D600.
Unhandled exception at 0x000007FEFD6BA06D in xxx.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x000000000021D600.
invalid device pointer
{device_radix_sort_dispatch.cuh}
Internal debugger error occurred while attempting to launch _ZN6thrust6system4cuda6detail4cub_30DeviceRadixSortDownsweepKernelINS3_23DeviceRadixSortDispatchILb0EjNS3_8NullTypeEiE18PtxDownsweepPolicyELb0EjS6_iEEvPT1_SA_PT2_SC_PT3_SD_iibbNS3_13GridEvenShareISD_EE in
CUcontext 0x0a5cab00, CUmodule 0xa96699a0:
code patching failed for unknown reason.
All breakpoints for function _ZN6thrust6system4cuda6detail4cub_30DeviceRadixSortDownsweepKernelINS3_23DeviceRadixSortDispatchILb0EjNS3_8NullTypeEiE18PtxDownsweepPolicyELb0EjS6_iEEvPT1_SA_PT2_SC_PT3_SD_iibbNS3_13GridEvenShareISD_EE have been removed.
See Output View for additional messages of this type.
my env is:
NVIDIA Nsight Visual Studio Edition Version 5.2
{Build Number 5.2.0.16321}
CUDA8.0 >{CUDA Toolkit 7.5 or 8.0}
GTX1080
Visual Studio 2013
Driver{376.51}>{NVIDIA Display Driver version 376.09}
codes as:
thrust::device_vector bucket_indices(nLevelSize);
xxx…
thrust::sort(bucket_indices.begin(), bucket_indices.end());
can anyone tell me how i can resolve it?
thrust bug?
nsight bug?
my code bug?
first exec is ok.
second exec is error. such as for 1:10
thanks very much!
Hello,Dear harryz_
first,thanks very much for your reply.
I haved {1} disable the TDR in nsight monitor and {2} disable the nsight memcheck,
but this error also occur.
so sure that the error caused by a memory leak in my own code.
I am confused that
first time exec is ok {n=0}
second time exec is error {n=1} inside a for loop.
for (int n = 0;n<3;n++)
{
my code exec;
}
why this case occur?
how i can quick fix this memory leakage error?
hope that your helps!thanks very much!
this is my mini sample code, thanks very much!
my env is:
NVIDIA Nsight Visual Studio Edition Version 5.2
{Build Number 5.2.0.16321}
CUDA8.0 >{CUDA Toolkit 7.5 or 8.0}
GTX1080
Visual Studio 2013
Driver{376.51}>{NVIDIA Display Driver version 376.09}
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include “device_launch_parameters.h”
using namespace std;
// thrust
#include <thrust/host_vector.h> // host vector
#include <thrust/device_vector.h> // device vector
#include <thrust/device_ptr.h> // device ptr
#include <thrust/iterator/counting_iterator.h> // iterator
#include <thrust/transform.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/generate.h>
#include <thrust/binary_search.h>
#include <thrust/random.h>
#include <thrust/functional.h>
struct point_to_bucket_index : public thrust::unary_function<float2, unsigned int>
{
// constructor function
host device point_to_bucket_index(unsigned int width, unsigned int height) :w(width), h(height){}
host device unsigned int operator()(float2 p) const
{
// find the raster indices of p’s bucket
unsigned int x = static_cast(p.x * (w - 1));
unsigned int y = static_cast(p.y * (h - 1));
// return the bucket’s linear index
return y * w + x;
}
// member variable
unsigned int w, h;
};
global void TransToFloat2(const float* d_Input1, const float* d_Input2, float2* d_Output, const int nSize, const int nBlocksNumX)
{
const int tid = (blockIdx.y * nBlocksNumX + blockIdx.x) * blockDim.x + threadIdx.x;
if (tid < nSize)
{
d_Output[tid] = make_float2(d_Input1[tid], d_Input2[tid]);
}
}
void MiniLevelCompute(const int nLevel)
{
int nMaxIterNum = 15;
int nWidth = 512;
int nHeight = 512;
int nSlice = 32;
int nLevelWidth = int((nWidth - 1) / pow(2, nLevel)) + 1;
int nLevelHeight = int((nHeight - 1) / pow(2, nLevel)) + 1;
int nLevelSlice = int((nSlice - 1) / pow(2, nLevel)) + 1;
int nLevelSize = nLevelWidth * nLevelHeight * nLevelSlice;
int nDataSizeByte = nLevelSize * sizeof(float);
float* d_ImageDataRef_tmp;
float* d_ImageDataMov_tmp;
checkCudaErrors(cudaMalloc((void**)&d_ImageDataRef_tmp, nDataSizeByte));
checkCudaErrors(cudaMalloc((void**)&d_ImageDataMov_tmp, nDataSizeByte));
checkCudaErrors(cudaMemset(d_ImageDataRef_tmp, 0, nDataSizeByte));
checkCudaErrors(cudaMemset(d_ImageDataMov_tmp, 0, nDataSizeByte));
int nHistBinsNum = 150;
thrust::device_vector bucket_begin(nHistBinsNum * nHistBinsNum);
thrust::device_vector bucket_end(nHistBinsNum * nHistBinsNum);
thrust::device_vector bucket_indices(nLevelSize);
thrust::device_vector bucket_sizes(nHistBinsNum * nHistBinsNum);
float2 d_points;
cudaMalloc((void*)&d_points, sizeof(float2)* nLevelSize);
int nBlocksNumX = 1024;
int nThreadsNumPerBlock = 256;
dim3 nblocks;
nblocks.x = nBlocksNumX;
nblocks.y = ((1 + (nLevelSize - 1) / nThreadsNumPerBlock) - 1) / nBlocksNumX + 1;
for (int nIter = 0; nIter < nMaxIterNum; nIter++)
{
TransToFloat2 << <nblocks, nThreadsNumPerBlock >> >(d_ImageDataRef_tmp, d_ImageDataMov_tmp, d_points, nLevelSize, nBlocksNumX);
thrust::device_ptr points_t(d_points);
thrust::transform(points_t, points_t + nLevelSize, bucket_indices.begin(), point_to_bucket_index(nHistBinsNum, nHistBinsNum));
cout << "debug Iter : " << nIter << endl;
thrust::sort(bucket_indices.begin(), bucket_indices.end());
cout << "debug Iter : " << nIter << endl;
}
cudaFree(d_points);
cudaFree(d_ImageDataMov_tmp);
cudaFree(d_ImageDataMov_tmp);
}
int test_CUDA_error()
{
int nDevice = 0;
cudaSetDevice(nDevice);
int nDeviceCount;
cudaDeviceProp cDeviceProp;
cudaGetDeviceCount(&nDeviceCount);
cudaGetDeviceProperties(&cDeviceProp, nDevice);
if (1)
{
cout << "Using device # " << nDevice << endl;
cout << "Max threads per block: " << cDeviceProp.maxThreadsPerBlock << endl;
cout << "Max Threads DIM: " << cDeviceProp.maxThreadsDim[0] << " x " << cDeviceProp.maxThreadsDim[1] << " x " << cDeviceProp.maxThreadsDim[2] << endl;
cout << "Max Grid Size: " << cDeviceProp.maxGridSize[0] << " x " << cDeviceProp.maxGridSize[1] << " x " << cDeviceProp.maxGridSize[2] << endl;
printf(“Device %d: "%s" with Compute %d.%d capability\n”, nDevice, cDeviceProp.name, cDeviceProp.major, cDeviceProp.minor);
}
for (int nLevel = 1; nLevel >= 0; nLevel–)
{
MiniLevelCompute(nLevel);
}
return 0;
}
I can repo it on my 1070 + cuda 9.2, your app crashes at line 92 thrust::device_vector bucket_begin(nHistBinsNum * nHistBinsNum); in the second level, it causes gpu exception, so the nsight doesn’t work, it even doesn’t work without nsight. I’m not familiar with thrust programming, I think you should post your question at cuda programming forum.